diff --git a/Cargo.lock b/Cargo.lock index b17b653b..69390359 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3645,7 +3645,7 @@ dependencies = [ [[package]] name = "nodedb" -version = "0.0.3" +version = "0.0.4" dependencies = [ "aes-gcm", "anyhow", @@ -3735,7 +3735,7 @@ dependencies = [ [[package]] name = "nodedb-bridge" -version = "0.0.3" +version = "0.0.4" dependencies = [ "fluxbench", "libc", @@ -3747,7 +3747,7 @@ dependencies = [ [[package]] name = "nodedb-client" -version = "0.0.3" +version = "0.0.4" dependencies = [ "async-trait", "nodedb-types", @@ -3765,7 +3765,7 @@ dependencies = [ [[package]] name = "nodedb-cluster" -version = "0.0.3" +version = "0.0.4" dependencies = [ "async-trait", "crc32c", @@ -3792,7 +3792,7 @@ dependencies = [ [[package]] name = "nodedb-codec" -version = "0.0.3" +version = "0.0.4" dependencies = [ "lz4_flex 0.11.6", "pco", @@ -3808,7 +3808,7 @@ dependencies = [ [[package]] name = "nodedb-columnar" -version = "0.0.3" +version = "0.0.4" dependencies = [ "crc32c", "nodedb-codec", @@ -3825,7 +3825,7 @@ dependencies = [ [[package]] name = "nodedb-crdt" -version = "0.0.3" +version = "0.0.4" dependencies = [ "hmac 0.12.1", "loro", @@ -3838,7 +3838,7 @@ dependencies = [ [[package]] name = "nodedb-fts" -version = "0.0.3" +version = "0.0.4" dependencies = [ "icu_segmenter", "lindera", @@ -3853,7 +3853,7 @@ dependencies = [ [[package]] name = "nodedb-graph" -version = "0.0.3" +version = "0.0.4" dependencies = [ "nodedb-types", "rkyv 0.8.15", @@ -3867,7 +3867,7 @@ dependencies = [ [[package]] name = "nodedb-mem" -version = "0.0.3" +version = "0.0.4" dependencies = [ "fluxbench", "libc", @@ -3882,7 +3882,7 @@ dependencies = [ [[package]] name = "nodedb-query" -version = "0.0.3" +version = "0.0.4" dependencies = [ "nodedb-fts", "nodedb-spatial", @@ -3898,7 +3898,7 @@ dependencies = [ [[package]] name = "nodedb-raft" -version = "0.0.3" +version = "0.0.4" dependencies = [ "rand 0.9.4", "rkyv 0.8.15", @@ -3914,7 +3914,7 @@ dependencies = [ [[package]] 
name = "nodedb-spatial" -version = "0.0.3" +version = "0.0.4" dependencies = [ "h3o", "nodedb-types", @@ -3929,7 +3929,7 @@ dependencies = [ [[package]] name = "nodedb-sql" -version = "0.0.3" +version = "0.0.4" dependencies = [ "nodedb-query", "nodedb-types", @@ -3939,7 +3939,7 @@ dependencies = [ [[package]] name = "nodedb-strict" -version = "0.0.3" +version = "0.0.4" dependencies = [ "arrow", "nodedb-types", @@ -3953,7 +3953,7 @@ dependencies = [ [[package]] name = "nodedb-types" -version = "0.0.3" +version = "0.0.4" dependencies = [ "nanoid", "nodedb-codec", @@ -3972,7 +3972,7 @@ dependencies = [ [[package]] name = "nodedb-vector" -version = "0.0.3" +version = "0.0.4" dependencies = [ "libc", "memmap2", @@ -3989,7 +3989,7 @@ dependencies = [ [[package]] name = "nodedb-wal" -version = "0.0.3" +version = "0.0.4" dependencies = [ "aes-gcm", "crc32c", diff --git a/Cargo.toml b/Cargo.toml index 34822136..5d18c12c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.0.3" +version = "0.0.4" edition = "2024" rust-version = "1.94" license = "BUSL-1.1" diff --git a/nodedb-cluster/src/bootstrap/bootstrap_fn.rs b/nodedb-cluster/src/bootstrap/bootstrap_fn.rs index a09b2e2c..6bb20f2b 100644 --- a/nodedb-cluster/src/bootstrap/bootstrap_fn.rs +++ b/nodedb-cluster/src/bootstrap/bootstrap_fn.rs @@ -35,7 +35,8 @@ pub(super) fn bootstrap(config: &ClusterConfig, catalog: &ClusterCatalog) -> Res ); // Create MultiRaft with all groups (single-node, no peers). 
- let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()); + let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()) + .with_election_timeout(config.election_timeout_min, config.election_timeout_max); for group_id in routing.group_ids() { multi_raft.add_group(group_id, vec![])?; } @@ -81,6 +82,7 @@ fn generate_cluster_id() -> u64 { mod tests { use super::*; use crate::catalog::ClusterCatalog; + use std::time::Duration; fn temp_catalog() -> (tempfile::TempDir, ClusterCatalog) { let dir = tempfile::tempdir().unwrap(); @@ -102,6 +104,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), }; let state = bootstrap(&config, &catalog).unwrap(); diff --git a/nodedb-cluster/src/bootstrap/config.rs b/nodedb-cluster/src/bootstrap/config.rs index 933198c5..3b42b1ee 100644 --- a/nodedb-cluster/src/bootstrap/config.rs +++ b/nodedb-cluster/src/bootstrap/config.rs @@ -91,6 +91,10 @@ pub struct ClusterConfig { /// [`crate::spawn_swim`] after the cluster is up and feed the /// seed list from `seed_nodes`. pub swim_udp_addr: Option, + /// Raft election timeout range. Controls how long a follower waits + /// before starting an election after losing contact with the leader. + pub election_timeout_min: Duration, + pub election_timeout_max: Duration, } /// Result of cluster startup — everything needed to run the Raft loop. diff --git a/nodedb-cluster/src/bootstrap/join.rs b/nodedb-cluster/src/bootstrap/join.rs index afe6ad7a..67e79854 100644 --- a/nodedb-cluster/src/bootstrap/join.rs +++ b/nodedb-cluster/src/bootstrap/join.rs @@ -288,7 +288,8 @@ fn apply_join_response( // learners). A learner-started group boots in the `Learner` // role and will not run an election until a subsequent // `PromoteLearner` conf change is applied. 
- let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()); + let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()) + .with_election_timeout(config.election_timeout_min, config.election_timeout_max); for g in &resp.groups { let is_voter = g.members.contains(&config.node_id); let is_learner = g.learners.contains(&config.node_id); @@ -450,6 +451,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), }; let state1 = bootstrap(&config1, &catalog1).unwrap(); @@ -499,6 +502,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), }; let lifecycle = ClusterLifecycleTracker::new(); diff --git a/nodedb-cluster/src/bootstrap/probe.rs b/nodedb-cluster/src/bootstrap/probe.rs index 1688c87d..4df5838b 100644 --- a/nodedb-cluster/src/bootstrap/probe.rs +++ b/nodedb-cluster/src/bootstrap/probe.rs @@ -223,6 +223,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), } } diff --git a/nodedb-cluster/src/bootstrap/restart.rs b/nodedb-cluster/src/bootstrap/restart.rs index 3306142a..1c18186f 100644 --- a/nodedb-cluster/src/bootstrap/restart.rs +++ b/nodedb-cluster/src/bootstrap/restart.rs @@ -35,7 +35,8 @@ pub(super) fn restart( // as a learner on restart; dropping the group entirely would // leave the node permanently without any copy of it and // silently broken. 
- let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()); + let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()) + .with_election_timeout(config.election_timeout_min, config.election_timeout_max); for (group_id, info) in routing.group_members() { let is_voter = info.members.contains(&config.node_id); let is_learner = info.learners.contains(&config.node_id); @@ -91,6 +92,7 @@ mod tests { use super::super::bootstrap_fn::bootstrap; use super::*; use crate::catalog::ClusterCatalog; + use std::time::Duration; fn temp_catalog() -> (tempfile::TempDir, ClusterCatalog) { let dir = tempfile::tempdir().unwrap(); @@ -112,6 +114,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), }; // Bootstrap first. diff --git a/nodedb-cluster/src/circuit_breaker.rs b/nodedb-cluster/src/circuit_breaker.rs index 3b02d4e5..3c5992f1 100644 --- a/nodedb-cluster/src/circuit_breaker.rs +++ b/nodedb-cluster/src/circuit_breaker.rs @@ -145,6 +145,26 @@ impl CircuitBreaker { .unwrap_or(CircuitState::Closed) } + /// Return the ids of every peer whose breaker is currently Open. + /// + /// Used by the reachability driver to find peers that need an + /// active probe — without a periodic poke these peers never + /// transition back to HalfOpen (no traffic → no `check()` call + /// → no cooldown re-evaluation). + pub fn open_peers(&self) -> Vec { + let peers = self.peers.read().unwrap_or_else(|p| p.into_inner()); + peers + .iter() + .filter_map(|(id, b)| { + if b.state == CircuitState::Open { + Some(*id) + } else { + None + } + }) + .collect() + } + /// Get consecutive failure count for a peer. 
pub fn failure_count(&self, peer: u64) -> u32 { let peers = self.peers.read().unwrap_or_else(|p| p.into_inner()); diff --git a/nodedb-cluster/src/closed_timestamp.rs b/nodedb-cluster/src/closed_timestamp.rs new file mode 100644 index 00000000..550fb9f2 --- /dev/null +++ b/nodedb-cluster/src/closed_timestamp.rs @@ -0,0 +1,123 @@ +//! Per-group closed-timestamp tracker. +//! +//! Every time a Raft group applies a committed entry, the applier +//! records the wall-clock instant as that group's "closed timestamp". +//! A follower whose closed timestamp for a group is within the +//! caller's staleness bound can serve reads locally — no gateway hop +//! to the leader. +//! +//! The tracker is intentionally simple: one `Instant` per group, +//! updated monotonically. There is no HLC or cross-node coordination +//! here — the closed timestamp is local to this node. Safety comes +//! from the fact that a follower's applied index can only advance +//! (Raft guarantees), so a read served at a given closed timestamp +//! sees a consistent prefix of the log. + +use std::collections::HashMap; +use std::sync::RwLock; +use std::time::{Duration, Instant}; + +/// Tracks the most recent apply instant per Raft group. +pub struct ClosedTimestampTracker { + groups: RwLock>, +} + +impl ClosedTimestampTracker { + pub fn new() -> Self { + Self { + groups: RwLock::new(HashMap::new()), + } + } + + /// Record that `group_id` just applied one or more entries. + /// Called by the raft-loop applier after each apply batch. + pub fn mark_applied(&self, group_id: u64) { + let mut g = self.groups.write().unwrap_or_else(|p| p.into_inner()); + g.insert(group_id, Instant::now()); + } + + /// Record that `group_id` just applied, using a caller-supplied + /// instant. Exposed for deterministic testing with paused time. 
+ pub fn mark_applied_at(&self, group_id: u64, at: Instant) { + let mut g = self.groups.write().unwrap_or_else(|p| p.into_inner()); + g.insert(group_id, at); + } + + /// Check whether this node's replica of `group_id` has applied + /// recently enough that a read with `max_staleness` can be + /// served locally. + /// + /// Returns `false` if the group has never applied on this node + /// (no closed timestamp recorded). + pub fn is_fresh_enough(&self, group_id: u64, max_staleness: Duration) -> bool { + let g = self.groups.read().unwrap_or_else(|p| p.into_inner()); + match g.get(&group_id) { + Some(last) => last.elapsed() <= max_staleness, + None => false, + } + } + + /// Return the age of the closed timestamp for a group, or `None` + /// if the group has never applied on this node. Useful for + /// observability (metrics, SHOW commands). + pub fn staleness(&self, group_id: u64) -> Option { + let g = self.groups.read().unwrap_or_else(|p| p.into_inner()); + g.get(&group_id).map(|last| last.elapsed()) + } +} + +impl Default for ClosedTimestampTracker { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn unknown_group_is_not_fresh() { + let tracker = ClosedTimestampTracker::new(); + assert!(!tracker.is_fresh_enough(99, Duration::from_secs(10))); + } + + #[test] + fn recently_applied_is_fresh() { + let tracker = ClosedTimestampTracker::new(); + tracker.mark_applied(1); + assert!(tracker.is_fresh_enough(1, Duration::from_secs(5))); + } + + #[test] + fn stale_group_is_not_fresh() { + let tracker = ClosedTimestampTracker::new(); + let old = Instant::now() - Duration::from_secs(30); + tracker.mark_applied_at(1, old); + assert!(!tracker.is_fresh_enough(1, Duration::from_secs(5))); + } + + #[test] + fn staleness_returns_none_for_unknown() { + let tracker = ClosedTimestampTracker::new(); + assert!(tracker.staleness(42).is_none()); + } + + #[test] + fn staleness_returns_age_for_known() { + let tracker = 
ClosedTimestampTracker::new(); + tracker.mark_applied(1); + let s = tracker.staleness(1).unwrap(); + assert!(s < Duration::from_millis(100)); + } + + #[test] + fn mark_applied_updates_monotonically() { + let tracker = ClosedTimestampTracker::new(); + let old = Instant::now() - Duration::from_secs(10); + tracker.mark_applied_at(1, old); + assert!(!tracker.is_fresh_enough(1, Duration::from_secs(5))); + tracker.mark_applied(1); + assert!(tracker.is_fresh_enough(1, Duration::from_secs(5))); + } +} diff --git a/nodedb-cluster/src/decommission/coordinator.rs b/nodedb-cluster/src/decommission/coordinator.rs new file mode 100644 index 00000000..4f62c4aa --- /dev/null +++ b/nodedb-cluster/src/decommission/coordinator.rs @@ -0,0 +1,222 @@ +//! `DecommissionCoordinator` — drives a [`DecommissionPlan`] through +//! the metadata Raft group one entry at a time. +//! +//! The coordinator is a stateless-looking actor: it owns the plan, +//! a [`MetadataProposer`] (the injection seam for tests and for +//! whichever Raft driver is wired up at runtime), and an index +//! counter. On every call to [`DecommissionCoordinator::run`] it +//! proposes each entry in order, waiting for each to commit before +//! advancing. A proposer failure aborts the run at the failed step — +//! the caller can retry by constructing a fresh coordinator from +//! the same plan, because every step is idempotent at the metadata +//! layer (the cache and live-state appliers skip already-applied +//! indexes). +//! +//! The coordinator does not own a timer or a shutdown channel — it +//! is a one-shot sequence. Higher-level supervisors handle retries +//! and cancellation. + +use async_trait::async_trait; +use tracing::{debug, info}; + +use crate::error::Result; +use crate::metadata_group::MetadataEntry; + +use super::flow::DecommissionPlan; + +/// Injection seam: proposes a single metadata entry through the +/// metadata Raft group and waits for it to commit. 
Returns the +/// applied index on success so the coordinator can tell it apart +/// from older commits. +#[async_trait] +pub trait MetadataProposer: Send + Sync { + async fn propose_and_wait(&self, entry: MetadataEntry) -> Result; +} + +// Blanket impl so callers can pass `Arc` wherever a `MetadataProposer` +// is required without having to write a forwarding impl for every +// wrapper type. Defined here (rather than in the consumer crate) to +// avoid orphan-rule issues for downstream test impls. +#[async_trait] +impl MetadataProposer for std::sync::Arc { + async fn propose_and_wait(&self, entry: MetadataEntry) -> Result { + (**self).propose_and_wait(entry).await + } +} + +/// Drives a [`DecommissionPlan`] to completion. +pub struct DecommissionCoordinator { + plan: DecommissionPlan, + proposer: P, +} + +/// Outcome of a successful coordinator run. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DecommissionRunResult { + pub node_id: u64, + pub entries_committed: usize, + pub last_applied_index: u64, +} + +impl DecommissionCoordinator

{ + pub fn new(plan: DecommissionPlan, proposer: P) -> Self { + Self { plan, proposer } + } + + /// Propose every entry in the plan sequentially, waiting for + /// each commit. Returns the total number of entries committed + /// and the final applied index. + pub async fn run(self) -> Result { + let node_id = self.plan.node_id; + let total = self.plan.entries.len(); + info!(node_id, steps = total, "decommission coordinator starting"); + let mut last_applied = 0u64; + for (step, entry) in self.plan.entries.into_iter().enumerate() { + debug!(node_id, step, total, "proposing decommission entry"); + last_applied = self.proposer.propose_and_wait(entry).await?; + } + info!( + node_id, + entries_committed = total, + last_applied, + "decommission coordinator finished" + ); + Ok(DecommissionRunResult { + node_id, + entries_committed: total, + last_applied_index: last_applied, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::decommission::flow::plan_full_decommission; + use crate::error::ClusterError; + use crate::metadata_group::{RoutingChange, TopologyChange}; + use crate::routing::RoutingTable; + use crate::topology::{ClusterTopology, NodeInfo, NodeState}; + use std::net::SocketAddr; + use std::sync::atomic::{AtomicU64, Ordering}; + use std::sync::{Arc, Mutex}; + + struct RecordingProposer { + committed: Mutex>, + counter: AtomicU64, + } + + impl RecordingProposer { + fn new() -> Arc { + Arc::new(Self { + committed: Mutex::new(Vec::new()), + counter: AtomicU64::new(0), + }) + } + } + + #[async_trait] + impl MetadataProposer for RecordingProposer { + async fn propose_and_wait(&self, entry: MetadataEntry) -> Result { + let idx = self.counter.fetch_add(1, Ordering::SeqCst) + 1; + self.committed.lock().unwrap().push(entry); + Ok(idx) + } + } + + struct FailingProposer { + fail_after: usize, + counter: AtomicU64, + } + + #[async_trait] + impl MetadataProposer for FailingProposer { + async fn propose_and_wait(&self, _entry: MetadataEntry) -> Result { + let 
n = self.counter.fetch_add(1, Ordering::SeqCst); + if n as usize >= self.fail_after { + return Err(ClusterError::Transport { + detail: "injected failure".into(), + }); + } + Ok(n + 1) + } + } + + fn three_node_plan() -> DecommissionPlan { + let mut t = ClusterTopology::new(); + for (i, id) in [1u64, 2, 3].iter().enumerate() { + let a: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + let routing = RoutingTable::uniform(2, &[1, 2, 3], 3); + plan_full_decommission(1, &t, &routing, 2).unwrap() + } + + #[tokio::test] + async fn coordinator_proposes_every_entry_in_order() { + let plan = three_node_plan(); + let expected = plan.entries.clone(); + let proposer = RecordingProposer::new(); + let coord = DecommissionCoordinator::new(plan, proposer.clone()); + let result = coord.run().await.unwrap(); + + assert_eq!(result.node_id, 1); + assert_eq!(result.entries_committed, expected.len()); + let committed = proposer.committed.lock().unwrap().clone(); + assert_eq!(committed, expected); + } + + #[tokio::test] + async fn coordinator_aborts_on_proposer_error() { + let plan = three_node_plan(); + let proposer = FailingProposer { + fail_after: 2, + counter: AtomicU64::new(0), + }; + let coord = DecommissionCoordinator::new(plan, proposer); + let err = coord.run().await.unwrap_err(); + assert!(err.to_string().contains("injected failure")); + } + + #[tokio::test] + async fn coordinator_reports_last_applied_index() { + let plan = three_node_plan(); + let proposer = RecordingProposer::new(); + let coord = DecommissionCoordinator::new(plan, proposer.clone()); + let result = coord.run().await.unwrap(); + // The recording proposer returns monotonically increasing + // indexes starting from 1; the last one equals the total + // entry count. 
+ assert_eq!(result.last_applied_index, result.entries_committed as u64); + } + + /// Sanity: the plan's shape is preserved end to end — the + /// recording proposer sees the same `StartDecommission` / + /// `FinishDecommission` / `Leave` bookends. + #[tokio::test] + async fn coordinator_preserves_bookends() { + let plan = three_node_plan(); + let proposer = RecordingProposer::new(); + let coord = DecommissionCoordinator::new(plan, proposer.clone()); + coord.run().await.unwrap(); + + let committed = proposer.committed.lock().unwrap().clone(); + assert!(matches!( + committed.first(), + Some(MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id: 1 } + )) + )); + assert!(matches!( + committed.last(), + Some(MetadataEntry::TopologyChange(TopologyChange::Leave { + node_id: 1 + })) + )); + // At least one RemoveMember for the target. + assert!(committed.iter().any(|e| matches!( + e, + MetadataEntry::RoutingChange(RoutingChange::RemoveMember { node_id: 1, .. }) + ))); + } +} diff --git a/nodedb-cluster/src/decommission/flow.rs b/nodedb-cluster/src/decommission/flow.rs new file mode 100644 index 00000000..aa2c30c3 --- /dev/null +++ b/nodedb-cluster/src/decommission/flow.rs @@ -0,0 +1,227 @@ +//! Decommission flow — emit the full ordered sequence of metadata +//! entries that move a node from `Active` to fully removed. +//! +//! [`plan_full_decommission`] is pure: given a snapshot of topology +//! and routing, it returns the exact list of +//! [`MetadataEntry`](crate::metadata_group::MetadataEntry) values the +//! coordinator will propose through the metadata Raft group, in the +//! order they must commit. The flow is deterministic — two nodes +//! looking at the same snapshot produce byte-identical plans, which +//! means a failed coordinator can be resumed from any consistent +//! snapshot without needing per-plan state to be replicated. 
+ +use crate::error::Result; +use crate::metadata_group::{MetadataEntry, RoutingChange, TopologyChange}; +use crate::routing::RoutingTable; +use crate::topology::ClusterTopology; + +use super::safety::check_can_decommission; + +/// Output of [`plan_full_decommission`] — the caller proposes +/// `entries` in order, waiting for each to commit before moving on. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DecommissionPlan { + pub node_id: u64, + pub entries: Vec, +} + +/// Build the complete decommission plan for `node_id`. +/// +/// Steps (in the order they appear in the returned `entries`): +/// +/// 1. `TopologyChange::StartDecommission` — flip the target to +/// `Draining`. +/// 2. `RoutingChange::LeadershipTransfer` — for every group the +/// target currently leads, hand leadership to another voter. +/// 3. `RoutingChange::RemoveMember` — strip the target out of every +/// group's member (and learner) list. +/// 4. `TopologyChange::FinishDecommission` — flip the target to +/// `Decommissioned`. +/// 5. `TopologyChange::Leave` — remove the target from topology +/// entirely so future peer lookups return `NodeNotFound`. +/// +/// The safety gate in [`check_can_decommission`] runs first and +/// returns an error without producing a plan if any group would drop +/// below the configured replication factor. +pub fn plan_full_decommission( + node_id: u64, + topology: &ClusterTopology, + routing: &RoutingTable, + replication_factor: usize, +) -> Result { + check_can_decommission(node_id, topology, routing, replication_factor)?; + + let mut entries = Vec::new(); + entries.push(MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id }, + )); + + // Collect a stable, sorted group_id ordering so the plan is + // reproducible across HashMap iterations. 
+ let mut group_ids: Vec = routing + .group_members() + .iter() + .filter(|(_, info)| info.members.contains(&node_id) || info.learners.contains(&node_id)) + .map(|(gid, _)| *gid) + .collect(); + group_ids.sort_unstable(); + + // 2. Leadership transfers for every group the target currently leads. + for gid in &group_ids { + let info = routing + .group_info(*gid) + .expect("group id came from routing snapshot"); + if info.leader != node_id { + continue; + } + if let Some(&new_leader) = info.members.iter().find(|&&m| m != node_id) { + entries.push(MetadataEntry::RoutingChange( + RoutingChange::LeadershipTransfer { + group_id: *gid, + new_leader_node_id: new_leader, + }, + )); + } + } + + // 3. Remove the target from every group's member and learner sets. + for gid in &group_ids { + entries.push(MetadataEntry::RoutingChange(RoutingChange::RemoveMember { + group_id: *gid, + node_id, + })); + } + + // 4. Finish decommission (topology state → Decommissioned). + entries.push(MetadataEntry::TopologyChange( + TopologyChange::FinishDecommission { node_id }, + )); + + // 5. Leave — remove from topology entirely. + entries.push(MetadataEntry::TopologyChange(TopologyChange::Leave { + node_id, + })); + + Ok(DecommissionPlan { node_id, entries }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::{NodeInfo, NodeState}; + use std::net::SocketAddr; + + fn topo(nodes: &[u64]) -> ClusterTopology { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let addr: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, addr, NodeState::Active)); + } + t + } + + #[test] + fn plan_shape_matches_spec() { + let t = topo(&[1, 2, 3]); + // 2 groups, RF=3 (each group has all 3 nodes). Decommission + // 1 with RF=2 (the surviving quorum). 
+ let routing = RoutingTable::uniform(2, &[1, 2, 3], 3); + let plan = plan_full_decommission(1, &t, &routing, 2).unwrap(); + assert_eq!(plan.node_id, 1); + + // First entry: StartDecommission. + assert!(matches!( + plan.entries.first(), + Some(MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id: 1 } + )) + )); + + // Last two entries: FinishDecommission, Leave. + let n = plan.entries.len(); + assert!(matches!( + plan.entries[n - 2], + MetadataEntry::TopologyChange(TopologyChange::FinishDecommission { node_id: 1 }) + )); + assert!(matches!( + plan.entries[n - 1], + MetadataEntry::TopologyChange(TopologyChange::Leave { node_id: 1 }) + )); + + // Every group the target is in must get a RemoveMember. + let remove_count = plan + .entries + .iter() + .filter(|e| { + matches!( + e, + MetadataEntry::RoutingChange(RoutingChange::RemoveMember { node_id: 1, .. }) + ) + }) + .count(); + assert_eq!(remove_count, 2); + } + + #[test] + fn plan_emits_leadership_transfer_when_target_leads() { + let t = topo(&[1, 2, 3]); + let mut routing = RoutingTable::uniform(2, &[1, 2, 3], 3); + routing.set_leader(0, 1); + routing.set_leader(1, 2); + let plan = plan_full_decommission(1, &t, &routing, 2).unwrap(); + // Exactly one LeadershipTransfer for group 0. 
+ let transfers: Vec<_> = plan + .entries + .iter() + .filter_map(|e| match e { + MetadataEntry::RoutingChange(RoutingChange::LeadershipTransfer { + group_id, + new_leader_node_id, + }) => Some((*group_id, *new_leader_node_id)), + _ => None, + }) + .collect(); + assert_eq!(transfers.len(), 1); + assert_eq!(transfers[0].0, 0); + assert_ne!(transfers[0].1, 1, "new leader must not be the target"); + } + + #[test] + fn plan_is_deterministic() { + let t = topo(&[1, 2, 3]); + let routing = RoutingTable::uniform(4, &[1, 2, 3], 3); + let p1 = plan_full_decommission(2, &t, &routing, 2).unwrap(); + let p2 = plan_full_decommission(2, &t, &routing, 2).unwrap(); + assert_eq!(p1.entries, p2.entries); + } + + #[test] + fn plan_rejected_when_safety_fails() { + let t = topo(&[1, 2]); + let routing = RoutingTable::uniform(2, &[1, 2], 2); + let err = plan_full_decommission(1, &t, &routing, 2).unwrap_err(); + assert!(err.to_string().contains("replication factor")); + } + + #[test] + fn plan_skips_groups_target_is_not_in() { + let t = topo(&[1, 2, 3]); + let mut routing = RoutingTable::uniform(4, &[1, 2, 3], 3); + routing.set_group_members(0, vec![2, 3]); + routing.set_group_members(1, vec![2, 3]); + routing.set_group_members(2, vec![1, 2, 3]); + routing.set_group_members(3, vec![1, 2, 3]); + let plan = plan_full_decommission(1, &t, &routing, 2).unwrap(); + let removes: Vec = plan + .entries + .iter() + .filter_map(|e| match e { + MetadataEntry::RoutingChange(RoutingChange::RemoveMember { group_id, .. }) => { + Some(*group_id) + } + _ => None, + }) + .collect(); + assert_eq!(removes, vec![2, 3]); + } +} diff --git a/nodedb-cluster/src/decommission/mod.rs b/nodedb-cluster/src/decommission/mod.rs new file mode 100644 index 00000000..3b0bd5c4 --- /dev/null +++ b/nodedb-cluster/src/decommission/mod.rs @@ -0,0 +1,35 @@ +//! Decommission flow — graceful removal of a node from the cluster. +//! +//! Decommission is a multi-step, metadata-raft-replicated process: +//! +//! 1. 
**Safety gate** — [`safety::check_can_decommission`] refuses the +//! decommission if any Raft group the target is in would drop below +//! the configured replication factor after its removal. This is +//! the only correctness-critical check — once it passes, every +//! subsequent step is just routing/topology bookkeeping. +//! 2. **Plan** — [`flow::plan_full_decommission`] emits the full ordered +//! sequence of [`MetadataEntry`](crate::metadata_group::MetadataEntry) +//! values the coordinator will propose: `StartDecommission`, any +//! required leadership transfers, a `RemoveMember` per group, then +//! `FinishDecommission` and `Leave`. +//! 3. **Propose** (future batch: `coordinator.rs`) — stateful actor +//! proposes each entry in order through a `MetadataProposer` trait, +//! waiting for the applied index to advance past each commit before +//! advancing its own state. +//! 4. **Observe** (future batch: `observer.rs`) — the target node +//! watches its own topology state and fires a cooperative shutdown +//! signal when it transitions to `Decommissioned`. +//! +//! This sub-batch ships steps 1 and 2 as pure, side-effect-free +//! functions so the flow can be exhaustively unit-tested before the +//! stateful coordinator is wired up. + +pub mod coordinator; +pub mod flow; +pub mod observer; +pub mod safety; + +pub use coordinator::{DecommissionCoordinator, DecommissionRunResult, MetadataProposer}; +pub use flow::{DecommissionPlan, plan_full_decommission}; +pub use observer::DecommissionObserver; +pub use safety::{DecommissionSafetyError, check_can_decommission}; diff --git a/nodedb-cluster/src/decommission/observer.rs b/nodedb-cluster/src/decommission/observer.rs new file mode 100644 index 00000000..d3034c80 --- /dev/null +++ b/nodedb-cluster/src/decommission/observer.rs @@ -0,0 +1,196 @@ +//! `DecommissionObserver` — local-node self-shutdown signal. +//! +//! The coordinator proposes a full decommission plan through the +//! metadata Raft group. 
Every node (including the target itself) +//! applies the resulting entries through `CacheApplier`, which, when +//! attached with [`CacheApplier::with_live_state`](crate::metadata_group::CacheApplier::with_live_state), +//! cascades topology state transitions into the live +//! `Arc>` handle. +//! +//! The observer polls that handle for the *local* node id. Once the +//! node's own state reaches `Decommissioned` — or the node has been +//! removed from topology entirely by a committed `Leave` — the +//! observer flips a `tokio::sync::watch` channel to `true`, which is +//! the cooperative shutdown signal every long-lived background task +//! on this node is already listening on. +//! +//! This is the last link in the decommission chain: once the watch +//! is flipped, the raft loops, SWIM detector, reachability driver, +//! and transport accept loops all drain and exit on their own. + +use std::sync::{Arc, RwLock}; +use std::time::Duration; + +use tokio::sync::watch; +use tokio::time::interval; +use tracing::{info, warn}; + +use crate::topology::{ClusterTopology, NodeState}; + +/// Periodically checks the local node's topology state and fires a +/// shutdown signal on `Decommissioned` or removal. +pub struct DecommissionObserver { + topology: Arc>, + local_node_id: u64, + shutdown_tx: watch::Sender, + poll_interval: Duration, +} + +impl DecommissionObserver { + /// Build an observer and return it alongside the receiver half of + /// its shutdown watch channel. Every subsystem that wants to + /// cooperatively drain on decommission can call + /// [`watch::Receiver::clone`] on the returned receiver. + pub fn new( + topology: Arc>, + local_node_id: u64, + poll_interval: Duration, + ) -> (Self, watch::Receiver) { + let (shutdown_tx, shutdown_rx) = watch::channel(false); + ( + Self { + topology, + local_node_id, + shutdown_tx, + poll_interval, + }, + shutdown_rx, + ) + } + + /// Single check. 
Returns `true` iff the observer fired the + /// shutdown signal during this call (or had already fired it + /// previously — the watch is level-triggered, not edge). + pub fn check_once(&self) -> bool { + if *self.shutdown_tx.borrow() { + return true; + } + let topo = self.topology.read().unwrap_or_else(|p| p.into_inner()); + let should_fire = match topo.get_node(self.local_node_id) { + Some(node) => node.state == NodeState::Decommissioned, + // Node is gone from topology — either a committed `Leave` + // (post-decommission) or manual removal. Either way, we + // are no longer part of the cluster. + None => true, + }; + if should_fire { + info!( + local_node_id = self.local_node_id, + "decommission observer firing local shutdown signal" + ); + if let Err(e) = self.shutdown_tx.send(true) { + warn!(error = %e, "shutdown watch receivers all dropped"); + } + return true; + } + false + } + + /// Run the observer's poll loop until `cancel` flips to `true`. + /// Exits immediately after firing its own shutdown signal — + /// there is nothing more to watch. + pub async fn run(self, mut cancel: watch::Receiver) { + let mut tick = interval(self.poll_interval); + tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + loop { + tokio::select! 
{ + biased; + changed = cancel.changed() => { + if changed.is_ok() && *cancel.borrow() { + return; + } + } + _ = tick.tick() => { + if self.check_once() { + return; + } + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::NodeInfo; + use std::net::SocketAddr; + + fn topo_with(node_id: u64, state: NodeState) -> Arc> { + let mut t = ClusterTopology::new(); + let addr: SocketAddr = "127.0.0.1:9000".parse().unwrap(); + t.add_node(NodeInfo::new(node_id, addr, state)); + Arc::new(RwLock::new(t)) + } + + #[test] + fn check_once_does_not_fire_while_active() { + let topo = topo_with(5, NodeState::Active); + let (obs, _rx) = DecommissionObserver::new(topo, 5, Duration::from_millis(10)); + assert!(!obs.check_once()); + } + + #[test] + fn check_once_fires_on_decommissioned_state() { + let topo = topo_with(5, NodeState::Active); + let (obs, mut rx) = DecommissionObserver::new(topo.clone(), 5, Duration::from_millis(10)); + assert!(!obs.check_once()); + topo.write() + .unwrap() + .set_state(5, NodeState::Decommissioned); + assert!(obs.check_once()); + assert!(*rx.borrow_and_update()); + } + + #[test] + fn check_once_fires_when_node_removed_from_topology() { + let topo = topo_with(5, NodeState::Active); + let (obs, _rx) = DecommissionObserver::new(topo.clone(), 5, Duration::from_millis(10)); + topo.write().unwrap().remove_node(5); + assert!(obs.check_once()); + } + + #[test] + fn check_once_is_idempotent_after_firing() { + let topo = topo_with(5, NodeState::Decommissioned); + let (obs, _rx) = DecommissionObserver::new(topo, 5, Duration::from_millis(10)); + assert!(obs.check_once()); + // Second call sees the fired signal and reports true again. 
+ assert!(obs.check_once()); + } + + #[tokio::test(start_paused = true)] + async fn run_loop_fires_shutdown_and_exits() { + let topo = topo_with(5, NodeState::Active); + let (obs, mut rx) = DecommissionObserver::new(topo.clone(), 5, Duration::from_millis(50)); + let (_cancel_tx, cancel_rx) = watch::channel(false); + let handle = tokio::spawn(async move { obs.run(cancel_rx).await }); + + // Advance twice — first tick = no-op, then flip state. + tokio::time::advance(Duration::from_millis(60)).await; + tokio::task::yield_now().await; + topo.write() + .unwrap() + .set_state(5, NodeState::Decommissioned); + tokio::time::advance(Duration::from_millis(60)).await; + tokio::task::yield_now().await; + + let _ = tokio::time::timeout(Duration::from_millis(500), handle) + .await + .expect("observer run loop did not exit"); + assert!(*rx.borrow_and_update()); + } + + #[tokio::test(start_paused = true)] + async fn run_loop_exits_on_cancel_without_firing() { + let topo = topo_with(5, NodeState::Active); + let (obs, rx) = DecommissionObserver::new(topo, 5, Duration::from_millis(50)); + let (cancel_tx, cancel_rx) = watch::channel(false); + let handle = tokio::spawn(async move { obs.run(cancel_rx).await }); + let _ = cancel_tx.send(true); + let _ = tokio::time::timeout(Duration::from_millis(500), handle) + .await + .expect("cancel did not end run loop"); + assert!(!*rx.borrow()); + } +} diff --git a/nodedb-cluster/src/decommission/safety.rs b/nodedb-cluster/src/decommission/safety.rs new file mode 100644 index 00000000..91533a34 --- /dev/null +++ b/nodedb-cluster/src/decommission/safety.rs @@ -0,0 +1,172 @@ +//! Decommission safety gate. +//! +//! Before the coordinator proposes a single metadata entry, it must +//! prove that removing the target node from every Raft group it +//! belongs to will leave each group with at least `replication_factor` +//! voting members. Dropping below RF silently is a data-loss bug — +//! this module is the only place that decision is made. 
+ +use crate::error::{ClusterError, Result}; +use crate::routing::RoutingTable; +use crate::topology::{ClusterTopology, NodeState}; + +/// Why a decommission request was rejected. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum DecommissionSafetyError { + /// The target node id does not exist in the topology. + NodeNotFound { node_id: u64 }, + /// The node is already past the point of decommission. + AlreadyDecommissioned { node_id: u64 }, + /// Removing the node would leave this group below `replication_factor` + /// voters. The decommission must wait until a new voter has been + /// added to the group (via rebalance / migration executor). + WouldViolateReplicationFactor { + node_id: u64, + group_id: u64, + current_voters: usize, + replication_factor: usize, + }, +} + +impl std::fmt::Display for DecommissionSafetyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NodeNotFound { node_id } => { + write!(f, "node {node_id} not found in topology") + } + Self::AlreadyDecommissioned { node_id } => { + write!(f, "node {node_id} is already decommissioned") + } + Self::WouldViolateReplicationFactor { + node_id, + group_id, + current_voters, + replication_factor, + } => write!( + f, + "removing node {node_id} from group {group_id} \ + would leave {} voter(s), below replication factor {replication_factor}", + current_voters.saturating_sub(1) + ), + } + } +} + +impl std::error::Error for DecommissionSafetyError {} + +impl From for ClusterError { + fn from(value: DecommissionSafetyError) -> Self { + ClusterError::Transport { + detail: value.to_string(), + } + } +} + +/// Verify that node `node_id` can be safely stripped out of every +/// group it participates in without dropping any group below +/// `replication_factor` voters. +/// +/// This check is purely structural — it looks at the current routing +/// table, not the live cluster. 
Callers must re-run it immediately +/// before proposing each step if the topology may have shifted since +/// the plan was computed. +pub fn check_can_decommission( + node_id: u64, + topology: &ClusterTopology, + routing: &RoutingTable, + replication_factor: usize, +) -> Result<()> { + let node = topology + .get_node(node_id) + .ok_or(DecommissionSafetyError::NodeNotFound { node_id })?; + + if node.state == NodeState::Decommissioned { + return Err(DecommissionSafetyError::AlreadyDecommissioned { node_id }.into()); + } + + for (group_id, info) in routing.group_members() { + if !info.members.contains(&node_id) { + continue; + } + let current_voters = info.members.len(); + // After removal the group would have `current_voters - 1` + // voters. Require that to be at least `replication_factor`. + if current_voters.saturating_sub(1) < replication_factor { + return Err(DecommissionSafetyError::WouldViolateReplicationFactor { + node_id, + group_id: *group_id, + current_voters, + replication_factor, + } + .into()); + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::NodeInfo; + use std::net::SocketAddr; + + fn topo(nodes: &[u64]) -> ClusterTopology { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let addr: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, addr, NodeState::Active)); + } + t + } + + #[test] + fn rejects_unknown_node() { + let t = topo(&[1, 2, 3]); + let r = RoutingTable::uniform(2, &[1, 2, 3], 3); + let err = check_can_decommission(99, &t, &r, 2).unwrap_err(); + assert!(err.to_string().contains("99")); + } + + #[test] + fn rejects_already_decommissioned() { + let mut t = topo(&[1, 2, 3]); + t.set_state(1, NodeState::Decommissioned); + let r = RoutingTable::uniform(2, &[1, 2, 3], 3); + let err = check_can_decommission(1, &t, &r, 2).unwrap_err(); + assert!(err.to_string().contains("already decommissioned")); + } + + #[test] + fn 
rejects_when_rf_would_be_violated() { + let t = topo(&[1, 2]); + // RF=2 with only 2 nodes → every group has exactly 2 voters. + // Removing either one would leave 1 voter (< RF=2). + let r = RoutingTable::uniform(2, &[1, 2], 2); + let err = check_can_decommission(1, &t, &r, 2).unwrap_err(); + assert!(err.to_string().contains("replication factor")); + } + + #[test] + fn accepts_when_extra_voter_available() { + let t = topo(&[1, 2, 3]); + // 3 nodes × RF=2 means each group has 2 voters but the third + // node is a candidate replacement. The safety check doesn't + // know about replacements — it only checks current state, + // so we need RF=1 for this to pass without a prior rebalance. + let r = RoutingTable::uniform(2, &[1, 2, 3], 3); + check_can_decommission(1, &t, &r, 2).unwrap(); + } + + #[test] + fn skips_groups_target_is_not_member_of() { + let t = topo(&[1, 2, 3]); + // Node 1 is only in group 0, node 2 is only in group 1. + let mut r = RoutingTable::uniform(2, &[1, 2, 3], 3); + r.set_group_members(0, vec![1, 3]); + r.set_group_members(1, vec![2, 3]); + // Decommission 1 with RF=1 → group 0 drops to [3], group 1 + // untouched. + check_can_decommission(1, &t, &r, 1).unwrap(); + } +} diff --git a/nodedb-cluster/src/follower_read.rs b/nodedb-cluster/src/follower_read.rs new file mode 100644 index 00000000..16d0886e --- /dev/null +++ b/nodedb-cluster/src/follower_read.rs @@ -0,0 +1,130 @@ +//! Follower-read decision gate. +//! +//! [`FollowerReadGate`] answers a single question: "given the +//! session's `ReadConsistency` and the local node's role + closed +//! timestamp for the target Raft group, can this read be served +//! locally without forwarding to the leader?" +//! +//! ## Decision table +//! +//! | Consistency | Local role | Closed TS fresh? | Serve locally? | +//! |-----------------------|-------------|------------------|----------------| +//! | Strong | * | * | Only if leader | +//! | BoundedStaleness(d) | Follower | ≤ d | Yes | +//! 
| BoundedStaleness(d) | Follower | > d | No → forward | +//! | BoundedStaleness(d) | Leader | * | Yes | +//! | Eventual | * | * | Yes | +//! +//! The gate is stateless — it reads from shared handles to the +//! closed-timestamp tracker and the raft-status provider. + +use std::sync::Arc; +use std::time::Duration; + +use crate::closed_timestamp::ClosedTimestampTracker; + +/// Consistency level for a single read — mirrors the `ReadConsistency` +/// enum in the `nodedb` crate without coupling `nodedb-cluster` to it. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ReadLevel { + Strong, + BoundedStaleness(Duration), + Eventual, +} + +/// Answers "can this read be served locally?" +pub struct FollowerReadGate { + closed_ts: Arc, + /// Type-erased function that returns true if this node is the + /// leader for the given group. Injection seam — production wraps + /// `MultiRaft::group_statuses`, tests supply a closure. + is_leader_fn: Box bool + Send + Sync>, +} + +impl FollowerReadGate { + pub fn new( + closed_ts: Arc, + is_leader_fn: Box bool + Send + Sync>, + ) -> Self { + Self { + closed_ts, + is_leader_fn, + } + } + + /// Returns `true` if the read can be served from this node's + /// local replica without forwarding to the leader. 
+ pub fn can_serve_locally(&self, group_id: u64, level: ReadLevel) -> bool { + match level { + ReadLevel::Strong => (self.is_leader_fn)(group_id), + ReadLevel::Eventual => true, + ReadLevel::BoundedStaleness(max) => { + if (self.is_leader_fn)(group_id) { + return true; + } + self.closed_ts.is_fresh_enough(group_id, max) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn gate(leader_groups: &'static [u64]) -> FollowerReadGate { + FollowerReadGate::new( + Arc::new(ClosedTimestampTracker::new()), + Box::new(move |gid| leader_groups.contains(&gid)), + ) + } + + fn gate_with_tracker( + leader_groups: &'static [u64], + tracker: Arc, + ) -> FollowerReadGate { + FollowerReadGate::new(tracker, Box::new(move |gid| leader_groups.contains(&gid))) + } + + #[test] + fn strong_requires_leader() { + let g = gate(&[1]); + assert!(g.can_serve_locally(1, ReadLevel::Strong)); + assert!(!g.can_serve_locally(2, ReadLevel::Strong)); + } + + #[test] + fn eventual_always_local() { + let g = gate(&[]); + assert!(g.can_serve_locally(99, ReadLevel::Eventual)); + } + + #[test] + fn bounded_staleness_leader_always_local() { + let g = gate(&[1]); + assert!(g.can_serve_locally(1, ReadLevel::BoundedStaleness(Duration::from_secs(5)))); + } + + #[test] + fn bounded_staleness_follower_fresh_enough() { + let tracker = Arc::new(ClosedTimestampTracker::new()); + tracker.mark_applied(2); + let g = gate_with_tracker(&[], tracker); + assert!(g.can_serve_locally(2, ReadLevel::BoundedStaleness(Duration::from_secs(5)))); + } + + #[test] + fn bounded_staleness_follower_too_stale() { + let tracker = Arc::new(ClosedTimestampTracker::new()); + let old = std::time::Instant::now() - Duration::from_secs(30); + tracker.mark_applied_at(2, old); + let g = gate_with_tracker(&[], tracker); + assert!(!g.can_serve_locally(2, ReadLevel::BoundedStaleness(Duration::from_secs(5)))); + } + + #[test] + fn bounded_staleness_unknown_group_not_local() { + let g = gate(&[]); + assert!(!g.can_serve_locally(99, 
ReadLevel::BoundedStaleness(Duration::from_secs(5)))); + } +} diff --git a/nodedb-cluster/src/health.rs b/nodedb-cluster/src/health.rs index e27e8137..654b4f56 100644 --- a/nodedb-cluster/src/health.rs +++ b/nodedb-cluster/src/health.rs @@ -151,14 +151,38 @@ impl HealthMonitor { } } - /// Handle a successful pong — reset failure count, mark node Active if needed. - fn handle_pong(&self, peer_id: u64, _pong: &PongResponse) -> bool { + /// Handle a successful pong — reset failure count, mark node Active + /// if needed, and push topology if the peer is behind. + fn handle_pong(&self, peer_id: u64, pong: &PongResponse) -> bool { // Reset failure count. { let mut failures = self.ping_failures.lock().unwrap_or_else(|p| p.into_inner()); failures.remove(&peer_id); } + // Push topology to peers with a stale version. This closes + // the convergence gap when the fire-and-forget broadcast + // during the join flow is lost (e.g. peer QUIC server not + // yet accepting at that instant). + let our_version = { + let topo = self.topology.read().unwrap_or_else(|p| p.into_inner()); + topo.version() + }; + if pong.topology_version < our_version { + debug!( + peer_id, + peer_version = pong.topology_version, + our_version, + "peer has stale topology, pushing update" + ); + let transport = self.transport.clone(); + let topology = self.topology.clone(); + let self_id = self.node_id; + tokio::spawn(async move { + broadcast_topology_to_peer(self_id, peer_id, &topology, &transport).await; + }); + } + // If node was not Active, mark it Active. let mut topo = self.topology.write().unwrap_or_else(|p| p.into_inner()); if let Some(node) = topo.get_node(peer_id) @@ -264,6 +288,34 @@ pub fn broadcast_topology( } } +/// Send a topology update to a single peer that has a stale version. 
+async fn broadcast_topology_to_peer( + _self_node_id: u64, + peer_id: u64, + topology: &RwLock, + transport: &NexarTransport, +) { + let update = { + let topo = topology.read().unwrap_or_else(|p| p.into_inner()); + RaftRpc::TopologyUpdate(TopologyUpdate { + version: topo.version(), + nodes: topo + .all_nodes() + .map(|n| JoinNodeInfo { + node_id: n.node_id, + addr: n.addr.clone(), + state: n.state.as_u8(), + raft_groups: n.raft_groups.clone(), + wire_version: n.wire_version, + }) + .collect(), + }) + }; + if let Err(e) = transport.send_rpc(peer_id, update).await { + debug!(peer_id, error = %e, "targeted topology push failed"); + } +} + /// Handle an incoming Ping RPC — return a Pong with our topology version. pub fn handle_ping(node_id: u64, topology_version: u64, _req: &PingRequest) -> RaftRpc { RaftRpc::Pong(PongResponse { diff --git a/nodedb-cluster/src/lib.rs b/nodedb-cluster/src/lib.rs index ea340b52..8909a79d 100644 --- a/nodedb-cluster/src/lib.rs +++ b/nodedb-cluster/src/lib.rs @@ -1,9 +1,11 @@ pub mod bootstrap; pub mod catalog; pub mod circuit_breaker; +pub mod closed_timestamp; pub mod cluster_info; pub mod conf_change; pub mod cross_shard_txn; +pub mod decommission; pub mod distributed_document; pub mod distributed_graph; pub mod distributed_join; @@ -11,6 +13,7 @@ pub mod distributed_spatial; pub mod distributed_timeseries; pub mod distributed_vector; pub mod error; +pub mod follower_read; pub mod forward; pub mod ghost; pub mod ghost_sweeper; @@ -25,10 +28,13 @@ pub mod quic_transport; pub mod raft_loop; pub mod raft_storage; pub mod rdma_transport; +pub mod reachability; pub mod readiness; pub mod rebalance; pub mod rebalance_scheduler; +pub mod rebalancer; pub mod routing; +pub mod routing_liveness; pub mod rpc_codec; pub mod shard_split; pub mod swim; @@ -39,11 +45,17 @@ pub mod wire; pub use bootstrap::{ClusterConfig, ClusterState, JoinRetryPolicy, start_cluster}; pub use catalog::ClusterCatalog; +pub use closed_timestamp::ClosedTimestampTracker; 
pub use cluster_info::{ ClusterInfoSnapshot, ClusterObserver, GroupSnapshot, GroupStatusProvider, PeerSnapshot, }; pub use conf_change::{ConfChange, ConfChangeType}; +pub use decommission::{ + DecommissionCoordinator, DecommissionObserver, DecommissionPlan, DecommissionRunResult, + DecommissionSafetyError, MetadataProposer, check_can_decommission, plan_full_decommission, +}; pub use error::{ClusterError, Result}; +pub use follower_read::{FollowerReadGate, ReadLevel}; pub use forward::{NoopPlanExecutor, PlanExecutor}; pub use ghost::{GhostStub, GhostTable}; pub use health::{HealthConfig, HealthMonitor}; @@ -54,8 +66,17 @@ pub use migration_executor::{ }; pub use multi_raft::{GroupStatus, MultiRaft}; pub use raft_loop::{CommitApplier, RaftLoop, VShardEnvelopeHandler}; +pub use reachability::{ + NoopProber, ReachabilityDriver, ReachabilityDriverConfig, ReachabilityProber, TransportProber, +}; pub use rebalance::{RebalancePlan, compute_plan, plan_to_requests}; +pub use rebalancer::{ + AlwaysReadyGate, ElectionGate, LoadMetrics, LoadMetricsProvider, LoadWeights, + MigrationDispatcher, RebalancerKickHook, RebalancerLoop, RebalancerLoopConfig, + RebalancerPlanConfig, compute_load_based_plan, normalized_score, +}; pub use routing::RoutingTable; +pub use routing_liveness::{NodeIdResolver, RoutingLivenessHook}; pub use rpc_codec::RaftRpc; pub use topology::{ClusterTopology, NodeInfo, NodeState}; pub use transport::{NexarTransport, RaftRpcHandler}; @@ -78,7 +99,8 @@ pub use lifecycle::{ pub use rdma_transport::{RdmaConfig, RdmaTransport}; pub use rebalance_scheduler::{NodeMetrics, RebalanceScheduler, RebalanceTrigger, SchedulerConfig}; pub use shard_split::{SplitPlan, SplitStrategy, plan_graph_split, plan_vector_split}; +pub use swim::bootstrap::spawn_with_subscribers as spawn_swim_with_subscribers; pub use swim::{ - Incarnation, Member, MemberState, MembershipList, SwimConfig, SwimError, SwimHandle, - UdpTransport, spawn as spawn_swim, + Incarnation, Member, MemberState, 
MembershipList, MembershipSubscriber, SwimConfig, SwimError, + SwimHandle, UdpTransport, spawn as spawn_swim, }; diff --git a/nodedb-cluster/src/lifecycle.rs b/nodedb-cluster/src/lifecycle.rs index 28dd6bd2..43966b8c 100644 --- a/nodedb-cluster/src/lifecycle.rs +++ b/nodedb-cluster/src/lifecycle.rs @@ -15,7 +15,7 @@ use tracing::{info, warn}; use crate::error::{ClusterError, Result}; -use crate::metadata_group::{MetadataEntry, RoutingChange, TopologyChange}; +use crate::metadata_group::{MetadataEntry, TopologyChange}; use crate::routing::RoutingTable; use crate::topology::{ClusterTopology, NodeInfo, NodeState}; @@ -27,55 +27,34 @@ pub struct DecommissionResult { pub completed: bool, } -/// Plan a node decommission: compute which vShards to migrate and where. -/// -/// Produces a sequence of [`MetadataEntry`] values to be proposed against -/// the metadata Raft group in order. Steps: -/// 1. Start decommission (topology transition). -/// 2. Transfer leadership of all Raft groups led by this node. +/// Plan a node decommission — thin wrapper over +/// [`crate::decommission::plan_full_decommission`] that returns the +/// full ordered sequence of metadata entries. Kept as a public +/// convenience for older call sites; new code should use the +/// `decommission` module directly. pub fn plan_decommission( node_id: u64, topology: &ClusterTopology, routing: &RoutingTable, ) -> Result> { - let node = topology.get_node(node_id).ok_or(ClusterError::Transport { - detail: format!("node {node_id} not found in topology"), - })?; - - if node.state == NodeState::Decommissioned { - return Err(ClusterError::Transport { - detail: format!("node {node_id} is already decommissioned"), - }); - } - - let mut entries = Vec::new(); - - // Step 1: Start decommission. - entries.push(MetadataEntry::TopologyChange( - TopologyChange::StartDecommission { node_id }, - )); - - // Step 2: Leadership transfers for groups led by this node. 
- for group_id in routing.group_ids() { - if let Some(info) = routing.group_info(group_id) - && info.leader == node_id - && let Some(&new_leader) = info.members.iter().find(|&&m| m != node_id) - { - entries.push(MetadataEntry::RoutingChange( - RoutingChange::LeadershipTransfer { - group_id, - new_leader_node_id: new_leader, - }, - )); - } - } - + // Historical callers assumed the full-cluster RF; derive a safe + // lower bound from the smallest existing group so the check is + // never stricter than the cluster is already running under. + let rf = routing + .group_members() + .values() + .map(|info| info.members.len()) + .min() + .unwrap_or(1) + .saturating_sub(1) + .max(1); + let plan = crate::decommission::plan_full_decommission(node_id, topology, routing, rf)?; info!( node_id, - metadata_entries = entries.len(), + metadata_entries = plan.entries.len(), "decommission plan computed" ); - Ok(entries) + Ok(plan.entries) } /// Check if a node can be safely removed from the cluster. diff --git a/nodedb-cluster/src/metadata_group/applier.rs b/nodedb-cluster/src/metadata_group/applier.rs index fd169f04..46d59549 100644 --- a/nodedb-cluster/src/metadata_group/applier.rs +++ b/nodedb-cluster/src/metadata_group/applier.rs @@ -1,12 +1,16 @@ //! [`MetadataApplier`] trait: the contract raft_loop uses to dispatch //! committed entries on the metadata group (group 0). +use std::net::SocketAddr; use std::sync::{Arc, RwLock}; use tracing::warn; use crate::metadata_group::cache::MetadataCache; use crate::metadata_group::codec::decode_entry; +use crate::metadata_group::entry::{MetadataEntry, RoutingChange, TopologyChange}; +use crate::routing::RoutingTable; +use crate::topology::{ClusterTopology, NodeInfo, NodeState}; /// Applies committed metadata entries to local state. /// @@ -29,16 +33,123 @@ pub trait MetadataApplier: Send + Sync + 'static { #[derive(Clone)] pub struct CacheApplier { cache: Arc>, + /// Optional live topology handle. 
When set, committed + /// `TopologyChange` entries mutate this handle in place so the + /// rest of the process sees the new state immediately — decommission + /// state transitions, joiner promotion, and `Leave` removal all + /// flow through here. + live_topology: Option>>, + /// Optional live routing table handle. When set, committed + /// `RoutingChange` entries (leadership transfer, member removal, + /// vshard reassignment) mutate this handle in place. + live_routing: Option>>, } impl CacheApplier { pub fn new(cache: Arc>) -> Self { - Self { cache } + Self { + cache, + live_topology: None, + live_routing: None, + } + } + + /// Extend this applier with live topology/routing handles. When + /// set, committed `TopologyChange` and `RoutingChange` entries + /// mutate the handles in place in addition to the in-memory + /// history log kept in `MetadataCache`. Backward-compatible: + /// existing callers that don't attach handles see no behaviour + /// change. + pub fn with_live_state( + mut self, + topology: Arc>, + routing: Arc>, + ) -> Self { + self.live_topology = Some(topology); + self.live_routing = Some(routing); + self } pub fn cache(&self) -> Arc> { self.cache.clone() } + + /// Mutate the live topology handle (if attached) in response to + /// a committed `TopologyChange`. Silent no-op when no handle is + /// set — backward-compatible with older test wiring. 
+ fn apply_topology_change(&self, change: &TopologyChange) { + let Some(live) = &self.live_topology else { + return; + }; + let mut topo = live.write().unwrap_or_else(|p| p.into_inner()); + match change { + TopologyChange::Join { node_id, addr } => { + if topo.contains(*node_id) { + return; + } + let parsed: SocketAddr = addr.parse().unwrap_or_else(|_| { + warn!(node_id, addr, "join: invalid address, using placeholder"); + SocketAddr::from(([0, 0, 0, 0], 0)) + }); + topo.join_as_learner(NodeInfo::new(*node_id, parsed, NodeState::Joining)); + } + TopologyChange::PromoteToVoter { node_id } => { + topo.promote_to_voter(*node_id); + } + TopologyChange::StartDecommission { node_id } => { + topo.set_state(*node_id, NodeState::Draining); + } + TopologyChange::FinishDecommission { node_id } => { + topo.set_state(*node_id, NodeState::Decommissioned); + } + TopologyChange::Leave { node_id } => { + topo.remove_node(*node_id); + } + } + } + + /// Cascade live-state mutations for a committed entry. Handles + /// `Batch` by recursing into each sub-entry. + fn cascade_live_state(&self, entry: &MetadataEntry) { + match entry { + MetadataEntry::TopologyChange(change) => self.apply_topology_change(change), + MetadataEntry::RoutingChange(change) => self.apply_routing_change(change), + MetadataEntry::Batch { entries } => { + for sub in entries { + self.cascade_live_state(sub); + } + } + _ => {} + } + } + + /// Mutate the live routing handle (if attached) in response to + /// a committed `RoutingChange`. 
+ fn apply_routing_change(&self, change: &RoutingChange) { + let Some(live) = &self.live_routing else { + return; + }; + let mut rt = live.write().unwrap_or_else(|p| p.into_inner()); + match change { + RoutingChange::ReassignVShard { + vshard_id, + new_group_id, + new_leaseholder_node_id, + } => { + rt.reassign_vshard(*vshard_id, *new_group_id); + rt.set_leader(*new_group_id, *new_leaseholder_node_id); + } + RoutingChange::LeadershipTransfer { + group_id, + new_leader_node_id, + } => { + rt.set_leader(*group_id, *new_leader_node_id); + } + RoutingChange::RemoveMember { group_id, node_id } => { + rt.remove_group_member(*group_id, *node_id); + } + } + } } impl MetadataApplier for CacheApplier { @@ -54,7 +165,10 @@ impl MetadataApplier for CacheApplier { continue; } match decode_entry(data) { - Ok(entry) => guard.apply(*index, &entry), + Ok(entry) => { + guard.apply(*index, &entry); + self.cascade_live_state(&entry); + } Err(e) => warn!(index = *index, error = %e, "metadata decode failed"), } } @@ -120,6 +234,72 @@ mod tests { assert_eq!(guard.catalog_entries_applied, 1); } + #[test] + fn cache_applier_mutates_live_topology_on_start_decommission() { + use crate::topology::{ClusterTopology, NodeInfo, NodeState}; + use std::net::SocketAddr; + + let cache = Arc::new(RwLock::new(MetadataCache::new())); + let mut t = ClusterTopology::new(); + let addr: SocketAddr = "127.0.0.1:9000".parse().unwrap(); + t.add_node(NodeInfo::new(7, addr, NodeState::Active)); + let topology = Arc::new(RwLock::new(t)); + let routing = Arc::new(RwLock::new(crate::routing::RoutingTable::uniform( + 1, + &[7], + 1, + ))); + let applier = + CacheApplier::new(cache.clone()).with_live_state(topology.clone(), routing.clone()); + + let bytes = encode_entry(&MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id: 7 }, + )) + .unwrap(); + applier.apply(&[(1, bytes)]); + + let topo = topology.read().unwrap(); + assert_eq!(topo.get_node(7).unwrap().state, NodeState::Draining); + } + + 
#[test] + fn cache_applier_mutates_live_routing_on_remove_member() { + use crate::metadata_group::entry::RoutingChange; + + let cache = Arc::new(RwLock::new(MetadataCache::new())); + let topology = Arc::new(RwLock::new(crate::topology::ClusterTopology::new())); + let routing = Arc::new(RwLock::new(crate::routing::RoutingTable::uniform( + 1, + &[1, 2, 3], + 3, + ))); + let applier = + CacheApplier::new(cache.clone()).with_live_state(topology.clone(), routing.clone()); + + let bytes = encode_entry(&MetadataEntry::RoutingChange(RoutingChange::RemoveMember { + group_id: 0, + node_id: 2, + })) + .unwrap(); + applier.apply(&[(1, bytes)]); + + let rt = routing.read().unwrap(); + assert!(!rt.group_info(0).unwrap().members.contains(&2)); + } + + #[test] + fn cache_applier_without_live_state_stays_log_only() { + let cache = Arc::new(RwLock::new(MetadataCache::new())); + let applier = CacheApplier::new(cache.clone()); + let bytes = encode_entry(&MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id: 5 }, + )) + .unwrap(); + // Must not panic and must still advance the applied index. + let last = applier.apply(&[(1, bytes)]); + assert_eq!(last, 1); + } + #[test] fn noop_applier_advances_watermark() { let noop = NoopMetadataApplier; diff --git a/nodedb-cluster/src/metadata_group/cache.rs b/nodedb-cluster/src/metadata_group/cache.rs index 24f7a4ba..23ae959c 100644 --- a/nodedb-cluster/src/metadata_group/cache.rs +++ b/nodedb-cluster/src/metadata_group/cache.rs @@ -106,6 +106,11 @@ impl MetadataCache { } } MetadataEntry::DescriptorDrainEnd { .. 
} => {} + MetadataEntry::Batch { entries } => { + for sub in entries { + self.apply(index, sub); + } + } } } } diff --git a/nodedb-cluster/src/metadata_group/entry.rs b/nodedb-cluster/src/metadata_group/entry.rs index 301a9bf8..8c8c18a2 100644 --- a/nodedb-cluster/src/metadata_group/entry.rs +++ b/nodedb-cluster/src/metadata_group/entry.rs @@ -39,6 +39,14 @@ pub enum MetadataEntry { payload: Vec, }, + /// Atomic batch of metadata entries proposed by a transactional + /// DDL session (`BEGIN; CREATE ...; CREATE ...; COMMIT;`). The + /// applier unpacks and applies each sub-entry in order at a + /// single raft log index, so either all commit or none do. + Batch { + entries: Vec, + }, + // ── Topology / routing ───────────────────────────────────────────── TopologyChange(TopologyChange), RoutingChange(RoutingChange), @@ -123,4 +131,11 @@ pub enum RoutingChange { group_id: u64, new_leader_node_id: u64, }, + /// Remove a node from a Raft group's member and learner sets. + /// + /// Used by the decommission flow to strip a draining node out of + /// every group it belongs to. Proposing this is only safe once + /// `safety::check_can_decommission` has confirmed the group will + /// still satisfy the configured replication factor. 
+ RemoveMember { group_id: u64, node_id: u64 }, } diff --git a/nodedb-cluster/src/migration_executor.rs b/nodedb-cluster/src/migration_executor.rs index 9caeee80..18ea6b18 100644 --- a/nodedb-cluster/src/migration_executor.rs +++ b/nodedb-cluster/src/migration_executor.rs @@ -16,8 +16,10 @@ use std::time::Duration; use tracing::{debug, info}; use crate::conf_change::{ConfChange, ConfChangeType}; +use crate::decommission::MetadataProposer; use crate::error::{ClusterError, Result}; use crate::ghost::{GhostStub, GhostTable}; +use crate::metadata_group::{MetadataEntry, RoutingChange}; use crate::migration::{MigrationPhase, MigrationState}; use crate::multi_raft::MultiRaft; use crate::routing::RoutingTable; @@ -65,6 +67,13 @@ pub struct MigrationExecutor { topology: Arc>, transport: Arc, ghost_table: Arc>, + /// Optional metadata proposer for replicated routing updates. + /// When set, Phase 3 cut-over proposes a `RoutingChange` through + /// the metadata Raft group so every node applies the routing + /// update atomically on commit. When `None`, falls back to + /// local-only routing mutation (used by tests that don't stand + /// up a metadata group). + metadata_proposer: Option>, } impl MigrationExecutor { @@ -80,9 +89,17 @@ impl MigrationExecutor { topology, transport, ghost_table: Arc::new(Mutex::new(GhostTable::new())), + metadata_proposer: None, } } + /// Attach a metadata proposer for replicated Phase 3 cut-over. + /// Production wiring calls this; tests may omit it for simplicity. + pub fn with_metadata_proposer(mut self, proposer: Arc) -> Self { + self.metadata_proposer = Some(proposer); + self + } + /// Access the ghost table (for scatter-gather resolution). pub fn ghost_table(&self) -> &Arc> { &self.ghost_table @@ -180,9 +197,11 @@ impl MigrationExecutor { "phase 1: adding target to raft group" ); - // Add target node as a voter to the Raft group via ConfChange. 
+ // Add target node as a LEARNER so it can catch up via Raft + // replication without participating in elections or voting. + // Promotion to voter happens after Phase 2 confirms catch-up. let change = ConfChange { - change_type: ConfChangeType::AddNode, + change_type: ConfChangeType::AddLearner, node_id: req.target_node, }; @@ -202,12 +221,13 @@ impl MigrationExecutor { // The ConfChange will be replicated and applied. The target node // receives the full log through Raft's normal replication. - // Mark base copy as complete immediately — Raft handles the transfer. + // Mark base copy as complete — Raft replication is now in + // progress; the real progress signal is match_index in Phase 2. state.update_base_copy(committed); debug!( vshard = req.vshard_id, - "phase 1 complete: target added to raft group" + "phase 1 complete: target added as learner to raft group" ); Ok(()) @@ -313,9 +333,21 @@ impl MigrationExecutor { state.update_wal_catchup(leader_commit, target_match); if state.is_catchup_ready() { + // Learner has caught up — promote to voter so the + // group has enough replicas for a safe cut-over. + let promote = ConfChange { + change_type: ConfChangeType::PromoteLearner, + node_id: req.target_node, + }; + { + let mut mr = self.multi_raft.lock().unwrap_or_else(|p| p.into_inner()); + mr.propose_conf_change(group_id, &promote)?; + } debug!( vshard = req.vshard_id, - leader_commit, target_match, "phase 2 complete: target caught up" + leader_commit, + target_match, + "phase 2 complete: target caught up and promoted to voter" ); return Ok(()); } @@ -331,15 +363,20 @@ impl MigrationExecutor { } } - /// Phase 3: Atomic routing table update via Raft. + /// Phase 3: Atomic routing table update. + /// + /// When a [`MetadataProposer`] is attached, the cut-over proposes + /// a `LeadershipTransfer` through the metadata Raft group so + /// every node applies the routing update atomically on commit. 
+ /// Without a proposer (tests), falls back to a local-only + /// mutation. async fn phase3_cutover( &self, state: &mut MigrationState, group_id: u64, req: &MigrationRequest, ) -> Result<()> { - // Estimate pause (time to propose + commit the routing update). - let estimated_pause_us = 10_000; // ~10ms estimate for Raft round-trip. + let estimated_pause_us = 10_000; state.start_cutover(estimated_pause_us).map_err(|e| { state.fail(format!("cutover rejected: {e}")); @@ -353,28 +390,23 @@ impl MigrationExecutor { estimated_pause_us, "phase 3: atomic cut-over" ); - // Propose the routing update as a Raft entry so all nodes apply it - // atomically when committed. The entry is serialized as a ConfChange - // with a special routing marker that the applier interprets. - let routing_change = ConfChange { - change_type: ConfChangeType::AddNode, - node_id: req.target_node, - }; - { - let mut mr = self.multi_raft.lock().unwrap_or_else(|p| p.into_inner()); - mr.propose_conf_change(group_id, &routing_change)?; - } - - // Update the local routing table. Other nodes update theirs when they - // apply the committed entry through their own applier. - { + // Propose the routing change. With a metadata proposer the + // `CacheApplier::with_live_state` on every node handles the + // actual routing mutation when the entry commits; without a + // proposer we mutate locally for backward-compat. + if let Some(proposer) = &self.metadata_proposer { + let entry = MetadataEntry::RoutingChange(RoutingChange::LeadershipTransfer { + group_id, + new_leader_node_id: req.target_node, + }); + proposer.propose_and_wait(entry).await?; + } else { let mut routing = self.routing.write().unwrap_or_else(|p| p.into_inner()); - routing.reassign_vshard(req.vshard_id, group_id); + routing.set_leader(group_id, req.target_node); } - // Install ghost stub on source so scatter-gather queries that arrive - // before the client refreshes its routing table are transparently - // forwarded to the new owner. 
+ // Ghost stub so in-flight scatter-gather queries that still + // target the old leader are transparently forwarded. { let mut ghosts = self.ghost_table.lock().unwrap_or_else(|p| p.into_inner()); ghosts.insert(GhostStub { @@ -387,18 +419,13 @@ impl MigrationExecutor { .as_millis() as u64, }); } - debug!( - vshard = req.vshard_id, - target = req.target_node, - "ghost stub registered for transparent forwarding" - ); let actual_pause_us = cutover_start.elapsed().as_micros() as u64; state.complete(actual_pause_us); debug!( vshard = req.vshard_id, - actual_pause_us, "phase 3 complete: routing updated via raft" + actual_pause_us, "phase 3 complete: routing updated" ); Ok(()) @@ -521,14 +548,14 @@ mod tests { write_pause_budget_us: 500_000, }; - // Phase 1 should succeed (adds node 2 to group 0). + // Phase 1 should succeed (adds node 2 as learner to group 0). executor .phase1_base_copy(&mut state, 0, &req) .await .unwrap(); - // Verify: the ConfChange was proposed (it's in the Raft log). - // The actual application happens when committed, which requires tick(). + // Verify: the ConfChange (AddLearner) was proposed in the Raft log. + // Application happens on next tick/commit cycle. 
} #[test] diff --git a/nodedb-cluster/src/multi_raft/core.rs b/nodedb-cluster/src/multi_raft/core.rs index 9aa60bc4..72029096 100644 --- a/nodedb-cluster/src/multi_raft/core.rs +++ b/nodedb-cluster/src/multi_raft/core.rs @@ -77,8 +77,8 @@ impl MultiRaft { node_id, groups: HashMap::new(), routing, - election_timeout_min: Duration::from_millis(150), - election_timeout_max: Duration::from_millis(300), + election_timeout_min: Duration::from_secs(2), + election_timeout_max: Duration::from_secs(5), heartbeat_interval: Duration::from_millis(50), data_dir, } diff --git a/nodedb-cluster/src/raft_loop/loop_core.rs b/nodedb-cluster/src/raft_loop/loop_core.rs index e73787dc..ed1ccd98 100644 --- a/nodedb-cluster/src/raft_loop/loop_core.rs +++ b/nodedb-cluster/src/raft_loop/loop_core.rs @@ -580,35 +580,38 @@ mod tests { let sr1h = shutdown_tx.subscribe(); tokio::spawn(async move { t1.serve(rl1_h, sr1h).await }); - tokio::time::sleep(Duration::from_millis(200)).await; - - assert!( - rl1.applier.count() >= 1, - "node 1 should have committed at least the no-op, got {}", - rl1.applier.count() - ); + // Poll until node 1 commits at least the no-op (election done). 
+ let deadline = tokio::time::Instant::now() + Duration::from_secs(5); + loop { + if rl1.applier.count() >= 1 { + break; + } + assert!( + tokio::time::Instant::now() < deadline, + "node 1 should have committed at least the no-op, got {}", + rl1.applier.count() + ); + tokio::time::sleep(Duration::from_millis(20)).await; + } let (_gid, idx) = rl1.propose(0, b"distributed-cmd".to_vec()).unwrap(); assert!(idx >= 2); - tokio::time::sleep(Duration::from_millis(200)).await; - - assert!( - rl1.applier.count() >= 2, - "node 1: expected >= 2 applied, got {}", - rl1.applier.count() - ); - - assert!( - rl2.applier.count() >= 1, - "node 2: expected >= 1 applied, got {}", - rl2.applier.count() - ); - assert!( - rl3.applier.count() >= 1, - "node 3: expected >= 1 applied, got {}", - rl3.applier.count() - ); + // Poll until all nodes replicate the proposed command. + let deadline = tokio::time::Instant::now() + Duration::from_secs(5); + loop { + if rl1.applier.count() >= 2 && rl2.applier.count() >= 1 && rl3.applier.count() >= 1 { + break; + } + assert!( + tokio::time::Instant::now() < deadline, + "replication timed out: n1={}, n2={}, n3={}", + rl1.applier.count(), + rl2.applier.count(), + rl3.applier.count() + ); + tokio::time::sleep(Duration::from_millis(20)).await; + } shutdown_tx.send(true).unwrap(); } diff --git a/nodedb-cluster/src/reachability/driver.rs b/nodedb-cluster/src/reachability/driver.rs new file mode 100644 index 00000000..b677ba0c --- /dev/null +++ b/nodedb-cluster/src/reachability/driver.rs @@ -0,0 +1,220 @@ +//! [`ReachabilityDriver`] — periodic open-breaker probe loop. +//! +//! Every `interval`, the driver asks the shared [`CircuitBreaker`] +//! for its currently-Open peer set and fires a probe at each via the +//! injected [`ReachabilityProber`]. Probes run in parallel via +//! `tokio::spawn` so a slow peer never blocks the next one. Probe +//! results are intentionally ignored: the production `TransportProber` +//! 
routes through `NexarTransport::send_rpc`, which already walks the +//! circuit breaker's `check → record_success|record_failure` path, so +//! the driver does not need to bookkeep anything itself. +//! +//! Shutdown is cooperative via `tokio::sync::watch`. On `true` the +//! run loop breaks at the next tick or immediately if it is waiting. + +use std::sync::Arc; +use std::time::Duration; + +use tokio::sync::watch; +use tokio::time::{MissedTickBehavior, interval}; +use tracing::{debug, trace}; + +use crate::circuit_breaker::CircuitBreaker; + +use super::prober::ReachabilityProber; + +/// Configuration for the reachability driver. +#[derive(Debug, Clone)] +pub struct ReachabilityDriverConfig { + /// Period between open-peer sweeps. Defaults to 30 s in + /// production; tests override to milliseconds. + pub interval: Duration, +} + +impl Default for ReachabilityDriverConfig { + fn default() -> Self { + Self { + interval: Duration::from_secs(30), + } + } +} + +/// Drives periodic reachability probes against every Open-state peer. +pub struct ReachabilityDriver { + breaker: Arc, + prober: Arc, + cfg: ReachabilityDriverConfig, +} + +impl ReachabilityDriver { + pub fn new( + breaker: Arc, + prober: Arc, + cfg: ReachabilityDriverConfig, + ) -> Self { + Self { + breaker, + prober, + cfg, + } + } + + /// Run the driver until `shutdown` flips to `true`. + pub async fn run(self: Arc, mut shutdown: watch::Receiver) { + let mut tick = interval(self.cfg.interval); + // Skip the immediate first tick so the first probe fires one + // full interval after start. Otherwise every process restart + // would stampede every open breaker at once. + tick.set_missed_tick_behavior(MissedTickBehavior::Delay); + tick.tick().await; + loop { + tokio::select! 
{ + biased; + changed = shutdown.changed() => { + if changed.is_ok() && *shutdown.borrow() { + break; + } + } + _ = tick.tick() => { + self.sweep_once().await; + } + } + } + debug!("reachability driver shutting down"); + } + + /// Single sweep — exposed for tests that drive the loop manually. + pub async fn sweep_once(&self) { + let open = self.breaker.open_peers(); + if open.is_empty() { + return; + } + trace!(count = open.len(), "reachability sweep: probing open peers"); + for peer in open { + let prober = Arc::clone(&self.prober); + tokio::spawn(async move { + let _ = prober.probe(peer).await; + }); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::circuit_breaker::CircuitBreakerConfig; + use async_trait::async_trait; + use std::sync::Mutex; + + struct RecordingProber { + calls: Mutex>, + } + + impl RecordingProber { + fn new() -> Arc { + Arc::new(Self { + calls: Mutex::new(Vec::new()), + }) + } + fn take(&self) -> Vec { + let mut g = self.calls.lock().unwrap(); + let out = g.clone(); + g.clear(); + out + } + } + + #[async_trait] + impl ReachabilityProber for RecordingProber { + async fn probe(&self, peer: u64) -> Result<(), crate::error::ClusterError> { + self.calls.lock().unwrap().push(peer); + Ok(()) + } + } + + fn open_breaker() -> Arc { + Arc::new(CircuitBreaker::new(CircuitBreakerConfig { + failure_threshold: 1, + cooldown: Duration::from_secs(60), + })) + } + + #[tokio::test] + async fn sweep_probes_every_open_peer() { + let breaker = open_breaker(); + breaker.record_failure(1); + breaker.record_failure(2); + breaker.record_failure(3); + + let prober = RecordingProber::new(); + let driver = Arc::new(ReachabilityDriver::new( + Arc::clone(&breaker), + prober.clone() as Arc, + ReachabilityDriverConfig { + interval: Duration::from_millis(50), + }, + )); + driver.sweep_once().await; + // Let spawned probe tasks run. 
+ for _ in 0..8 { + tokio::task::yield_now().await; + } + let mut calls = prober.take(); + calls.sort_unstable(); + assert_eq!(calls, vec![1, 2, 3]); + } + + #[tokio::test] + async fn sweep_skips_closed_peers() { + let breaker = open_breaker(); + breaker.record_success(1); // Registers 1 as Closed. + breaker.record_failure(2); // Opens 2. + let prober = RecordingProber::new(); + let driver = Arc::new(ReachabilityDriver::new( + Arc::clone(&breaker), + prober.clone() as Arc, + ReachabilityDriverConfig::default(), + )); + driver.sweep_once().await; + for _ in 0..8 { + tokio::task::yield_now().await; + } + assert_eq!(prober.take(), vec![2]); + } + + #[tokio::test(start_paused = true)] + async fn run_loop_fires_sweeps_on_interval_and_shuts_down() { + let breaker = open_breaker(); + breaker.record_failure(7); + let prober = RecordingProber::new(); + let driver = Arc::new(ReachabilityDriver::new( + Arc::clone(&breaker), + prober.clone() as Arc, + ReachabilityDriverConfig { + interval: Duration::from_millis(100), + }, + )); + let (tx, rx) = watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&driver); + async move { d.run(rx).await } + }); + + // First tick is skipped, second delivers a sweep. + tokio::time::advance(Duration::from_millis(120)).await; + tokio::task::yield_now().await; + tokio::time::advance(Duration::from_millis(120)).await; + tokio::task::yield_now().await; + for _ in 0..16 { + tokio::task::yield_now().await; + } + + assert!( + !prober.take().is_empty(), + "driver never probed in run-loop mode" + ); + + let _ = tx.send(true); + let _ = tokio::time::timeout(Duration::from_millis(500), handle).await; + } +} diff --git a/nodedb-cluster/src/reachability/mod.rs b/nodedb-cluster/src/reachability/mod.rs new file mode 100644 index 00000000..6423b087 --- /dev/null +++ b/nodedb-cluster/src/reachability/mod.rs @@ -0,0 +1,22 @@ +//! Reachability driver — the active half of circuit-breaker recovery. +//! +//! 
`CircuitBreaker` transitions `Open → HalfOpen` only on the next +//! `check()` call. Without periodic traffic to an offline peer, that +//! check never happens and the breaker stays `Open` forever even after +//! the peer has recovered. This module closes that blind spot: +//! +//! - [`ReachabilityDriver`] periodically walks the breaker's open set +//! and sends a lightweight probe RPC to each peer via the existing +//! `send_rpc` path, which drives the normal HalfOpen → Closed / +//! HalfOpen → Open transitions. +//! - [`ReachabilityProber`] is the injection seam: production wraps +//! [`crate::transport::NexarTransport`], tests use a mock. +//! +//! The driver is shutdown-aware (watch channel) and bounded — one +//! probe per open peer per tick, fire-and-forget. + +pub mod driver; +pub mod prober; + +pub use driver::{ReachabilityDriver, ReachabilityDriverConfig}; +pub use prober::{NoopProber, ReachabilityProber, TransportProber}; diff --git a/nodedb-cluster/src/reachability/prober.rs b/nodedb-cluster/src/reachability/prober.rs new file mode 100644 index 00000000..47607e1d --- /dev/null +++ b/nodedb-cluster/src/reachability/prober.rs @@ -0,0 +1,68 @@ +//! [`ReachabilityProber`] — the injection seam for reachability probes. +//! +//! Implementations: +//! +//! - [`TransportProber`] wraps an `Arc` and sends a +//! `RaftRpc::Ping` to the peer. `send_rpc` already handles the +//! circuit-breaker check, the QUIC dial, retries, and +//! `record_success` / `record_failure` — the prober is a one-line +//! adapter. +//! - [`NoopProber`] always succeeds. Useful for tests that only want +//! to verify the loop's tick cadence and shutdown. +//! +//! Tests that want deterministic open→closed transitions construct +//! their own trait impls; see `tests/reachability_loop.rs`. 
+ +use std::sync::Arc; + +use async_trait::async_trait; + +use crate::error::Result; +use crate::rpc_codec::{PingRequest, RaftRpc}; +use crate::transport::NexarTransport; + +/// Abstract probe operation over a single peer. +#[async_trait] +pub trait ReachabilityProber: Send + Sync { + /// Send one probe to `peer`. Returns `Ok(())` iff the probe + /// completed successfully (implying the peer is reachable). + async fn probe(&self, peer: u64) -> Result<()>; +} + +/// Production prober: sends a `Ping` via the live transport. The +/// transport's internal circuit breaker records success/failure +/// automatically — the driver does not need to bookkeep anything. +pub struct TransportProber { + transport: Arc, + self_node_id: u64, +} + +impl TransportProber { + pub fn new(transport: Arc, self_node_id: u64) -> Self { + Self { + transport, + self_node_id, + } + } +} + +#[async_trait] +impl ReachabilityProber for TransportProber { + async fn probe(&self, peer: u64) -> Result<()> { + let rpc = RaftRpc::Ping(PingRequest { + sender_id: self.self_node_id, + topology_version: 0, + }); + self.transport.send_rpc(peer, rpc).await.map(|_| ()) + } +} + +/// Always-succeeds prober for cadence/shutdown tests. +pub struct NoopProber; + +#[async_trait] +impl ReachabilityProber for NoopProber { + async fn probe(&self, _peer: u64) -> Result<()> { + Ok(()) + } +} diff --git a/nodedb-cluster/src/rebalancer/driver.rs b/nodedb-cluster/src/rebalancer/driver.rs new file mode 100644 index 00000000..2150a474 --- /dev/null +++ b/nodedb-cluster/src/rebalancer/driver.rs @@ -0,0 +1,427 @@ +//! Rebalancer driver loop. +//! +//! [`RebalancerLoop`] is the active half of the load-based rebalancer. +//! Every `interval` it walks this sequence: +//! +//! 1. Ask the injected `ElectionGate` whether any raft group is +//! currently mid-election. If so, skip this tick entirely — +//! moves during an election race with the new leader's log and +//! are almost guaranteed to be wasted work. +//! 2. 
Ask the injected [`LoadMetricsProvider`] for a snapshot of +//! every node's current load metrics. +//! 3. Call [`compute_load_based_plan`] against the live routing + +//! topology with the configured plan config. If the plan is +//! empty (cluster within threshold, or no cold candidates), do +//! nothing. +//! 4. Dispatch each planned move through the injected +//! [`MigrationDispatcher`], fire-and-forget. The dispatcher is +//! where the bridge to the production `MigrationExecutor` lives +//! — tests use a mock that records the calls. +//! +//! The loop holds no state of its own; the dispatcher tracks +//! in-flight work and the breaker/scheduler state is on the +//! underlying subsystems. This keeps the driver trivially +//! restartable: crash mid-tick, respawn, resume. + +use std::sync::{Arc, RwLock}; +use std::time::Duration; + +use async_trait::async_trait; +use tokio::sync::{Notify, watch}; +use tokio::time::{MissedTickBehavior, interval}; +use tracing::{debug, info, warn}; + +use crate::error::Result; +use crate::rebalance::PlannedMove; +use crate::routing::RoutingTable; +use crate::topology::ClusterTopology; + +use super::metrics::LoadMetricsProvider; +use super::plan::{RebalancerPlanConfig, compute_load_based_plan}; + +/// Injection seam: tells the driver whether it's safe to dispatch +/// moves. Production wraps a `MultiRaft` status probe; tests return +/// a constant boolean. +#[async_trait] +pub trait ElectionGate: Send + Sync { + /// Return `true` if **any** raft group is currently holding an + /// election (no stable leader). The driver skips its tick when + /// this is `true`. + async fn any_group_electing(&self) -> bool; +} + +/// Permissive gate that never blocks the driver. Useful in tests +/// and in single-node clusters where elections are instantaneous. 
+pub struct AlwaysReadyGate; + +#[async_trait] +impl ElectionGate for AlwaysReadyGate { + async fn any_group_electing(&self) -> bool { + false + } +} + +/// Injection seam: executes a single planned move. Production +/// wraps `MigrationExecutor::execute` and reports success/failure +/// via logging + the tracker; tests record the move. +#[async_trait] +pub trait MigrationDispatcher: Send + Sync { + async fn dispatch(&self, mv: PlannedMove) -> Result<()>; +} + +/// Configuration for [`RebalancerLoop`]. +#[derive(Debug, Clone)] +pub struct RebalancerLoopConfig { + /// Period between rebalance sweeps. Defaults to 30 s. + pub interval: Duration, + /// Plan computation config propagated to + /// [`compute_load_based_plan`] on every tick. + pub plan: RebalancerPlanConfig, + /// CPU utilization threshold (0.0–1.0) above which the + /// rebalancer pauses to avoid amplifying load. If ANY node in + /// the metrics snapshot exceeds this value, the sweep is skipped + /// and a STATUS event is logged. Default 0.80 (80%). + pub backpressure_cpu_threshold: f64, +} + +impl Default for RebalancerLoopConfig { + fn default() -> Self { + Self { + interval: Duration::from_secs(30), + plan: RebalancerPlanConfig::default(), + backpressure_cpu_threshold: 0.80, + } + } +} + +/// The driver itself. +pub struct RebalancerLoop { + cfg: RebalancerLoopConfig, + metrics: Arc, + dispatcher: Arc, + gate: Arc, + routing: Arc>, + topology: Arc>, + /// Membership-change notification. When any caller (a SWIM + /// subscriber, a manual admin trigger, etc.) calls + /// [`notify`](Notify::notify_one) on this handle, the run loop + /// wakes up immediately and runs an extra sweep instead of + /// waiting for the next 30 s tick. 
+ kick: Arc, +} + +impl RebalancerLoop { + pub fn new( + cfg: RebalancerLoopConfig, + metrics: Arc, + dispatcher: Arc, + gate: Arc, + routing: Arc>, + topology: Arc>, + ) -> Self { + Self { + cfg, + metrics, + dispatcher, + gate, + routing, + topology, + kick: Arc::new(Notify::new()), + } + } + + /// Return a handle that callers can use to trigger an immediate + /// sweep. Cloning the `Arc` is cheap; every clone + /// shares the same waker. + pub fn kick_handle(&self) -> Arc { + Arc::clone(&self.kick) + } + + /// Run the driver until `shutdown` flips to `true`. + pub async fn run(self: Arc, mut shutdown: watch::Receiver) { + let mut tick = interval(self.cfg.interval); + tick.set_missed_tick_behavior(MissedTickBehavior::Delay); + // Consume the immediate first tick so the first sweep fires + // a full interval after start. Prevents start-up stampedes + // when many nodes restart together. + tick.tick().await; + loop { + tokio::select! { + biased; + changed = shutdown.changed() => { + if changed.is_ok() && *shutdown.borrow() { + break; + } + } + _ = tick.tick() => { + self.sweep_once().await; + } + _ = self.kick.notified() => { + debug!("rebalancer: membership-change kick received"); + self.sweep_once().await; + } + } + } + debug!("rebalancer loop shutting down"); + } + + /// Run a single sweep. Exposed for tests that drive the loop + /// manually rather than through `run`. 
+ pub async fn sweep_once(&self) { + if self.gate.any_group_electing().await { + debug!("rebalancer: raft election in progress, skipping tick"); + return; + } + let metrics = match self.metrics.snapshot().await { + Ok(m) => m, + Err(e) => { + warn!(error = %e, "rebalancer: failed to collect metrics"); + return; + } + }; + if let Some(hot) = metrics + .iter() + .find(|m| m.cpu_utilization > self.cfg.backpressure_cpu_threshold) + { + info!( + node_id = hot.node_id, + cpu = format!("{:.0}%", hot.cpu_utilization * 100.0), + threshold = format!("{:.0}%", self.cfg.backpressure_cpu_threshold * 100.0), + "rebalancer: back-pressure — cluster under load, skipping sweep" + ); + return; + } + let plan = { + let routing = self.routing.read().unwrap_or_else(|p| p.into_inner()); + let topo = self.topology.read().unwrap_or_else(|p| p.into_inner()); + compute_load_based_plan(&metrics, &routing, &topo, &self.cfg.plan) + }; + if plan.is_empty() { + debug!("rebalancer: no moves needed this tick"); + return; + } + info!( + move_count = plan.len(), + "rebalancer: dispatching planned moves" + ); + for mv in plan { + let dispatcher = Arc::clone(&self.dispatcher); + tokio::spawn(async move { + if let Err(e) = dispatcher.dispatch(mv).await { + warn!(error = %e, "rebalancer: dispatch failed"); + } + }); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::rebalancer::metrics::LoadMetrics; + use crate::topology::{NodeInfo, NodeState}; + use std::net::SocketAddr; + use std::sync::Mutex; + + struct StaticMetrics(Vec); + + #[async_trait] + impl LoadMetricsProvider for StaticMetrics { + async fn snapshot(&self) -> Result> { + Ok(self.0.clone()) + } + } + + struct RecordingDispatcher { + calls: Mutex>, + } + + impl RecordingDispatcher { + fn new() -> Arc { + Arc::new(Self { + calls: Mutex::new(Vec::new()), + }) + } + fn take(&self) -> Vec { + let mut g = self.calls.lock().unwrap(); + let out = g.clone(); + g.clear(); + out + } + } + + #[async_trait] + impl MigrationDispatcher for 
RecordingDispatcher { + async fn dispatch(&self, mv: PlannedMove) -> Result<()> { + self.calls.lock().unwrap().push(mv); + Ok(()) + } + } + + struct BlockingGate(bool); + + #[async_trait] + impl ElectionGate for BlockingGate { + async fn any_group_electing(&self) -> bool { + self.0 + } + } + + fn topo(nodes: &[u64]) -> Arc> { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let a: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + Arc::new(RwLock::new(t)) + } + + fn routing_hot_on(node: u64) -> Arc> { + let mut r = RoutingTable::uniform(6, &[1, 2, 3], 1); + for gid in 0..6 { + r.set_leader(gid, node); + } + Arc::new(RwLock::new(r)) + } + + fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + cpu_utilization: 0.0, + } + } + + fn hot_cluster_loop( + gate: Arc, + ) -> (Arc, Arc) { + let metrics: Arc = Arc::new(StaticMetrics(vec![ + lm(1, 500, 5000, 200.0, 200.0), + lm(2, 5, 5, 5.0, 5.0), + lm(3, 5, 5, 5.0, 5.0), + ])); + let dispatcher = RecordingDispatcher::new(); + let disp_dyn: Arc = dispatcher.clone(); + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig { + interval: Duration::from_millis(50), + ..Default::default() + }, + metrics, + disp_dyn, + gate, + routing_hot_on(1), + topo(&[1, 2, 3]), + )); + (rloop, dispatcher) + } + + #[tokio::test] + async fn sweep_dispatches_moves_when_imbalanced() { + let (rloop, dispatcher) = hot_cluster_loop(Arc::new(AlwaysReadyGate)); + rloop.sweep_once().await; + for _ in 0..16 { + tokio::task::yield_now().await; + } + let calls = dispatcher.take(); + assert!(!calls.is_empty()); + for c in &calls { + assert_eq!(c.source_node, 1); + } + } + + #[tokio::test] + async fn sweep_skipped_during_election() { + let (rloop, dispatcher) = 
hot_cluster_loop(Arc::new(BlockingGate(true))); + rloop.sweep_once().await; + for _ in 0..8 { + tokio::task::yield_now().await; + } + assert!(dispatcher.take().is_empty()); + } + + #[tokio::test] + async fn sweep_noop_on_balanced_cluster() { + let metrics: Arc = Arc::new(StaticMetrics(vec![ + lm(1, 50, 500, 100.0, 100.0), + lm(2, 50, 500, 100.0, 100.0), + lm(3, 50, 500, 100.0, 100.0), + ])); + let dispatcher = RecordingDispatcher::new(); + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig::default(), + metrics, + dispatcher.clone() as Arc, + Arc::new(AlwaysReadyGate), + routing_hot_on(1), + topo(&[1, 2, 3]), + )); + rloop.sweep_once().await; + for _ in 0..8 { + tokio::task::yield_now().await; + } + assert!(dispatcher.take().is_empty()); + } + + #[tokio::test(start_paused = true)] + async fn run_loop_fires_sweeps_and_shuts_down() { + let (rloop, dispatcher) = hot_cluster_loop(Arc::new(AlwaysReadyGate)); + let (tx, rx) = watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&rloop); + async move { d.run(rx).await } + }); + // First tick consumed immediately by run(); advance past a + // couple of real intervals with interleaved yields so the + // run-loop's select + spawned dispatch tasks all get to poll. 
+ for _ in 0..4 { + tokio::time::advance(Duration::from_millis(80)).await; + for _ in 0..16 { + tokio::task::yield_now().await; + } + } + assert!(!dispatcher.take().is_empty()); + + let _ = tx.send(true); + let _ = tokio::time::timeout(Duration::from_millis(500), handle).await; + } + + #[tokio::test] + async fn sweep_skipped_under_cpu_backpressure() { + let metrics: Arc = Arc::new(StaticMetrics(vec![ + LoadMetrics { + node_id: 1, + vshards_led: 500, + bytes_stored: 5000 * 1_048_576, + writes_per_sec: 200.0, + reads_per_sec: 200.0, + cpu_utilization: 0.95, // above 80% threshold + }, + lm(2, 5, 5, 5.0, 5.0), + lm(3, 5, 5, 5.0, 5.0), + ])); + let dispatcher = RecordingDispatcher::new(); + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig { + interval: Duration::from_millis(50), + ..Default::default() + }, + metrics, + dispatcher.clone() as Arc, + Arc::new(AlwaysReadyGate), + routing_hot_on(1), + topo(&[1, 2, 3]), + )); + rloop.sweep_once().await; + for _ in 0..8 { + tokio::task::yield_now().await; + } + assert!( + dispatcher.take().is_empty(), + "dispatcher should not fire when cluster is under CPU backpressure" + ); + } +} diff --git a/nodedb-cluster/src/rebalancer/elastic.rs b/nodedb-cluster/src/rebalancer/elastic.rs new file mode 100644 index 00000000..36903741 --- /dev/null +++ b/nodedb-cluster/src/rebalancer/elastic.rs @@ -0,0 +1,147 @@ +//! Elastic scaling glue — ties SWIM membership transitions to the +//! rebalancer loop so new/departing nodes trigger an immediate sweep +//! instead of waiting for the next 30 s tick. +//! +//! ## Add-node path +//! +//! 1. Node joins via the existing bootstrap/join RPC path. +//! 2. `CacheApplier` with live state applies `TopologyChange::Join` +//! + `PromoteToVoter`, adding the node to the live topology. +//! 3. SWIM detects the new node as `Alive` through gossip. +//! 4. [`RebalancerKickHook`] (a [`MembershipSubscriber`]) fires +//! [`Notify::notify_one`] on the rebalancer loop's kick handle. +//! 5. 
The loop wakes, collects metrics (including the new node's +//! low load score), and dispatches moves to the new node. +//! +//! ## Remove-node path +//! +//! 1. Operator runs `cluster decommission N` (Phase E.4). +//! 2. The decommission flow strips the node from all groups and +//! removes it from topology. +//! 3. SWIM detects the node as `Dead` / `Left`. +//! 4. The same kick hook wakes the rebalancer so it re-evaluates +//! whether the remaining nodes are balanced. +//! +//! No new data types or traits — just a [`MembershipSubscriber`] +//! impl holding a shared `Arc`. + +use std::sync::Arc; + +use nodedb_types::NodeId; +use tokio::sync::Notify; +use tracing::debug; + +use crate::swim::member::MemberState; +use crate::swim::subscriber::MembershipSubscriber; + +/// SWIM [`MembershipSubscriber`] that triggers an immediate +/// rebalancer sweep on membership-relevant transitions. +/// +/// Relevant transitions are: +/// - `None → Alive` (first time a new node is seen — add path) +/// - `_ → Dead` / `_ → Left` (node departure — remove path) +/// - `_ → Alive` after `Dead`/`Left` (node recovery) +/// +/// All other transitions (Alive → Suspect, Suspect → Alive) are +/// transient and do not change the set of Active nodes, so they +/// are ignored. +pub struct RebalancerKickHook { + kick: Arc, +} + +impl RebalancerKickHook { + pub fn new(kick: Arc) -> Self { + Self { kick } + } +} + +impl MembershipSubscriber for RebalancerKickHook { + fn on_state_change(&self, node_id: &NodeId, old: Option, new: MemberState) { + let relevant = match (old, new) { + // First-time insert as Alive (new node joined). + (None, MemberState::Alive) => true, + // Node died or left. + (_, MemberState::Dead) | (_, MemberState::Left) => true, + // Node recovered from Dead/Left back to Alive. 
+ (Some(MemberState::Dead), MemberState::Alive) + | (Some(MemberState::Left), MemberState::Alive) => true, + _ => false, + }; + if relevant { + debug!(?node_id, ?old, ?new, "rebalancer kick: membership change"); + self.kick.notify_one(); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::{AtomicU32, Ordering}; + + fn counting_notify() -> (Arc, Arc, tokio::task::JoinHandle<()>) { + let notify = Arc::new(Notify::new()); + let counter = Arc::new(AtomicU32::new(0)); + let n = notify.clone(); + let c = counter.clone(); + let handle = tokio::spawn(async move { + loop { + n.notified().await; + c.fetch_add(1, Ordering::SeqCst); + } + }); + (notify, counter, handle) + } + + #[tokio::test] + async fn kick_fires_on_new_node_alive() { + let (notify, counter, handle) = counting_notify(); + let hook = RebalancerKickHook::new(notify); + hook.on_state_change(&NodeId::new("new"), None, MemberState::Alive); + tokio::task::yield_now().await; + tokio::task::yield_now().await; + assert!(counter.load(Ordering::SeqCst) >= 1); + handle.abort(); + } + + #[tokio::test] + async fn kick_fires_on_dead() { + let (notify, counter, handle) = counting_notify(); + let hook = RebalancerKickHook::new(notify); + hook.on_state_change( + &NodeId::new("x"), + Some(MemberState::Alive), + MemberState::Dead, + ); + tokio::task::yield_now().await; + tokio::task::yield_now().await; + assert!(counter.load(Ordering::SeqCst) >= 1); + handle.abort(); + } + + #[tokio::test] + async fn kick_fires_on_left() { + let (notify, counter, handle) = counting_notify(); + let hook = RebalancerKickHook::new(notify); + hook.on_state_change( + &NodeId::new("x"), + Some(MemberState::Alive), + MemberState::Left, + ); + tokio::task::yield_now().await; + tokio::task::yield_now().await; + assert!(counter.load(Ordering::SeqCst) >= 1); + handle.abort(); + } + + #[test] + fn kick_does_not_fire_on_suspect() { + let notify = Arc::new(Notify::new()); + let hook = RebalancerKickHook::new(notify); + 
hook.on_state_change( + &NodeId::new("x"), + Some(MemberState::Alive), + MemberState::Suspect, + ); + } +} diff --git a/nodedb-cluster/src/rebalancer/metrics.rs b/nodedb-cluster/src/rebalancer/metrics.rs new file mode 100644 index 00000000..b9c9f5b5 --- /dev/null +++ b/nodedb-cluster/src/rebalancer/metrics.rs @@ -0,0 +1,147 @@ +//! Per-node load metrics and scoring. +//! +//! `LoadMetrics` is the raw per-node observation the rebalancer loop +//! consumes. `normalized_score` folds a `LoadMetrics` plus a set of +//! `LoadWeights` into a single `f64` so different nodes can be +//! compared on one axis — the hotter the score, the more work the +//! node is doing relative to the cluster. +//! +//! Weights are configurable because different workloads care about +//! different dimensions: a write-heavy OLTP cluster wants high +//! `writes` weight, an analytical cluster wants high `bytes` +//! weight, and a very uniform vshard layout wants high `vshards` +//! weight. The defaults (1.0 each) are a balanced starting point. + +use async_trait::async_trait; + +use crate::error::Result; + +/// Raw load observation for a single node. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct LoadMetrics { + pub node_id: u64, + /// Count of vshards this node is currently leading. + pub vshards_led: u32, + /// Total bytes stored across all vshards on this node. + pub bytes_stored: u64, + /// Writes per second (rolling average, caller-defined window). + pub writes_per_sec: f64, + /// Reads per second (rolling average, caller-defined window). + pub reads_per_sec: f64, + /// Per-core CPU utilization (0.0–1.0). Used by the + /// back-pressure gate to pause the rebalancer when the cluster + /// is already stressed. + pub cpu_utilization: f64, +} + +/// Relative weights for the four load dimensions. Scaled linearly; +/// the absolute values don't matter, only their ratios. 
+#[derive(Debug, Clone, Copy)] +pub struct LoadWeights { + pub vshards: f64, + pub bytes: f64, + pub writes: f64, + pub reads: f64, +} + +impl Default for LoadWeights { + fn default() -> Self { + Self { + vshards: 1.0, + bytes: 1.0, + writes: 1.0, + reads: 1.0, + } + } +} + +/// Collapse a `LoadMetrics` observation into a single scalar score +/// using `weights`. Higher = hotter. +/// +/// The implementation is a straightforward weighted sum — each field +/// is scaled by its weight and added. Bytes are divided by a +/// reasonable unit (1 MiB) so the float stays in a comparable range +/// to the per-second rates; otherwise a moderately-sized dataset +/// would swamp the qps signal entirely. +pub fn normalized_score(m: &LoadMetrics, weights: &LoadWeights) -> f64 { + const BYTES_UNIT: f64 = 1_048_576.0; // 1 MiB + weights.vshards * m.vshards_led as f64 + + weights.bytes * (m.bytes_stored as f64 / BYTES_UNIT) + + weights.writes * m.writes_per_sec + + weights.reads * m.reads_per_sec +} + +/// Injection seam for collecting load metrics from every node in the +/// cluster. Production impls talk to the metrics endpoint via the +/// transport; tests inject synthetic values. +#[async_trait] +pub trait LoadMetricsProvider: Send + Sync { + /// Return a snapshot of every known node's current load metrics. + /// The returned slice may be in any order — the rebalancer plan + /// sorts internally for determinism. 
+ async fn snapshot(&self) -> Result>; +} + +#[cfg(test)] +mod tests { + use super::*; + + fn m(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + cpu_utilization: 0.0, + } + } + + #[test] + fn default_weights_are_uniform() { + let w = LoadWeights::default(); + assert_eq!(w.vshards, 1.0); + assert_eq!(w.bytes, 1.0); + assert_eq!(w.writes, 1.0); + assert_eq!(w.reads, 1.0); + } + + #[test] + fn zero_metrics_score_zero() { + let metrics = m(1, 0, 0, 0.0, 0.0); + assert_eq!(normalized_score(&metrics, &LoadWeights::default()), 0.0); + } + + #[test] + fn score_sums_all_dimensions_with_default_weights() { + // 4 vshards + 8 MiB + 2 wps + 3 rps = 17.0 + let metrics = m(1, 4, 8, 2.0, 3.0); + let score = normalized_score(&metrics, &LoadWeights::default()); + assert!((score - 17.0).abs() < 1e-9); + } + + #[test] + fn weights_scale_dimensions_independently() { + let metrics = m(1, 10, 0, 0.0, 0.0); + let w = LoadWeights { + vshards: 5.0, + ..Default::default() + }; + assert!((normalized_score(&metrics, &w) - 50.0).abs() < 1e-9); + } + + #[test] + fn hotter_node_has_higher_score() { + let cold = m(1, 1, 1, 1.0, 1.0); + let hot = m(2, 10, 100, 100.0, 100.0); + let w = LoadWeights::default(); + assert!(normalized_score(&hot, &w) > normalized_score(&cold, &w)); + } + + #[test] + fn bytes_scale_via_mib_unit() { + // 1 MiB with bytes weight = 1.0 contributes 1.0, not 1_048_576. + let metrics = m(1, 0, 1, 0.0, 0.0); + assert!((normalized_score(&metrics, &LoadWeights::default()) - 1.0).abs() < 1e-9); + } +} diff --git a/nodedb-cluster/src/rebalancer/mod.rs b/nodedb-cluster/src/rebalancer/mod.rs new file mode 100644 index 00000000..3374ca06 --- /dev/null +++ b/nodedb-cluster/src/rebalancer/mod.rs @@ -0,0 +1,34 @@ +//! Load-based automatic rebalancer. +//! +//! This module is the *signal* side of the rebalancer: given a +//! 
per-node snapshot of load metrics (vshards led, bytes stored, +//! writes/sec, reads/sec) it computes whether the cluster is +//! imbalanced enough to warrant moves, and emits a bounded plan of +//! vshard migrations from the hottest nodes to the coldest ones. +//! +//! The actual driver loop (`loop_driver.rs`) and the bridge to +//! `MigrationExecutor` land in a follow-up sub-batch. Everything +//! shipped here is pure, side-effect-free, and fully deterministic +//! so it can be unit-tested exhaustively before any tokio task is +//! spawned against it. +//! +//! ## Why a new module +//! +//! The existing [`crate::rebalance_scheduler::RebalanceScheduler`] +//! triggers on CPU utilization, SPSC queue pressure, and shard-count +//! ratio. Those are fast-path overload signals and belong where they +//! are. This module is a distinct, storage-shape-driven rebalancer +//! (bytes + qps + vshard count) with bounded in-flight moves and a +//! 30 s cadence, complementing the overload path. + +pub mod driver; +pub mod elastic; +pub mod metrics; +pub mod plan; + +pub use driver::{ + AlwaysReadyGate, ElectionGate, MigrationDispatcher, RebalancerLoop, RebalancerLoopConfig, +}; +pub use elastic::RebalancerKickHook; +pub use metrics::{LoadMetrics, LoadMetricsProvider, LoadWeights, normalized_score}; +pub use plan::{RebalancerPlanConfig, compute_load_based_plan}; diff --git a/nodedb-cluster/src/rebalancer/plan.rs b/nodedb-cluster/src/rebalancer/plan.rs new file mode 100644 index 00000000..ae68712a --- /dev/null +++ b/nodedb-cluster/src/rebalancer/plan.rs @@ -0,0 +1,366 @@ +//! Load-imbalance plan computation. +//! +//! Given a snapshot of per-node `LoadMetrics` and the current routing +//! table, decide whether the cluster is imbalanced enough to justify +//! moves and, if so, emit a bounded list of `PlannedMove`s from the +//! hottest nodes to the coldest ones. +//! +//! ## Trigger +//! +//! The rebalancer fires when, after normalizing every node's score: +//! +//! 
> `max - min > threshold_pct / 100 * mean` +//! +//! ...i.e. the hottest node is more than `threshold_pct`% above the +//! cluster mean relative to the coldest one. This is intentionally +//! not a per-node check: single-hot-node scenarios below the +//! cluster mean delta are handled by the separate +//! `rebalance_scheduler` CPU/queue triggers. +//! +//! ## Move selection +//! +//! For each hot→cold pair, the planner walks the routing table in +//! stable (sorted by group_id, then vshard_id) order and picks +//! vshards the hot node is currently leading. It caps moves at +//! `max_moves_per_group` moves from any single group (so one +//! over-replicated group can't consume the entire in-flight budget) +//! and at `max_moves_total` across the whole plan (so the dispatcher +//! never has more than that many migrations in flight at once). +//! +//! Determinism: the plan is deterministic given the same inputs, +//! including tie-breaks. Two nodes computing the plan at the same +//! instant produce byte-identical outputs. + +use std::collections::HashMap; + +use tracing::debug; + +use crate::rebalance::PlannedMove; +use crate::routing::RoutingTable; +use crate::topology::ClusterTopology; + +use super::metrics::{LoadMetrics, LoadWeights, normalized_score}; + +/// Configuration for [`compute_load_based_plan`]. +#[derive(Debug, Clone)] +pub struct RebalancerPlanConfig { + /// If `(max - min) > (threshold_pct / 100) * mean`, we plan moves. + /// Default: 20%. + pub imbalance_threshold_pct: u8, + /// Maximum moves from any single Raft group per plan. Default 1. + pub max_moves_per_group: usize, + /// Maximum moves in the entire plan. Default 10. + pub max_moves_total: usize, + /// Weights applied to the load dimensions when scoring. 
+ pub weights: LoadWeights, +} + +impl Default for RebalancerPlanConfig { + fn default() -> Self { + Self { + imbalance_threshold_pct: 20, + max_moves_per_group: 1, + max_moves_total: 10, + weights: LoadWeights::default(), + } + } +} + +/// Compute a load-driven rebalance plan. Returns an empty vector if +/// the cluster is already within the imbalance threshold or if there +/// are fewer than two nodes to compare. +pub fn compute_load_based_plan( + metrics: &[LoadMetrics], + routing: &RoutingTable, + topology: &ClusterTopology, + cfg: &RebalancerPlanConfig, +) -> Vec { + if metrics.len() < 2 { + return Vec::new(); + } + + // Score every node, then sort ascending so the hot list and cold + // list are natural slices. `f64` isn't Ord, so use total_cmp for + // NaN-free deterministic ordering. + let mut scored: Vec<(u64, f64)> = metrics + .iter() + .map(|m| (m.node_id, normalized_score(m, &cfg.weights))) + .collect(); + scored.sort_by(|a, b| a.1.total_cmp(&b.1).then_with(|| a.0.cmp(&b.0))); + + let min = scored.first().map(|(_, s)| *s).unwrap_or(0.0); + let max = scored.last().map(|(_, s)| *s).unwrap_or(0.0); + let mean: f64 = scored.iter().map(|(_, s)| *s).sum::() / scored.len() as f64; + + // Imbalance gate. A zero-mean cluster (everything idle) is + // considered already balanced — nothing to move. + if mean <= 0.0 { + return Vec::new(); + } + let threshold = (cfg.imbalance_threshold_pct as f64 / 100.0) * mean; + if (max - min) <= threshold { + debug!( + max, + min, mean, threshold, "rebalancer: cluster within imbalance threshold" + ); + return Vec::new(); + } + + // Only Active nodes are valid migration targets. Cold candidates + // must be Active and must not already be the source for a move. + let active_set: std::collections::HashSet = + topology.active_nodes().iter().map(|n| n.node_id).collect(); + + // Hot = strictly above mean; cold = strictly below mean. 
Using + // the mean as the split point (rather than index-based halving) + // correctly handles asymmetric distributions where a single + // outlier pulls one node above an otherwise balanced cluster — + // the below-mean nodes stay in the cold set even if they tie + // with each other. + let hot_nodes: Vec = scored + .iter() + .rev() // hottest first + .filter(|(_, s)| *s > mean) + .map(|(id, _)| *id) + .collect(); + let cold_nodes: Vec = scored + .iter() + .filter(|(_, s)| *s < mean) + .filter(|(id, _)| active_set.contains(id)) + .map(|(id, _)| *id) + .collect(); + + if cold_nodes.is_empty() { + return Vec::new(); + } + + // Walk routing in stable order — group id ascending, then vshard + // id ascending — and pick moves until we hit the caps. + let mut group_ids: Vec = routing.group_members().keys().copied().collect(); + group_ids.sort_unstable(); + + let mut moves: Vec = Vec::new(); + let mut per_group_count: HashMap = HashMap::new(); + let mut cold_cursor = 0usize; + + 'outer: for hot in &hot_nodes { + if !active_set.contains(hot) { + continue; + } + for &gid in &group_ids { + if moves.len() >= cfg.max_moves_total { + break 'outer; + } + let info = match routing.group_info(gid) { + Some(i) => i, + None => continue, + }; + if info.leader != *hot { + continue; + } + if *per_group_count.get(&gid).unwrap_or(&0) >= cfg.max_moves_per_group { + continue; + } + // Pick the group's lowest vshard id deterministically. 
+ let mut vshards = routing.vshards_for_group(gid); + vshards.sort_unstable(); + let Some(&vshard_id) = vshards.first() else { + continue; + }; + let target = cold_nodes[cold_cursor % cold_nodes.len()]; + if target == *hot { + continue; + } + moves.push(PlannedMove { + vshard_id, + source_node: *hot, + target_node: target, + source_group: gid, + }); + *per_group_count.entry(gid).or_default() += 1; + cold_cursor += 1; + } + } + + moves +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::{NodeInfo, NodeState}; + use std::net::SocketAddr; + + fn topo(nodes: &[u64]) -> ClusterTopology { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let a: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + t + } + + fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + cpu_utilization: 0.0, + } + } + + #[test] + fn empty_metrics_returns_empty_plan() { + let t = topo(&[1, 2]); + let r = RoutingTable::uniform(2, &[1, 2], 1); + let plan = compute_load_based_plan(&[], &r, &t, &RebalancerPlanConfig::default()); + assert!(plan.is_empty()); + } + + #[test] + fn single_node_returns_empty_plan() { + let t = topo(&[1]); + let r = RoutingTable::uniform(1, &[1], 1); + let plan = compute_load_based_plan( + &[lm(1, 100, 100, 100.0, 100.0)], + &r, + &t, + &RebalancerPlanConfig::default(), + ); + assert!(plan.is_empty()); + } + + #[test] + fn balanced_cluster_no_moves() { + let t = topo(&[1, 2, 3]); + let r = RoutingTable::uniform(3, &[1, 2, 3], 1); + let metrics = vec![ + lm(1, 10, 100, 50.0, 50.0), + lm(2, 10, 100, 50.0, 50.0), + lm(3, 10, 100, 50.0, 50.0), + ]; + let plan = compute_load_based_plan(&metrics, &r, &t, &RebalancerPlanConfig::default()); + assert!(plan.is_empty()); + } + + #[test] + fn 
imbalance_above_threshold_triggers_moves() { + let t = topo(&[1, 2, 3]); + let r = RoutingTable::uniform(6, &[1, 2, 3], 1); + // Node 1 massively overloaded. + let metrics = vec![ + lm(1, 200, 1000, 500.0, 500.0), + lm(2, 10, 50, 25.0, 25.0), + lm(3, 10, 50, 25.0, 25.0), + ]; + let plan = compute_load_based_plan(&metrics, &r, &t, &RebalancerPlanConfig::default()); + assert!(!plan.is_empty()); + // Every move must source from node 1. + for m in &plan { + assert_eq!(m.source_node, 1); + } + } + + #[test] + fn plan_respects_max_moves_total() { + let t = topo(&[1, 2]); + // 20 groups so node 1 can lead many. + let mut r = RoutingTable::uniform(20, &[1, 2], 1); + for gid in 0..20 { + r.set_leader(gid, 1); + } + let metrics = vec![lm(1, 2000, 10_000, 5000.0, 5000.0), lm(2, 1, 1, 1.0, 1.0)]; + let cfg = RebalancerPlanConfig { + max_moves_total: 4, + max_moves_per_group: 1, + ..Default::default() + }; + let plan = compute_load_based_plan(&metrics, &r, &t, &cfg); + assert_eq!(plan.len(), 4); + } + + #[test] + fn plan_respects_max_moves_per_group() { + let t = topo(&[1, 2]); + let mut r = RoutingTable::uniform(3, &[1, 2], 1); + for gid in 0..3 { + r.set_leader(gid, 1); + } + let metrics = vec![lm(1, 2000, 10_000, 5000.0, 5000.0), lm(2, 1, 1, 1.0, 1.0)]; + let cfg = RebalancerPlanConfig { + max_moves_total: 99, + max_moves_per_group: 1, + ..Default::default() + }; + let plan = compute_load_based_plan(&metrics, &r, &t, &cfg); + // With max_moves_per_group=1 and 3 groups, at most 3 moves. 
+ assert!(plan.len() <= 3); + let mut by_group: HashMap = HashMap::new(); + for m in &plan { + *by_group.entry(m.source_group).or_default() += 1; + } + for (_, count) in by_group { + assert!(count <= 1); + } + } + + #[test] + fn plan_is_deterministic() { + let t = topo(&[1, 2, 3]); + let mut r = RoutingTable::uniform(6, &[1, 2, 3], 1); + for gid in 0..6 { + r.set_leader(gid, 1); + } + let metrics = vec![ + lm(1, 500, 5000, 200.0, 200.0), + lm(2, 5, 5, 5.0, 5.0), + lm(3, 5, 5, 5.0, 5.0), + ]; + let cfg = RebalancerPlanConfig::default(); + let p1 = compute_load_based_plan(&metrics, &r, &t, &cfg); + let p2 = compute_load_based_plan(&metrics, &r, &t, &cfg); + let p1_tuples: Vec<_> = p1 + .iter() + .map(|m| (m.vshard_id, m.source_node, m.target_node, m.source_group)) + .collect(); + let p2_tuples: Vec<_> = p2 + .iter() + .map(|m| (m.vshard_id, m.source_node, m.target_node, m.source_group)) + .collect(); + assert_eq!(p1_tuples, p2_tuples); + } + + #[test] + fn idle_cluster_never_triggers() { + let t = topo(&[1, 2, 3]); + let r = RoutingTable::uniform(3, &[1, 2, 3], 1); + let metrics = vec![ + lm(1, 0, 0, 0.0, 0.0), + lm(2, 0, 0, 0.0, 0.0), + lm(3, 0, 0, 0.0, 0.0), + ]; + let plan = compute_load_based_plan(&metrics, &r, &t, &RebalancerPlanConfig::default()); + assert!(plan.is_empty()); + } + + #[test] + fn cold_node_must_be_active() { + // Node 3 is not Active (it's Draining) → cannot receive. 
+ let mut t = topo(&[1, 2, 3]); + t.set_state(3, NodeState::Draining); + let mut r = RoutingTable::uniform(2, &[1, 2, 3], 1); + r.set_leader(0, 1); + r.set_leader(1, 1); + let metrics = vec![ + lm(1, 500, 5000, 200.0, 200.0), + lm(2, 5, 5, 5.0, 5.0), + lm(3, 0, 0, 0.0, 0.0), + ]; + let plan = compute_load_based_plan(&metrics, &r, &t, &RebalancerPlanConfig::default()); + for m in &plan { + assert_ne!(m.target_node, 3, "Draining node must not receive moves"); + } + } +} diff --git a/nodedb-cluster/src/routing.rs b/nodedb-cluster/src/routing.rs index fdaaab83..1587bd37 100644 --- a/nodedb-cluster/src/routing.rs +++ b/nodedb-cluster/src/routing.rs @@ -155,6 +155,28 @@ impl RoutingTable { } } + /// Remove a node from a group's voter and learner lists. If the + /// removed node was the current leader hint, the hint is cleared + /// so the next query drives a fresh discovery. Returns `true` if + /// the group existed and anything was actually removed. + /// + /// The caller is responsible for safety: dropping below the + /// configured replication factor must be gated by + /// `decommission::safety::check_can_decommission`. + pub fn remove_group_member(&mut self, group_id: u64, node_id: u64) -> bool { + let Some(info) = self.group_members.get_mut(&group_id) else { + return false; + }; + let before_members = info.members.len(); + let before_learners = info.learners.len(); + info.members.retain(|&id| id != node_id); + info.learners.retain(|&id| id != node_id); + if info.leader == node_id { + info.leader = 0; + } + info.members.len() != before_members || info.learners.len() != before_learners + } + /// Update the learner list for a Raft group. 
pub fn set_group_learners(&mut self, group_id: u64, learners: Vec) { if let Some(info) = self.group_members.get_mut(&group_id) { @@ -274,6 +296,31 @@ mod tests { assert_eq!(rt.leader_for_vshard(0).unwrap(), 99); } + #[test] + fn remove_group_member_strips_voter_and_clears_leader() { + let mut rt = RoutingTable::uniform(2, &[1, 2, 3], 3); + rt.set_leader(0, 2); + assert!(rt.remove_group_member(0, 2)); + let info = rt.group_info(0).unwrap(); + assert!(!info.members.contains(&2)); + assert_eq!(info.leader, 0, "leader hint should be cleared"); + } + + #[test] + fn remove_group_member_strips_learner_only() { + let mut rt = RoutingTable::uniform(2, &[1, 2, 3], 3); + rt.add_group_learner(0, 9); + assert!(rt.remove_group_member(0, 9)); + let info = rt.group_info(0).unwrap(); + assert!(!info.learners.contains(&9)); + } + + #[test] + fn remove_group_member_unknown_group_returns_false() { + let mut rt = RoutingTable::uniform(1, &[1, 2], 2); + assert!(!rt.remove_group_member(99, 1)); + } + #[test] fn vshard_not_mapped() { let rt = RoutingTable::uniform(2, &[1, 2], 2); diff --git a/nodedb-cluster/src/routing_liveness.rs b/nodedb-cluster/src/routing_liveness.rs new file mode 100644 index 00000000..3e98a3c6 --- /dev/null +++ b/nodedb-cluster/src/routing_liveness.rs @@ -0,0 +1,182 @@ +//! Liveness-driven routing invalidation. +//! +//! [`RoutingLivenessHook`] is a [`MembershipSubscriber`] that clears +//! the leader hint for every Raft group whose leaseholder has just +//! been marked `Suspect`, `Dead`, or `Left` by the SWIM failure +//! detector. After the hook fires, the next query that consults the +//! routing table observes `leader == 0` (the "no leader known" +//! sentinel) and falls through to a fresh leader discovery via the +//! existing `NotLeader`-triggered election path. Clients see at most +//! one retry: the stale hint, the failed dispatch, and a refreshed +//! leader lookup. +//! +//! The hook is storage-agnostic: it holds `Arc>` +//! 
and a resolver closure that maps the string-keyed SWIM `NodeId` +//! to the numeric `u64` id used throughout the rest of the cluster +//! crate. Wiring layers (start_cluster, tests) supply the resolver +//! appropriate to their topology source. +//! +//! The hook is intentionally sync and cheap — a single `RwLock::write`, +//! a linear scan over group_members, and `set_leader(gid, 0)` for +//! each affected group. No I/O, no spawning. That keeps it safe to +//! call directly from the detector run loop. + +use std::sync::{Arc, RwLock}; + +use nodedb_types::NodeId; +use tracing::debug; + +use crate::routing::RoutingTable; +use crate::swim::MemberState; +use crate::swim::subscriber::MembershipSubscriber; + +/// Resolver mapping SWIM `NodeId` → numeric `u64` routing-table id. +/// +/// Returns `None` for members SWIM knows about but the routing table +/// does not (placeholder `seed:` entries before the first real +/// probe, transient learners, etc.). Those are silently ignored. +pub type NodeIdResolver = Arc Option + Send + Sync>; + +/// Clears the leader hint for every group led by a node that SWIM +/// has marked Suspect/Dead/Left. +pub struct RoutingLivenessHook { + routing: Arc>, + resolver: NodeIdResolver, +} + +impl RoutingLivenessHook { + pub fn new(routing: Arc>, resolver: NodeIdResolver) -> Self { + Self { routing, resolver } + } +} + +impl MembershipSubscriber for RoutingLivenessHook { + fn on_state_change(&self, node_id: &NodeId, _old: Option, new: MemberState) { + // Alive transitions are a no-op: the next query will refresh + // the leader hint naturally on NotLeader. We only invalidate + // when a leader has observably stopped being reachable. + if !matches!( + new, + MemberState::Suspect | MemberState::Dead | MemberState::Left + ) { + return; + } + + let Some(numeric_id) = (self.resolver)(node_id) else { + // SWIM knows about a node the routing table doesn't — a + // seed placeholder, a learner mid-join, or a node that + // was never registered. 
Nothing to invalidate. + return; + }; + + let mut rt = self.routing.write().unwrap_or_else(|p| p.into_inner()); + let affected: Vec = rt + .group_members() + .iter() + .filter(|(_, info)| info.leader == numeric_id) + .map(|(gid, _)| *gid) + .collect(); + for gid in &affected { + rt.set_leader(*gid, 0); + } + if !affected.is_empty() { + debug!( + ?node_id, + ?new, + numeric_id, + groups_invalidated = affected.len(), + "routing liveness hook cleared leader hints" + ); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn rt_with_leaders(pairs: &[(u64, u64)], rf: usize) -> Arc> { + // Build a routing table with `pairs.len()` groups where group + // `gid` has leader `leader`. Uses the uniform constructor to + // pick a membership, then overrides the leader. + let nodes: Vec = pairs.iter().map(|(_, l)| *l).collect(); + let mut rt = RoutingTable::uniform(pairs.len() as u64, &nodes, rf); + for (gid, leader) in pairs { + rt.set_leader(*gid, *leader); + } + Arc::new(RwLock::new(rt)) + } + + fn resolver_for(map: &'static [(&'static str, u64)]) -> NodeIdResolver { + Arc::new(move |nid: &NodeId| { + map.iter() + .find(|(s, _)| *s == nid.as_str()) + .map(|(_, n)| *n) + }) + } + + #[test] + fn dead_transition_clears_leader_for_owned_groups() { + let rt = rt_with_leaders(&[(0, 1), (1, 2), (2, 1), (3, 3)], 1); + let hook = + RoutingLivenessHook::new(rt.clone(), resolver_for(&[("a", 1), ("b", 2), ("c", 3)])); + + hook.on_state_change( + &NodeId::new("a"), + Some(MemberState::Alive), + MemberState::Dead, + ); + + let guard = rt.read().unwrap(); + assert_eq!(guard.group_info(0).unwrap().leader, 0); + assert_eq!(guard.group_info(1).unwrap().leader, 2); + assert_eq!(guard.group_info(2).unwrap().leader, 0); + assert_eq!(guard.group_info(3).unwrap().leader, 3); + } + + #[test] + fn suspect_transition_also_invalidates() { + let rt = rt_with_leaders(&[(0, 7)], 1); + let hook = RoutingLivenessHook::new(rt.clone(), resolver_for(&[("x", 7)])); + hook.on_state_change( + 
&NodeId::new("x"), + Some(MemberState::Alive), + MemberState::Suspect, + ); + assert_eq!(rt.read().unwrap().group_info(0).unwrap().leader, 0); + } + + #[test] + fn alive_transition_is_noop() { + let rt = rt_with_leaders(&[(0, 5)], 1); + let hook = RoutingLivenessHook::new(rt.clone(), resolver_for(&[("q", 5)])); + hook.on_state_change(&NodeId::new("q"), None, MemberState::Alive); + assert_eq!(rt.read().unwrap().group_info(0).unwrap().leader, 5); + } + + #[test] + fn unresolved_node_id_is_ignored() { + let rt = rt_with_leaders(&[(0, 1)], 1); + let hook = RoutingLivenessHook::new(rt.clone(), resolver_for(&[("a", 1)])); + // NodeId "seed:127.0.0.1:9000" is not in the resolver map. + hook.on_state_change( + &NodeId::new("seed:127.0.0.1:9000"), + Some(MemberState::Alive), + MemberState::Dead, + ); + // Leader untouched because the resolver returned None. + assert_eq!(rt.read().unwrap().group_info(0).unwrap().leader, 1); + } + + #[test] + fn left_is_also_invalidating() { + let rt = rt_with_leaders(&[(0, 2)], 1); + let hook = RoutingLivenessHook::new(rt.clone(), resolver_for(&[("b", 2)])); + hook.on_state_change( + &NodeId::new("b"), + Some(MemberState::Alive), + MemberState::Left, + ); + assert_eq!(rt.read().unwrap().group_info(0).unwrap().leader, 0); + } +} diff --git a/nodedb-cluster/src/swim/bootstrap.rs b/nodedb-cluster/src/swim/bootstrap.rs index 943190d1..739e6ab1 100644 --- a/nodedb-cluster/src/swim/bootstrap.rs +++ b/nodedb-cluster/src/swim/bootstrap.rs @@ -29,6 +29,7 @@ use super::incarnation::Incarnation; use super::member::MemberState; use super::member::record::MemberUpdate; use super::membership::MembershipList; +use super::subscriber::MembershipSubscriber; /// Owns a running SWIM detector and its shutdown plumbing. 
/// @@ -88,6 +89,20 @@ pub async fn spawn( local_addr: SocketAddr, seeds: Vec, transport: Arc, +) -> Result { + spawn_with_subscribers(cfg, local_id, local_addr, seeds, transport, Vec::new()).await +} + +/// Same as [`spawn`] but installs the given [`MembershipSubscriber`]s +/// on the detector before its run loop starts, so every state +/// transition is observed from the very first probe round. +pub async fn spawn_with_subscribers( + cfg: SwimConfig, + local_id: NodeId, + local_addr: SocketAddr, + seeds: Vec, + transport: Arc, + subscribers: Vec>, ) -> Result { cfg.validate()?; @@ -112,11 +127,12 @@ pub async fn spawn( } let initial_inc = cfg.initial_incarnation; - let detector = Arc::new(FailureDetector::new( + let detector = Arc::new(FailureDetector::with_subscribers( cfg, Arc::clone(&membership), transport, ProbeScheduler::new(), + subscribers, )); // Prime the dissemination queue with our own Alive record so the diff --git a/nodedb-cluster/src/swim/detector/mod.rs b/nodedb-cluster/src/swim/detector/mod.rs index 829d285a..5dccfc7b 100644 --- a/nodedb-cluster/src/swim/detector/mod.rs +++ b/nodedb-cluster/src/swim/detector/mod.rs @@ -4,8 +4,8 @@ //! probe scheduler, the suspicion timer, and the main `tokio::select!` //! loop. All actual networking is pushed behind the [`Transport`] trait //! so unit tests can run fully in-process against [`InMemoryTransport`] -//! and the real UDP transport in E-ε can slot in without touching the -//! detector logic. +//! while production uses [`UdpTransport`] — both slot into the same +//! detector without touching its logic. 
pub mod probe_round; pub mod runner; diff --git a/nodedb-cluster/src/swim/detector/probe_round.rs b/nodedb-cluster/src/swim/detector/probe_round.rs index 882a6a78..02fd0b99 100644 --- a/nodedb-cluster/src/swim/detector/probe_round.rs +++ b/nodedb-cluster/src/swim/detector/probe_round.rs @@ -413,13 +413,16 @@ mod tests { ); } - #[tokio::test(start_paused = true)] + #[tokio::test] async fn indirect_ack_saves_target() { + // No `start_paused` — paused time auto-advances timeouts + // before polling channel-woken tasks, making the indirect + // path race the timeout. With real time, the 40ms probe + // timeout is ample for the in-memory fabric (sub-µs delivery). let fab = TransportFabric::new(); - let local = Arc::new(fab.bind(addr(7000)).await) as Arc; - // Target bound but silent on the direct channel. + let local = Arc::new(fab.bind(addr(7000)).await); let _silent = fab.bind(addr(7001)).await; - let helper = fab.bind(addr(7002)).await; + let helper = Arc::new(fab.bind(addr(7002)).await); let list = membership_with_peers( "local", 7000, @@ -432,38 +435,74 @@ mod tests { let mut sched = ProbeScheduler::with_seed(1); let inflight = Arc::new(InflightProbes::new()); - // Helper task: forwards any PingReq it sees into an Ack via the - // inflight registry. Paused-runtime auto-advance drives the - // direct-ping timeout on the main task. - let inflight_helper = Arc::clone(&inflight); + // Helper task: respond to Ping (direct probe) with Ack and to + // PingReq (indirect probe) with a forwarded Ack — mirrors the + // production runner recv-loop + handle_ping_req path. The + // scheduler may pick n2 as direct target or as indirect helper + // depending on the shuffle seed, so both must be handled. 
+ let helper_t: Arc = helper.clone(); let responder = tokio::spawn(async move { loop { - let (_from, msg) = match helper.recv().await { + let (from, msg) = match helper_t.recv().await { Ok(v) => v, Err(_) => return, }; - if let SwimMessage::PingReq(req) = msg { - inflight_helper - .resolve( - req.probe_id, - SwimMessage::Ack(Ack { - probe_id: req.probe_id, - from: req.target.clone(), - incarnation: Incarnation::new(9), - piggyback: vec![], - }), - ) - .await; - return; + match msg { + SwimMessage::Ping(ping) => { + let _ = helper_t + .send( + from, + SwimMessage::Ack(Ack { + probe_id: ping.probe_id, + from: NodeId::new("n2"), + incarnation: Incarnation::new(9), + piggyback: vec![], + }), + ) + .await; + return; + } + SwimMessage::PingReq(req) => { + let _ = helper_t + .send( + from, + SwimMessage::Ack(Ack { + probe_id: req.probe_id, + from: req.target.clone(), + incarnation: Incarnation::new(9), + piggyback: vec![], + }), + ) + .await; + return; + } + _ => {} } } }); + // Recv-loop on the local endpoint: resolves inflight probes + // when Acks arrive — mirrors the production runner recv-loop. + let recv_t: Arc = local.clone(); + let recv_inflight = Arc::clone(&inflight); + let recv_loop = tokio::spawn(async move { + loop { + let (_from, msg) = match recv_t.recv().await { + Ok(v) => v, + Err(_) => return, + }; + if let SwimMessage::Ack(ref ack) = msg { + recv_inflight.resolve(ack.probe_id, msg).await; + } + } + }); + + let local_dyn: Arc = local.clone(); let dissemination = Arc::new(DisseminationQueue::new()); let outcome = ProbeRound { scheduler: &mut sched, membership: &list, - transport: &local, + transport: &local_dyn, inflight: &inflight, dissemination: &dissemination, probe_timeout: cfg().probe_timeout, @@ -476,9 +515,8 @@ mod tests { .execute() .await .expect("run"); - let _ = responder.await; - // Either direct (unlikely — n1 is silent) or indirect ack via n2. - // Whichever path fires, the outcome must be Acked. 
+ responder.abort(); + recv_loop.abort(); assert!(matches!(outcome, ProbeOutcome::Acked { .. })); } diff --git a/nodedb-cluster/src/swim/detector/runner.rs b/nodedb-cluster/src/swim/detector/runner.rs index 23997583..ef16bb2b 100644 --- a/nodedb-cluster/src/swim/detector/runner.rs +++ b/nodedb-cluster/src/swim/detector/runner.rs @@ -19,6 +19,7 @@ use crate::swim::incarnation::Incarnation; use crate::swim::member::MemberState; use crate::swim::member::record::MemberUpdate; use crate::swim::membership::{MembershipList, MergeOutcome}; +use crate::swim::subscriber::MembershipSubscriber; use crate::swim::wire::{Ack, Ping, PingReq, ProbeId, SwimMessage}; use super::probe_round::{InflightProbes, ProbeOutcome, ProbeRound}; @@ -41,6 +42,7 @@ pub struct FailureDetector { dissemination: Arc, probe_counter: AtomicU64, local_incarnation: Mutex, + subscribers: Vec>, } impl FailureDetector { @@ -51,6 +53,18 @@ impl FailureDetector { membership: Arc, transport: Arc, scheduler: ProbeScheduler, + ) -> Self { + Self::with_subscribers(cfg, membership, transport, scheduler, Vec::new()) + } + + /// Construct with a list of [`MembershipSubscriber`]s that will be + /// notified on every member state transition. + pub fn with_subscribers( + cfg: SwimConfig, + membership: Arc, + transport: Arc, + scheduler: ProbeScheduler, + subscribers: Vec>, ) -> Self { let initial_inc = cfg.initial_incarnation; Self { @@ -63,7 +77,30 @@ impl FailureDetector { dissemination: Arc::new(DisseminationQueue::new()), probe_counter: AtomicU64::new(0), local_incarnation: Mutex::new(initial_inc), + subscribers, + } + } + + /// Apply an update via [`apply_and_disseminate`] while notifying + /// every subscriber of any resulting state transition. Returns the + /// raw [`MergeOutcome`] so callers can still react to + /// `SelfRefute` etc. 
+ fn apply_and_notify(&self, update: &MemberUpdate) -> MergeOutcome { + let old_state = self.membership.get(&update.node_id).map(|m| m.state); + let outcome = apply_and_disseminate(&self.membership, &self.dissemination, update); + if self.subscribers.is_empty() { + return outcome; + } + let new_state = match self.membership.get(&update.node_id) { + Some(m) => m.state, + None => return outcome, + }; + if old_state != Some(new_state) { + for sub in &self.subscribers { + sub.on_state_change(&update.node_id, old_state, new_state); + } } + outcome } /// Shared reference to the dissemination queue. Tests use it to @@ -78,7 +115,7 @@ impl FailureDetector { /// local incarnation so subsequent probes advertise the new value. async fn ingest_piggyback(&self, piggyback: &[MemberUpdate]) { for update in piggyback { - let outcome = apply_and_disseminate(&self.membership, &self.dissemination, update); + let outcome = self.apply_and_notify(update); if let MergeOutcome::SelfRefute { new_incarnation } = outcome { let mut guard = self.local_incarnation.lock().await; if new_incarnation > *guard { @@ -140,7 +177,7 @@ impl FailureDetector { state: MemberState::Dead, incarnation: member.incarnation, }; - apply_and_disseminate(&self.membership, &self.dissemination, &dead_update); + self.apply_and_notify(&dead_update); } } @@ -174,7 +211,7 @@ impl FailureDetector { state: MemberState::Suspect, incarnation: member.incarnation, }; - apply_and_disseminate(&self.membership, &self.dissemination, &suspect_update); + self.apply_and_notify(&suspect_update); let cluster_size = self.membership.len(); self.suspicion.lock().await.arm( target, @@ -282,9 +319,9 @@ impl FailureDetector { } /// Refute a self-suspect rumour by bumping local incarnation and - /// rebroadcasting `Alive`. E-γ exposes the handle so tests can - /// assert the behaviour; the dissemination queue in E-δ will call - /// this automatically from the piggyback ingestor. + /// rebroadcasting `Alive`. 
Exposed for tests that assert the + /// refutation machinery directly; the piggyback ingestor calls + /// the same underlying path automatically in production. #[cfg(test)] pub async fn bump_local_incarnation(&self, past: Incarnation) -> Incarnation { let mut guard = self.local_incarnation.lock().await; diff --git a/nodedb-cluster/src/swim/member/record.rs b/nodedb-cluster/src/swim/member/record.rs index 22bde368..8aa98653 100644 --- a/nodedb-cluster/src/swim/member/record.rs +++ b/nodedb-cluster/src/swim/member/record.rs @@ -55,7 +55,7 @@ impl Member { } /// Serializable subset of a `Member` — everything except the monotonic -/// instant. E-β will use this as the wire payload for membership deltas. +/// instant. Used as the wire payload for membership deltas. #[derive( Debug, Clone, diff --git a/nodedb-cluster/src/swim/membership/list.rs b/nodedb-cluster/src/swim/membership/list.rs index be2d975a..e2049625 100644 --- a/nodedb-cluster/src/swim/membership/list.rs +++ b/nodedb-cluster/src/swim/membership/list.rs @@ -117,7 +117,7 @@ impl MembershipList { } /// Apply a rumour to the table. Returns the merge outcome so the caller - /// can drive the dissemination queue (E-δ). On `SelfRefute`, the local + /// can drive the dissemination queue. On `SelfRefute`, the local /// record is updated in place to carry the bumped incarnation before /// returning, so the caller only needs to gossip the new record. pub fn apply(&self, update: &MemberUpdate) -> MergeOutcome { diff --git a/nodedb-cluster/src/swim/mod.rs b/nodedb-cluster/src/swim/mod.rs index 2500e5f7..0fa706e6 100644 --- a/nodedb-cluster/src/swim/mod.rs +++ b/nodedb-cluster/src/swim/mod.rs @@ -6,19 +6,16 @@ //! incarnation refutation, dedicated acks) used by modern systems such as //! Hashicorp memberlist and Cassandra's gossiper. //! -//! ## Layer map (Phase E) +//! ## Layer map //! -//! | Sub-batch | Contents | -//! |-----------|------------------------------------------------------------| -//! 
| **E-α** | Core types — `config`, `error`, `incarnation`, `member`, `membership` (this file's children) | -//! | E-β | Wire messages (`Ping`/`PingReq`/`Ack`/`Nack`) + zerompk codec | -//! | E-γ | Failure detector loop over an injected transport trait | -//! | E-δ | Piggyback dissemination queue + convergence tests | -//! | E-ε | Real UDP transport, bootstrap seeding, cluster integration | -//! -//! E-α is deliberately side-effect-free: no tasks, no I/O, no wire formats. -//! It exposes the pure data model — member states, incarnation numbers, and -//! the state-merge rule — that every later sub-batch builds on. +//! - `config`, `error`, `incarnation`, `member`, `membership` — pure +//! data model: states, incarnation numbers, and the merge rule. +//! - `wire` — `Ping` / `PingReq` / `Ack` / `Nack` datagrams + codec. +//! - `detector` — failure detector loop over a pluggable transport +//! trait, scheduler, suspicion timer, probe round machinery. +//! - `dissemination` — piggyback queue with `lambda * log(n)` fanout. +//! - `bootstrap` — one-stop `spawn` entry point. +//! - `subscriber` — hook trait fired on every membership transition. pub mod bootstrap; pub mod config; @@ -28,6 +25,7 @@ pub mod error; pub mod incarnation; pub mod member; pub mod membership; +pub mod subscriber; pub mod wire; pub use bootstrap::{SwimHandle, spawn}; @@ -40,4 +38,5 @@ pub use error::SwimError; pub use incarnation::Incarnation; pub use member::{Member, MemberState}; pub use membership::{MembershipList, MembershipSnapshot, merge_update}; +pub use subscriber::MembershipSubscriber; pub use wire::{Ack, Nack, NackReason, Ping, PingReq, ProbeId, SwimMessage}; diff --git a/nodedb-cluster/src/swim/subscriber.rs b/nodedb-cluster/src/swim/subscriber.rs new file mode 100644 index 00000000..e7a20746 --- /dev/null +++ b/nodedb-cluster/src/swim/subscriber.rs @@ -0,0 +1,30 @@ +//! `MembershipSubscriber` — hook fired whenever SWIM observes a +//! member state transition. +//! +//! 
The failure detector invokes every registered subscriber *after* +//! applying an update to the [`MembershipList`](super::membership::MembershipList) +//! and dissemination queue, so subscribers see the post-merge view. +//! +//! Subscribers are synchronous and must not block — they typically do +//! cheap in-memory bookkeeping (e.g. clearing a routing leader hint). +//! Heavier work belongs on a dedicated task the subscriber spawns +//! itself. +//! +//! ## Lifecycle +//! +//! - `old = None` means "first time we've seen this node" (insert). +//! - `old = Some(state)` means the member existed and transitioned to +//! a strictly different `new` state. The detector never calls the +//! hook for no-op reapplies. +//! - `Left` is terminal — after it fires once the member is gone. + +use nodedb_types::NodeId; + +use super::member::MemberState; + +/// Hook trait for observers that react to SWIM membership changes. +pub trait MembershipSubscriber: Send + Sync { + /// Called after the membership list has accepted a state change + /// for `node_id`. `old` is `None` on first-time insert. + fn on_state_change(&self, node_id: &NodeId, old: Option, new: MemberState); +} diff --git a/nodedb-cluster/src/swim/wire/message.rs b/nodedb-cluster/src/swim/wire/message.rs index da884b96..56d16636 100644 --- a/nodedb-cluster/src/swim/wire/message.rs +++ b/nodedb-cluster/src/swim/wire/message.rs @@ -31,7 +31,7 @@ pub enum SwimMessage { impl SwimMessage { /// Mutable borrow of the piggyback slot, independent of variant. - /// Used by the dissemination queue (E-δ) to stamp outgoing deltas + /// Used by the dissemination queue to stamp outgoing deltas /// without caring which message type it is stamping onto. pub fn piggyback_mut(&mut self) -> &mut Vec { match self { @@ -53,7 +53,7 @@ impl SwimMessage { } /// Drop piggyback entries beyond `max`. 
Used before encoding to keep - /// a datagram below the UDP MTU — the dissemination queue (E-δ) will + /// a datagram below the UDP MTU — the dissemination queue will /// decide which updates are highest-priority; this helper just /// enforces the upper bound. pub fn truncate_piggyback(&mut self, max: usize) { diff --git a/nodedb-cluster/src/swim/wire/probe.rs b/nodedb-cluster/src/swim/wire/probe.rs index 3a115019..d17b0373 100644 --- a/nodedb-cluster/src/swim/wire/probe.rs +++ b/nodedb-cluster/src/swim/wire/probe.rs @@ -1,9 +1,8 @@ //! SWIM probe message structs. //! -//! These are the four datagram types the failure detector exchanges over -//! the network once E-ε wires in a transport. They are pure data types -//! with `serde` derives — no I/O, no validation beyond what the type -//! system enforces. +//! These are the four datagram types the failure detector exchanges +//! over the network. They are pure data types with `serde` derives — +//! no I/O, no validation beyond what the type system enforces. //! //! ## Message flow (reference) //! @@ -21,9 +20,7 @@ //! ``` //! //! Every message carries a bounded `piggyback: Vec` slot -//! used for gossip-style dissemination of membership deltas (E-δ). The -//! wire format reserves the slot now so later sub-batches don't need a -//! compatibility break. +//! used for gossip-style dissemination of membership deltas. use nodedb_types::NodeId; use serde::{Deserialize, Serialize}; diff --git a/nodedb-cluster/src/transport/client.rs b/nodedb-cluster/src/transport/client.rs index 71a7d5fd..3fbf3e15 100644 --- a/nodedb-cluster/src/transport/client.rs +++ b/nodedb-cluster/src/transport/client.rs @@ -245,15 +245,23 @@ impl NexarTransport { .ok_or(ClusterError::NodeUnreachable { node_id: target })? }; - // Connect. - let conn = self + // Connect — bounded by rpc_timeout so a hung QUIC handshake + // (peer not yet serving) doesn't block for the full 30s idle timeout. 
+ let connecting = self .listener .endpoint() .connect_with(self.client_config.clone(), addr, SNI_HOSTNAME) .map_err(|e| ClusterError::Transport { detail: format!("connect to node {target} at {addr}: {e}"), - })? + })?; + let conn = tokio::time::timeout(self.rpc_timeout, connecting) .await + .map_err(|_| ClusterError::Transport { + detail: format!( + "handshake timeout ({}ms) with node {target} at {addr}", + self.rpc_timeout.as_millis() + ), + })? .map_err(|e| ClusterError::Transport { detail: format!("handshake with node {target} at {addr}: {e}"), })?; diff --git a/nodedb-cluster/tests/common/mod.rs b/nodedb-cluster/tests/common/mod.rs index b2c1d137..341682e6 100644 --- a/nodedb-cluster/tests/common/mod.rs +++ b/nodedb-cluster/tests/common/mod.rs @@ -187,6 +187,8 @@ impl TestNode { max_backoff_secs: 2, }, swim_udp_addr: None, + election_timeout_min: std::time::Duration::from_millis(150), + election_timeout_max: std::time::Duration::from_millis(300), }; let lifecycle = ClusterLifecycleTracker::new(); @@ -292,9 +294,9 @@ impl TestNode { } /// Number of committed `CatalogDdl` entries observed by this - /// node's cache applier. After batch 1e the cluster crate - /// treats catalog DDL payloads as opaque — this counter is - /// what tests assert on for replication correctness. + /// node's cache applier. The cluster crate treats catalog DDL + /// payloads as opaque — this counter is what tests assert on + /// for replication correctness. pub fn catalog_entries_applied(&self) -> u64 { self.metadata_cache .read() diff --git a/nodedb-cluster/tests/decommission_flow.rs b/nodedb-cluster/tests/decommission_flow.rs new file mode 100644 index 00000000..ef317a9d --- /dev/null +++ b/nodedb-cluster/tests/decommission_flow.rs @@ -0,0 +1,153 @@ +//! End-to-end decommission flow. +//! +//! Wires every piece of the decommission subsystem together without +//! standing up a real metadata Raft group: +//! +//! - `CacheApplier::with_live_state` holds shared topology + routing. +//! 
- A direct in-memory `MetadataProposer` encodes each proposed +//! entry, feeds it straight into the applier with a synthetic +//! monotonically-increasing index, and returns the index — i.e. a +//! "propose and wait for commit" that is instantaneous. +//! - `DecommissionCoordinator` walks a `plan_full_decommission` +//! output through that proposer. +//! - `DecommissionObserver` watches the local topology for the +//! target's state transition and fires its shutdown watch. +//! +//! The real metadata Raft path is already exercised by +//! `metadata_replication.rs`; this test focuses on the decommission +//! state machine end to end: plan → propose → apply → live state +//! → observer signal. + +use std::net::SocketAddr; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex, RwLock}; +use std::time::Duration; + +use async_trait::async_trait; + +use nodedb_cluster::decommission::{ + DecommissionCoordinator, DecommissionObserver, MetadataProposer, plan_full_decommission, +}; +use nodedb_cluster::error::Result; +use nodedb_cluster::metadata_group::{CacheApplier, MetadataApplier, MetadataCache, encode_entry}; +use nodedb_cluster::routing::RoutingTable; +use nodedb_cluster::topology::{ClusterTopology, NodeInfo, NodeState}; +use nodedb_cluster::{DecommissionRunResult, MetadataEntry}; + +/// In-memory proposer that encodes every entry and immediately feeds +/// it through an attached `CacheApplier`, returning a synthetic +/// monotonically-increasing index. This is the "one-node metadata +/// group" equivalent the test uses to drive the decommission +/// state machine end to end in a few hundred microseconds. 
+struct DirectProposer { + applier: Arc, + next_index: AtomicU64, + proposed: Mutex>, +} + +impl DirectProposer { + fn new(applier: Arc) -> Arc { + Arc::new(Self { + applier, + next_index: AtomicU64::new(1), + proposed: Mutex::new(Vec::new()), + }) + } +} + +#[async_trait] +impl MetadataProposer for DirectProposer { + async fn propose_and_wait(&self, entry: MetadataEntry) -> Result { + let idx = self.next_index.fetch_add(1, Ordering::SeqCst); + let bytes = encode_entry(&entry).expect("encode metadata entry"); + self.applier.apply(&[(idx, bytes)]); + self.proposed.lock().unwrap().push(entry); + Ok(idx) + } +} + +#[tokio::test] +async fn end_to_end_decommission_drains_node_and_signals_shutdown() { + // --- 3 active nodes, 4 groups, RF=3. Decommission node 3 + // while RF=2 is the surviving quorum target. + let mut topo = ClusterTopology::new(); + for (i, id) in [1u64, 2, 3].iter().enumerate() { + let a: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + topo.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + let topology = Arc::new(RwLock::new(topo)); + let mut rt = RoutingTable::uniform(4, &[1, 2, 3], 3); + // Make node 3 the leader of at least one group so the plan + // emits a LeadershipTransfer entry and the applier must handle + // it live. + rt.set_leader(0, 3); + rt.set_leader(1, 1); + rt.set_leader(2, 3); + rt.set_leader(3, 2); + let routing = Arc::new(RwLock::new(rt)); + + // --- Applier with live topology + routing cascading. + let cache = Arc::new(RwLock::new(MetadataCache::new())); + let applier = Arc::new( + CacheApplier::new(cache.clone()).with_live_state(topology.clone(), routing.clone()), + ); + let proposer = DirectProposer::new(applier.clone()); + + // --- Observer running on node 3 (the target). + let (observer, mut shutdown_rx) = + DecommissionObserver::new(topology.clone(), 3, Duration::from_millis(10)); + + // --- Build the plan from a snapshot of the live state. 
+ let plan = { + let t = topology.read().unwrap(); + let r = routing.read().unwrap(); + plan_full_decommission(3, &t, &r, 2).expect("plan") + }; + let plan_len = plan.entries.len(); + + // --- Drive the coordinator. + let coordinator = DecommissionCoordinator::new(plan, proposer.clone()); + let result: DecommissionRunResult = coordinator.run().await.expect("coordinator run"); + assert_eq!(result.node_id, 3); + assert_eq!(result.entries_committed, plan_len); + + // --- Assert live state now reflects the decommission outcome. + // + // Topology: node 3 is gone (final `Leave` entry removed it). + { + let t = topology.read().unwrap(); + assert!( + t.get_node(3).is_none(), + "node 3 should be removed from topology after Leave" + ); + // Node 1 and 2 still present and unchanged. + assert_eq!(t.get_node(1).unwrap().state, NodeState::Active); + assert_eq!(t.get_node(2).unwrap().state, NodeState::Active); + } + + // Routing: node 3 is no longer in any group's member set, and + // the groups it used to lead have had their leader hints + // updated via LeadershipTransfer. + { + let r = routing.read().unwrap(); + for (gid, info) in r.group_members() { + assert!( + !info.members.contains(&3), + "group {gid} still contains node 3 after decommission" + ); + assert!( + !info.learners.contains(&3), + "group {gid} still has node 3 as learner after decommission" + ); + } + // Group 0 was led by 3 → LeadershipTransfer emitted a new + // non-3 leader; group 2 likewise. + assert_ne!(r.group_info(0).unwrap().leader, 3); + assert_ne!(r.group_info(2).unwrap().leader, 3); + } + + // --- Observer must now fire its shutdown signal on the very + // next check — the topology change already landed. 
+ assert!(observer.check_once()); + assert!(*shutdown_rx.borrow_and_update()); +} diff --git a/nodedb-cluster/tests/elastic_scaling.rs b/nodedb-cluster/tests/elastic_scaling.rs new file mode 100644 index 00000000..4574c721 --- /dev/null +++ b/nodedb-cluster/tests/elastic_scaling.rs @@ -0,0 +1,169 @@ +//! Elastic add/remove — proves the end-to-end path from membership +//! change to rebalancer dispatch. +//! +//! - **Add-node**: 3 balanced nodes, 4th node joins with zero load → +//! kick fires → sweep dispatches moves to the new node. +//! - **Remove-node**: covered by `decommission_flow.rs` — the +//! decommission plan strips the node from all groups, and the +//! rebalancer loop naturally re-evaluates on its next tick. + +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex, RwLock}; +use std::time::Duration; + +use async_trait::async_trait; + +use nodedb_cluster::error::Result; +use nodedb_cluster::rebalance::PlannedMove; +use nodedb_cluster::rebalancer::{ + AlwaysReadyGate, ElectionGate, LoadMetrics, LoadMetricsProvider, MigrationDispatcher, + RebalancerKickHook, RebalancerLoop, RebalancerLoopConfig, +}; +use nodedb_cluster::routing::RoutingTable; +use nodedb_cluster::swim::MemberState; +use nodedb_cluster::swim::subscriber::MembershipSubscriber; +use nodedb_cluster::topology::{ClusterTopology, NodeInfo, NodeState}; +use nodedb_types::NodeId; + +struct DynamicProvider { + metrics: Mutex>, +} + +impl DynamicProvider { + fn new(initial: Vec) -> Arc { + Arc::new(Self { + metrics: Mutex::new(initial), + }) + } + fn push(&self, m: LoadMetrics) { + self.metrics.lock().unwrap().push(m); + } +} + +#[async_trait] +impl LoadMetricsProvider for DynamicProvider { + async fn snapshot(&self) -> Result> { + Ok(self.metrics.lock().unwrap().clone()) + } +} + +struct RecordingDispatcher { + calls: Mutex>, + fired: AtomicBool, +} + +impl RecordingDispatcher { + fn new() -> Arc { + Arc::new(Self { + calls: Mutex::new(Vec::new()), + fired: 
AtomicBool::new(false), + }) + } +} + +#[async_trait] +impl MigrationDispatcher for RecordingDispatcher { + async fn dispatch(&self, mv: PlannedMove) -> Result<()> { + self.calls.lock().unwrap().push(mv); + self.fired.store(true, Ordering::SeqCst); + Ok(()) + } +} + +fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + cpu_utilization: 0.0, + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn add_node_triggers_rebalance_via_kick() { + // --- Initial state: 3 balanced nodes, 6 groups. + let mut topo = ClusterTopology::new(); + for (i, id) in [1u64, 2, 3].iter().enumerate() { + let a: std::net::SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + topo.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + let topology = Arc::new(RwLock::new(topo)); + let mut rt = RoutingTable::uniform(6, &[1, 2, 3], 1); + // Node 1 leads all 6 groups → hot. + for gid in 0..6 { + rt.set_leader(gid, 1); + } + let routing = Arc::new(RwLock::new(rt)); + + // Metrics: node 1 hot, 2 and 3 moderate. + let provider = DynamicProvider::new(vec![ + lm(1, 200, 2000, 200.0, 200.0), + lm(2, 50, 500, 50.0, 50.0), + lm(3, 50, 500, 50.0, 50.0), + ]); + + let dispatcher = RecordingDispatcher::new(); + let gate: Arc = Arc::new(AlwaysReadyGate); + + // Use a long interval so the normal tick doesn't fire before the + // kick does — the kick is the signal we're testing. + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig { + interval: Duration::from_secs(300), + ..Default::default() + }, + provider.clone() as Arc, + dispatcher.clone() as Arc, + gate, + routing.clone(), + topology.clone(), + )); + + // Wire the kick hook. 
+ let kick_hook = RebalancerKickHook::new(rloop.kick_handle()); + + let (shutdown_tx, shutdown_rx) = tokio::sync::watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&rloop); + async move { d.run(shutdown_rx).await } + }); + + // --- Simulate node 4 joining. + { + let mut t = topology.write().unwrap(); + let a: std::net::SocketAddr = "127.0.0.1:9003".parse().unwrap(); + t.add_node(NodeInfo::new(4, a, NodeState::Active)); + } + // Add node 4's zero-load metrics so the planner sees it as cold. + provider.push(lm(4, 0, 0, 0.0, 0.0)); + + // Fire the SWIM membership hook — this should kick the loop. + kick_hook.on_state_change(&NodeId::new("node-4"), None, MemberState::Alive); + + // Wait for the dispatcher to fire. + let deadline = std::time::Instant::now() + Duration::from_secs(3); + while std::time::Instant::now() < deadline { + if dispatcher.fired.load(Ordering::SeqCst) { + break; + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + assert!( + dispatcher.fired.load(Ordering::SeqCst), + "kick did not trigger a rebalancer dispatch" + ); + + // At least one move should target node 4 (the cold newcomer). + let calls = dispatcher.calls.lock().unwrap().clone(); + assert!(!calls.is_empty()); + let to_4 = calls.iter().filter(|m| m.target_node == 4).count(); + assert!( + to_4 > 0, + "expected at least one move targeting node 4, got {to_4}" + ); + + let _ = shutdown_tx.send(true); + let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; +} diff --git a/nodedb-cluster/tests/metadata_replication.rs b/nodedb-cluster/tests/metadata_replication.rs index 6ed9c58f..dd0133ee 100644 --- a/nodedb-cluster/tests/metadata_replication.rs +++ b/nodedb-cluster/tests/metadata_replication.rs @@ -1,8 +1,8 @@ //! Integration test: replicated metadata group commits + cache apply. //! -//! After batch 1e the `nodedb-cluster` crate no longer understands -//! per-DDL-object descriptor shapes — `CatalogDdl { payload }` is -//! opaque here. 
This test verifies the cluster-side plumbing +//! The `nodedb-cluster` crate does not understand per-DDL-object +//! descriptor shapes — `CatalogDdl { payload }` is opaque here. +//! This test verifies the cluster-side plumbing //! (raft commit + metadata applier dispatch + cache watermark) //! using synthetic opaque payloads. End-to-end cross-node DDL //! visibility (applier decoding + redb writeback + pgwire visibility) diff --git a/nodedb-cluster/tests/reachability_loop.rs b/nodedb-cluster/tests/reachability_loop.rs new file mode 100644 index 00000000..9e0ab007 --- /dev/null +++ b/nodedb-cluster/tests/reachability_loop.rs @@ -0,0 +1,142 @@ +//! Reachability loop closes the circuit-breaker blind spot. +//! +//! Scenario: peer 42 starts out unreachable so its breaker opens. +//! After a few seconds the peer "recovers" (the mock prober flips +//! from Err to Ok). The reachability driver must observe the next +//! sweep as a success and drive the breaker back to `Closed` without +//! any user traffic. + +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::{Duration, Instant}; + +use async_trait::async_trait; + +use nodedb_cluster::circuit_breaker::{CircuitBreaker, CircuitBreakerConfig, CircuitState}; +use nodedb_cluster::error::{ClusterError, Result}; +use nodedb_cluster::reachability::{ + ReachabilityDriver, ReachabilityDriverConfig, ReachabilityProber, +}; +use tokio::sync::watch; + +/// Mock prober whose success/failure can be flipped at runtime by +/// the test. Every probe call increments a hit counter so the test +/// can prove the sweep actually ran. 
+struct Flappy { + healthy: AtomicBool, +} + +impl Flappy { + fn new() -> Arc { + Arc::new(Self { + healthy: AtomicBool::new(false), + }) + } + fn heal(&self) { + self.healthy.store(true, Ordering::SeqCst); + } +} + +#[async_trait] +impl ReachabilityProber for Flappy { + async fn probe(&self, peer: u64) -> Result<()> { + if self.healthy.load(Ordering::SeqCst) { + Ok(()) + } else { + Err(ClusterError::Transport { + detail: format!("mock: peer {peer} unreachable"), + }) + } + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn reachability_loop_recovers_open_breaker_without_user_traffic() { + // --- Shared breaker, opened immediately for peer 42. --- + let breaker = Arc::new(CircuitBreaker::new(CircuitBreakerConfig { + failure_threshold: 1, + // Short cooldown so HalfOpen is eligible quickly — the + // driver still needs to drive the actual transition. + cooldown: Duration::from_millis(100), + })); + breaker.record_failure(42); + assert_eq!(breaker.state(42), CircuitState::Open); + + // --- Flappy prober starts "unhealthy". --- + let prober = Flappy::new(); + + // The driver's sweep_once calls probe() but does NOT itself + // drive record_success/record_failure — production relies on + // NexarTransport::send_rpc for that, and the mock has no such + // wrapper. So we install a relay closure that records the + // outcome against the breaker on the driver's behalf. This is + // the minimal glue needed to exercise the real loop end-to-end. + struct RelayProber { + inner: Arc, + breaker: Arc, + } + #[async_trait] + impl ReachabilityProber for RelayProber { + async fn probe(&self, peer: u64) -> Result<()> { + // Mirror send_rpc: check → probe → record outcome. 
+ if self.breaker.check(peer).is_err() { + return Err(ClusterError::CircuitOpen { + node_id: peer, + failures: self.breaker.failure_count(peer), + }); + } + match self.inner.probe(peer).await { + Ok(()) => { + self.breaker.record_success(peer); + Ok(()) + } + Err(e) => { + self.breaker.record_failure(peer); + Err(e) + } + } + } + } + let relay: Arc = Arc::new(RelayProber { + inner: prober.clone(), + breaker: Arc::clone(&breaker), + }); + + let driver = Arc::new(ReachabilityDriver::new( + Arc::clone(&breaker), + relay, + ReachabilityDriverConfig { + interval: Duration::from_millis(150), + }, + )); + let (shutdown_tx, shutdown_rx) = watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&driver); + async move { d.run(shutdown_rx).await } + }); + + // --- First few sweeps: probe keeps failing, breaker stays Open. --- + tokio::time::sleep(Duration::from_millis(500)).await; + assert_eq!( + breaker.state(42), + CircuitState::Open, + "breaker should stay open while peer is unhealthy" + ); + + // --- Heal the peer. Next sweep should drive Open → HalfOpen → Closed. --- + prober.heal(); + + let deadline = Instant::now() + Duration::from_secs(3); + loop { + if breaker.state(42) == CircuitState::Closed { + break; + } + if Instant::now() >= deadline { + panic!("breaker never recovered; state = {:?}", breaker.state(42)); + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + + let _ = shutdown_tx.send(true); + let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; +} diff --git a/nodedb-cluster/tests/rebalancer_loop.rs b/nodedb-cluster/tests/rebalancer_loop.rs new file mode 100644 index 00000000..b32cf2b6 --- /dev/null +++ b/nodedb-cluster/tests/rebalancer_loop.rs @@ -0,0 +1,177 @@ +//! End-to-end rebalancer driver loop. +//! +//! Wires every piece of the rebalancer together without standing up +//! the real `MigrationExecutor`: +//! +//! - A shared `Arc>` + `Arc>`. +//! 
- A `StaticProvider` returning a canned set of `LoadMetrics` so +//! node 1 is massively hotter than nodes 2 and 3. +//! - A `DirectDispatcher` that simulates instantaneous migration +//! completion by reassigning the vshard's group leader in the +//! live routing table and recording the call for assertions. +//! - An `AlwaysReadyGate` — no election gating in this synthetic +//! scenario. +//! +//! The test spawns the loop, advances through one sweep, asserts +//! the dispatcher observed moves exclusively from node 1 as source, +//! and asserts the routing table was actually mutated — proving the +//! full plan → dispatch → apply chain. + +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex, RwLock}; +use std::time::Duration; + +use async_trait::async_trait; +use tokio::sync::watch; + +use nodedb_cluster::error::Result; +use nodedb_cluster::rebalance::PlannedMove; +use nodedb_cluster::rebalancer::{ + AlwaysReadyGate, ElectionGate, LoadMetrics, LoadMetricsProvider, MigrationDispatcher, + RebalancerLoop, RebalancerLoopConfig, +}; +use nodedb_cluster::routing::RoutingTable; +use nodedb_cluster::topology::{ClusterTopology, NodeInfo, NodeState}; + +struct StaticProvider(Vec); + +#[async_trait] +impl LoadMetricsProvider for StaticProvider { + async fn snapshot(&self) -> Result> { + Ok(self.0.clone()) + } +} + +struct DirectDispatcher { + routing: Arc>, + calls: Mutex>, + fired: AtomicBool, +} + +impl DirectDispatcher { + fn new(routing: Arc>) -> Arc { + Arc::new(Self { + routing, + calls: Mutex::new(Vec::new()), + fired: AtomicBool::new(false), + }) + } + fn calls(&self) -> Vec { + self.calls.lock().unwrap().clone() + } + fn fired(&self) -> bool { + self.fired.load(Ordering::SeqCst) + } +} + +#[async_trait] +impl MigrationDispatcher for DirectDispatcher { + async fn dispatch(&self, mv: PlannedMove) -> Result<()> { + // Simulate a completed migration by flipping the group + // leader to the target node. 
+ { + let mut rt = self.routing.write().unwrap_or_else(|p| p.into_inner()); + rt.set_leader(mv.source_group, mv.target_node); + } + self.calls.lock().unwrap().push(mv); + self.fired.store(true, Ordering::SeqCst); + Ok(()) + } +} + +fn topo(nodes: &[u64]) -> Arc> { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let a: std::net::SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + Arc::new(RwLock::new(t)) +} + +fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + cpu_utilization: 0.0, + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn rebalancer_loop_dispatches_and_mutates_routing() { + // --- 3 active nodes, 6 groups, node 1 leads all of them (hot). + let topology = topo(&[1, 2, 3]); + let mut r = RoutingTable::uniform(6, &[1, 2, 3], 1); + for gid in 0..6 { + r.set_leader(gid, 1); + } + let routing = Arc::new(RwLock::new(r)); + + // --- Hot node 1, cold 2 and 3. + let metrics: Arc = Arc::new(StaticProvider(vec![ + lm(1, 500, 5000, 200.0, 200.0), + lm(2, 5, 5, 5.0, 5.0), + lm(3, 5, 5, 5.0, 5.0), + ])); + + let dispatcher = DirectDispatcher::new(routing.clone()); + let gate: Arc = Arc::new(AlwaysReadyGate); + + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig { + interval: Duration::from_millis(50), + ..Default::default() + }, + metrics, + dispatcher.clone() as Arc, + gate, + routing.clone(), + topology.clone(), + )); + + let (shutdown_tx, shutdown_rx) = watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&rloop); + async move { d.run(shutdown_rx).await } + }); + + // Wall-clock wait — the loop uses real time, so just give it a + // couple of intervals to sweep + spawn + dispatch. 
+ let deadline = std::time::Instant::now() + Duration::from_secs(3); + while std::time::Instant::now() < deadline { + if dispatcher.fired() { + break; + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + assert!( + dispatcher.fired(), + "rebalancer loop never dispatched a move" + ); + + // Every move must have node 1 as source. + let calls = dispatcher.calls(); + assert!(!calls.is_empty()); + for c in &calls { + assert_eq!(c.source_node, 1, "source must be the hot node"); + assert_ne!(c.target_node, 1, "target must differ from source"); + } + + // Routing mutation: at least one group previously led by 1 now + // has a non-1 leader. + { + let rt = routing.read().unwrap(); + let still_on_1 = (0..6) + .filter(|gid| rt.group_info(*gid).unwrap().leader == 1) + .count(); + assert!( + still_on_1 < 6, + "at least one group should have moved off node 1" + ); + } + + let _ = shutdown_tx.send(true); + let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; +} diff --git a/nodedb-cluster/tests/swim_routing_invalidation.rs b/nodedb-cluster/tests/swim_routing_invalidation.rs new file mode 100644 index 00000000..6dd67422 --- /dev/null +++ b/nodedb-cluster/tests/swim_routing_invalidation.rs @@ -0,0 +1,159 @@ +//! Liveness drives routing invalidation. +//! +//! Three UDP-backed SWIM nodes form a full mesh. A shared +//! `RoutingTable` declares node B as the leader of group 0. A +//! `RoutingLivenessHook` subscribed to node A's detector is wired to +//! that routing table. When B is shut down, A's detector must observe +//! the Suspect→Dead transition and the hook must clear the leader +//! hint for group 0 within a few suspicion timeouts. 
+ +use std::sync::{Arc, RwLock}; +use std::time::{Duration, Instant}; + +use nodedb_cluster::routing::RoutingTable; +use nodedb_cluster::routing_liveness::{NodeIdResolver, RoutingLivenessHook}; +use nodedb_cluster::swim::Transport; +use nodedb_cluster::swim::bootstrap::spawn_with_subscribers; +use nodedb_cluster::{ + Incarnation, MembershipSubscriber, SwimConfig, SwimHandle, UdpTransport, spawn_swim, +}; +use nodedb_types::NodeId; + +fn fast_cfg() -> SwimConfig { + SwimConfig { + probe_interval: Duration::from_millis(50), + probe_timeout: Duration::from_millis(20), + indirect_probes: 2, + suspicion_mult: 3, + min_suspicion: Duration::from_millis(150), + initial_incarnation: Incarnation::ZERO, + max_piggyback: 6, + fanout_lambda: 3, + } +} + +fn resolver_static() -> NodeIdResolver { + Arc::new(|nid: &NodeId| match nid.as_str() { + "a" => Some(1), + "b" => Some(2), + "c" => Some(3), + _ => None, + }) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn swim_dead_leader_clears_routing_hint() { + // --- Build three real UDP transports on ephemeral ports. --- + let t_a = Arc::new( + UdpTransport::bind("127.0.0.1:0".parse().unwrap()) + .await + .unwrap(), + ); + let t_b = Arc::new( + UdpTransport::bind("127.0.0.1:0".parse().unwrap()) + .await + .unwrap(), + ); + let t_c = Arc::new( + UdpTransport::bind("127.0.0.1:0".parse().unwrap()) + .await + .unwrap(), + ); + let addr_a = t_a.local_addr(); + let addr_b = t_b.local_addr(); + let addr_c = t_c.local_addr(); + + // --- Shared routing table: 4 groups, leader = node b (id=2) for groups 0 and 2. --- + let rt = Arc::new(RwLock::new(RoutingTable::uniform(4, &[1, 2, 3], 3))); + { + let mut guard = rt.write().unwrap(); + guard.set_leader(0, 2); + guard.set_leader(1, 1); + guard.set_leader(2, 2); + guard.set_leader(3, 3); + } + + // --- Hook node A to the routing table. 
--- + let hook: Arc = + Arc::new(RoutingLivenessHook::new(rt.clone(), resolver_static())); + + let h_a: SwimHandle = spawn_with_subscribers( + fast_cfg(), + NodeId::new("a"), + addr_a, + vec![addr_b, addr_c], + t_a.clone() as Arc, + vec![hook], + ) + .await + .expect("spawn a"); + let h_b: SwimHandle = spawn_swim( + fast_cfg(), + NodeId::new("b"), + addr_b, + vec![addr_a, addr_c], + t_b.clone() as Arc, + ) + .await + .expect("spawn b"); + let h_c: SwimHandle = spawn_swim( + fast_cfg(), + NodeId::new("c"), + addr_c, + vec![addr_a, addr_b], + t_c.clone() as Arc, + ) + .await + .expect("spawn c"); + + // --- Wait for A to learn about B (real id, not placeholder). --- + let deadline = Instant::now() + Duration::from_secs(5); + loop { + let seen = h_a.membership().get(&NodeId::new("b")).is_some(); + if seen || Instant::now() >= deadline { + assert!(seen, "A never learned B's real NodeId"); + break; + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + + // --- Sanity: group 0 still led by node 2. --- + { + let guard = rt.read().unwrap(); + assert_eq!(guard.group_info(0).unwrap().leader, 2); + } + + // --- Shut B down and wait for A to invalidate the leader hint. --- + h_b.shutdown().await; + + let deadline = Instant::now() + Duration::from_secs(5); + loop { + let cleared = { + let guard = rt.read().unwrap(); + guard.group_info(0).unwrap().leader == 0 && guard.group_info(2).unwrap().leader == 0 + }; + if cleared { + break; + } + if Instant::now() >= deadline { + panic!("routing hook never cleared leader hints for groups led by B"); + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + + // Groups led by A must be untouched — A is still alive and probing. + // We do NOT assert on groups led by C: under real UDP races the + // detector may transiently flag C as Suspect while B is being + // demoted, which is the correct behaviour of the hook. 
+ { + let guard = rt.read().unwrap(); + assert_eq!( + guard.group_info(1).unwrap().leader, + 1, + "group led by local node A must not be invalidated" + ); + } + + h_a.shutdown().await; + h_c.shutdown().await; +} diff --git a/nodedb-raft/src/node/core.rs b/nodedb-raft/src/node/core.rs index 33ef4d11..0df2408a 100644 --- a/nodedb-raft/src/node/core.rs +++ b/nodedb-raft/src/node/core.rs @@ -5,6 +5,7 @@ //! replication) live in [`super::internal`]. RPC handlers live in //! [`super::rpc`]. +use std::collections::HashSet; use std::time::Instant; use crate::error::{RaftError, Result}; @@ -61,7 +62,7 @@ pub struct RaftNode { /// When the next heartbeat should be sent (leader only). pub(super) heartbeat_deadline: Instant, /// Votes received in current election. - pub(super) votes_received: Vec, + pub(super) votes_received: HashSet, /// Pending ready output. pub(super) ready: Ready, /// Known leader ID (0 = unknown). @@ -89,7 +90,7 @@ impl RaftNode { leader_state: None, election_deadline: now + config.election_timeout_max, heartbeat_deadline: now, - votes_received: Vec::new(), + votes_received: HashSet::new(), ready: Ready::default(), leader_id: 0, config, diff --git a/nodedb-raft/src/node/rpc.rs b/nodedb-raft/src/node/rpc.rs index 31a2af41..d2c5e4d4 100644 --- a/nodedb-raft/src/node/rpc.rs +++ b/nodedb-raft/src/node/rpc.rs @@ -166,7 +166,7 @@ impl RaftNode { } /// Handle RequestVote response (candidate only). 
- pub fn handle_request_vote_response(&mut self, _peer: u64, resp: &RequestVoteResponse) { + pub fn handle_request_vote_response(&mut self, peer: u64, resp: &RequestVoteResponse) { if resp.term > self.hard_state.current_term { self.become_follower(resp.term); return; @@ -177,7 +177,7 @@ impl RaftNode { } if resp.vote_granted { - self.votes_received.push(resp.term); + self.votes_received.insert(peer); let vote_count = self.votes_received.len() + 1; // +1 for self-vote if vote_count >= self.config.quorum() { diff --git a/nodedb-sql/src/ddl_ast/mod.rs b/nodedb-sql/src/ddl_ast/mod.rs new file mode 100644 index 00000000..7e115e38 --- /dev/null +++ b/nodedb-sql/src/ddl_ast/mod.rs @@ -0,0 +1,16 @@ +//! Typed AST for NodeDB-specific DDL statements. +//! +//! Every DDL command the system supports is represented as a variant +//! of [`NodedbStatement`]. The DDL router matches on this enum +//! instead of string prefixes, so the compiler catches missing +//! handlers when a new DDL is added. +//! +//! The parser ([`parse`]) converts raw SQL into a `NodedbStatement` +//! using whitespace-split token matching — the same technique the +//! old string-prefix router used, but producing a typed output. + +pub mod parse; +pub mod statement; + +pub use parse::parse; +pub use statement::NodedbStatement; diff --git a/nodedb-sql/src/ddl_ast/parse.rs b/nodedb-sql/src/ddl_ast/parse.rs new file mode 100644 index 00000000..78fb6635 --- /dev/null +++ b/nodedb-sql/src/ddl_ast/parse.rs @@ -0,0 +1,605 @@ +//! Parse raw SQL into a [`NodedbStatement`]. + +use super::statement::NodedbStatement; + +/// Try to parse a DDL statement from raw SQL. Returns `None` for +/// non-DDL queries (SELECT, INSERT, etc.) that should flow through +/// the normal planner. 
+pub fn parse(sql: &str) -> Option { + let trimmed = sql.trim(); + if trimmed.is_empty() { + return None; + } + let upper = trimmed.to_uppercase(); + let parts: Vec<&str> = trimmed.split_whitespace().collect(); + if parts.is_empty() { + return None; + } + + // ── Collection lifecycle ───────────────────────────────────── + if upper.starts_with("CREATE COLLECTION ") || upper.starts_with("CREATE TABLE ") { + let if_not_exists = upper.contains("IF NOT EXISTS"); + let name = extract_name_after_keyword(&parts, "COLLECTION") + .or_else(|| extract_name_after_keyword(&parts, "TABLE"))?; + return Some(NodedbStatement::CreateCollection { + name, + if_not_exists, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP COLLECTION ") || upper.starts_with("DROP TABLE ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "COLLECTION") + .or_else(|| extract_name_after_if_exists(&parts, "TABLE"))?; + return Some(NodedbStatement::DropCollection { name, if_exists }); + } + if upper.starts_with("ALTER COLLECTION ") || upper.starts_with("ALTER TABLE ") { + let name = extract_name_after_keyword(&parts, "COLLECTION") + .or_else(|| extract_name_after_keyword(&parts, "TABLE"))?; + return Some(NodedbStatement::AlterCollection { + name, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DESCRIBE ") && !upper.starts_with("DESCRIBE SEQUENCE") { + let name = parts.get(1)?.to_string(); + return Some(NodedbStatement::DescribeCollection { name }); + } + if upper == "\\D" || upper == "SHOW COLLECTIONS" || upper.starts_with("SHOW COLLECTIONS") { + return Some(NodedbStatement::ShowCollections); + } + + // ── Index ──────────────────────────────────────────────────── + if upper.starts_with("CREATE UNIQUE INDEX ") || upper.starts_with("CREATE UNIQUE IND") { + return Some(NodedbStatement::CreateIndex { + unique: true, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("CREATE INDEX ") { + return 
Some(NodedbStatement::CreateIndex { + unique: false, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP INDEX ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "INDEX")?; + return Some(NodedbStatement::DropIndex { + name, + collection: None, + if_exists, + }); + } + if upper.starts_with("SHOW INDEX") { + let collection = parts.get(2).map(|s| s.to_string()); + return Some(NodedbStatement::ShowIndexes { collection }); + } + if upper.starts_with("REINDEX ") { + let collection = parts.get(1)?.to_string(); + return Some(NodedbStatement::Reindex { collection }); + } + + // ── Trigger ────────────────────────────────────────────────── + if upper.starts_with("CREATE ") && upper.contains("TRIGGER ") { + let or_replace = upper.contains("OR REPLACE"); + let deferred = upper.contains("DEFERRED"); + let sync = upper.contains("SYNC"); + return Some(NodedbStatement::CreateTrigger { + or_replace, + deferred, + sync, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP TRIGGER ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "TRIGGER")?; + let collection = extract_after_keyword(&parts, "ON").unwrap_or_default(); + return Some(NodedbStatement::DropTrigger { + name, + collection, + if_exists, + }); + } + if upper.starts_with("ALTER TRIGGER ") { + return Some(NodedbStatement::AlterTrigger { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW TRIGGERS") { + let collection = if upper.starts_with("SHOW TRIGGERS ON ") { + parts.get(3).map(|s| s.to_string()) + } else { + None + }; + return Some(NodedbStatement::ShowTriggers { collection }); + } + + // ── Schedule ───────────────────────────────────────────────── + if upper.starts_with("CREATE SCHEDULE ") { + return Some(NodedbStatement::CreateSchedule { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP SCHEDULE ") { + let if_exists = upper.contains("IF 
EXISTS"); + let name = extract_name_after_if_exists(&parts, "SCHEDULE")?; + return Some(NodedbStatement::DropSchedule { name, if_exists }); + } + if upper.starts_with("ALTER SCHEDULE ") { + return Some(NodedbStatement::AlterSchedule { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW SCHEDULE HISTORY ") { + let name = parts.get(3)?.to_string(); + return Some(NodedbStatement::ShowScheduleHistory { name }); + } + if upper == "SHOW SCHEDULES" || upper.starts_with("SHOW SCHEDULES") { + return Some(NodedbStatement::ShowSchedules); + } + + // ── Sequence ───────────────────────────────────────────────── + if upper.starts_with("CREATE SEQUENCE ") { + let if_not_exists = upper.contains("IF NOT EXISTS"); + let name = extract_name_after_if_exists(&parts, "SEQUENCE")?; + return Some(NodedbStatement::CreateSequence { + name, + if_not_exists, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP SEQUENCE ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "SEQUENCE")?; + return Some(NodedbStatement::DropSequence { name, if_exists }); + } + if upper.starts_with("ALTER SEQUENCE ") { + return Some(NodedbStatement::AlterSequence { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DESCRIBE SEQUENCE ") { + let name = parts.get(2)?.to_string(); + return Some(NodedbStatement::DescribeSequence { name }); + } + if upper == "SHOW SEQUENCES" || upper.starts_with("SHOW SEQUENCES") { + return Some(NodedbStatement::ShowSequences); + } + + // ── Alert ──────────────────────────────────────────────────── + if upper.starts_with("CREATE ALERT ") { + return Some(NodedbStatement::CreateAlert { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP ALERT ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "ALERT")?; + return Some(NodedbStatement::DropAlert { name, if_exists }); + } + if upper.starts_with("ALTER ALERT ") { + return 
Some(NodedbStatement::AlterAlert { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW ALERT STATUS ") { + let name = parts.get(3)?.to_string(); + return Some(NodedbStatement::ShowAlertStatus { name }); + } + if upper.starts_with("SHOW ALERT") && !upper.starts_with("SHOW ALERT STATUS") { + return Some(NodedbStatement::ShowAlerts); + } + + // ── Retention policy ───────────────────────────────────────── + if upper.starts_with("CREATE RETENTION POLICY ") { + return Some(NodedbStatement::CreateRetentionPolicy { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP RETENTION POLICY ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "POLICY")?; + return Some(NodedbStatement::DropRetentionPolicy { name, if_exists }); + } + if upper.starts_with("ALTER RETENTION POLICY ") { + return Some(NodedbStatement::AlterRetentionPolicy { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW RETENTION POLIC") { + return Some(NodedbStatement::ShowRetentionPolicies); + } + + // ── Cluster admin ──────────────────────────────────────────── + if upper.starts_with("SHOW CLUSTER") { + return Some(NodedbStatement::ShowCluster); + } + if upper.starts_with("SHOW MIGRATIONS") { + return Some(NodedbStatement::ShowMigrations); + } + if upper.starts_with("SHOW RANGES") { + return Some(NodedbStatement::ShowRanges); + } + if upper.starts_with("SHOW ROUTING") { + return Some(NodedbStatement::ShowRouting); + } + if upper.starts_with("SHOW SCHEMA VERSION") { + return Some(NodedbStatement::ShowSchemaVersion); + } + if upper.starts_with("SHOW PEER HEALTH") { + return Some(NodedbStatement::ShowPeerHealth); + } + if upper.starts_with("REBALANCE") { + return Some(NodedbStatement::Rebalance); + } + if upper.starts_with("SHOW RAFT GROUP ") { + let id = parts.get(3)?.to_string(); + return Some(NodedbStatement::ShowRaftGroup { group_id: id }); + } + if upper.starts_with("SHOW RAFT GROUPS") || 
upper.starts_with("SHOW RAFT") { + return Some(NodedbStatement::ShowRaftGroups); + } + if upper.starts_with("ALTER RAFT GROUP ") { + return Some(NodedbStatement::AlterRaftGroup { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("REMOVE NODE ") { + let id = parts.get(2)?.to_string(); + return Some(NodedbStatement::RemoveNode { node_id: id }); + } + if upper.starts_with("SHOW NODE ") { + let id = parts.get(2)?.to_string(); + return Some(NodedbStatement::ShowNode { node_id: id }); + } + if upper.starts_with("SHOW NODES") { + return Some(NodedbStatement::ShowNodes); + } + + // ── Maintenance ────────────────────────────────────────────── + if upper.starts_with("ANALYZE") { + let collection = parts.get(1).map(|s| s.to_string()); + return Some(NodedbStatement::Analyze { collection }); + } + if upper.starts_with("COMPACT ") { + let collection = parts.get(1)?.to_string(); + return Some(NodedbStatement::Compact { collection }); + } + if upper.starts_with("SHOW COMPACTION ST") { + return Some(NodedbStatement::ShowCompactionStatus); + } + if upper.starts_with("SHOW STORAGE") { + let collection = parts.get(2).map(|s| s.to_string()); + return Some(NodedbStatement::ShowStorage { collection }); + } + + // ── Backup / restore ───────────────────────────────────────── + if upper.starts_with("BACKUP TENANT ") { + return Some(NodedbStatement::BackupTenant { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("RESTORE TENANT ") { + let dry_run = upper.ends_with(" DRY RUN") || upper.ends_with(" DRYRUN"); + return Some(NodedbStatement::RestoreTenant { + dry_run, + raw_sql: trimmed.to_string(), + }); + } + + // ── User / auth ────────────────────────────────────────────── + if upper.starts_with("CREATE USER ") { + return Some(NodedbStatement::CreateUser { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP USER ") { + let username = parts.get(2)?.to_string(); + return Some(NodedbStatement::DropUser { username }); + } + if 
upper.starts_with("ALTER USER ") { + return Some(NodedbStatement::AlterUser { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW USERS") { + return Some(NodedbStatement::ShowUsers); + } + if upper.starts_with("GRANT ROLE ") { + return Some(NodedbStatement::GrantRole { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("REVOKE ROLE ") { + return Some(NodedbStatement::RevokeRole { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("GRANT ") { + return Some(NodedbStatement::GrantPermission { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("REVOKE ") { + return Some(NodedbStatement::RevokePermission { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW PERMISSIONS") { + let collection = parts.get(2).map(|s| s.to_string()); + return Some(NodedbStatement::ShowPermissions { collection }); + } + if upper.starts_with("SHOW GRANTS") { + let username = parts.get(2).map(|s| s.to_string()); + return Some(NodedbStatement::ShowGrants { username }); + } + if upper.starts_with("SHOW TENANTS") { + return Some(NodedbStatement::ShowTenants); + } + if upper.starts_with("SHOW AUDIT") { + return Some(NodedbStatement::ShowAuditLog); + } + if upper.starts_with("SHOW CONSTRAINTS ") { + let collection = parts.get(2)?.to_string(); + return Some(NodedbStatement::ShowConstraints { collection }); + } + if upper.starts_with("SHOW TYPEGUARD") { + let collection = parts.get(2)?.to_string(); + return Some(NodedbStatement::ShowTypeGuards { collection }); + } + + // ── Change stream ──────────────────────────────────────────── + if upper.starts_with("CREATE CHANGE STREAM ") { + return Some(NodedbStatement::CreateChangeStream { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP CHANGE STREAM ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "STREAM")?; + return Some(NodedbStatement::DropChangeStream { name, if_exists }); + } + + // ── RLS 
────────────────────────────────────────────────────── + if upper.starts_with("CREATE RLS POLICY ") { + return Some(NodedbStatement::CreateRlsPolicy { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP RLS POLICY ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "POLICY")?; + let collection = extract_after_keyword(&parts, "ON").unwrap_or_default(); + return Some(NodedbStatement::DropRlsPolicy { + name, + collection, + if_exists, + }); + } + if upper.starts_with("SHOW RLS POLI") { + let collection = parts.get(3).map(|s| s.to_string()); + return Some(NodedbStatement::ShowRlsPolicies { collection }); + } + + // ── Materialized view ──────────────────────────────────────── + if upper.starts_with("CREATE MATERIALIZED VIEW ") { + return Some(NodedbStatement::CreateMaterializedView { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP MATERIALIZED VIEW ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "VIEW")?; + return Some(NodedbStatement::DropMaterializedView { name, if_exists }); + } + + // ── Continuous aggregate ───────────────────────────────────── + if upper.starts_with("CREATE CONTINUOUS AGGREGATE ") { + return Some(NodedbStatement::CreateContinuousAggregate { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP CONTINUOUS AGGREGATE ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "AGGREGATE")?; + return Some(NodedbStatement::DropContinuousAggregate { name, if_exists }); + } + + None +} + +/// Extract the object name that follows a keyword (e.g. "COLLECTION" +/// in "CREATE COLLECTION users ..."). Handles IF NOT EXISTS by +/// skipping those tokens. 
+fn extract_name_after_keyword(parts: &[&str], keyword: &str) -> Option { + let kw_upper = keyword.to_uppercase(); + let pos = parts.iter().position(|p| p.to_uppercase() == kw_upper)?; + let mut idx = pos + 1; + // Skip IF NOT EXISTS tokens. + if parts.get(idx).map(|s| s.to_uppercase()) == Some("IF".to_string()) { + idx += 1; // NOT + if parts.get(idx).map(|s| s.to_uppercase()) == Some("NOT".to_string()) { + idx += 1; // EXISTS + } + if parts.get(idx).map(|s| s.to_uppercase()) == Some("EXISTS".to_string()) { + idx += 1; + } + } + parts.get(idx).map(|s| s.to_string()) +} + +/// Extract the object name for DROP-style commands where IF EXISTS +/// may appear between the keyword and the name. +fn extract_name_after_if_exists(parts: &[&str], keyword: &str) -> Option { + extract_name_after_keyword(parts, keyword) +} + +/// Extract the token after a keyword like "ON" or "TO". +fn extract_after_keyword(parts: &[&str], keyword: &str) -> Option { + let kw_upper = keyword.to_uppercase(); + let pos = parts.iter().position(|p| p.to_uppercase() == kw_upper)?; + parts.get(pos + 1).map(|s| s.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_create_collection() { + let stmt = parse("CREATE COLLECTION users (id INT, name TEXT)").unwrap(); + match stmt { + NodedbStatement::CreateCollection { + name, + if_not_exists, + .. + } => { + assert_eq!(name, "users"); + assert!(!if_not_exists); + } + other => panic!("expected CreateCollection, got {other:?}"), + } + } + + #[test] + fn parse_create_collection_if_not_exists() { + let stmt = parse("CREATE COLLECTION IF NOT EXISTS users").unwrap(); + match stmt { + NodedbStatement::CreateCollection { + name, + if_not_exists, + .. 
+ } => { + assert_eq!(name, "users"); + assert!(if_not_exists); + } + other => panic!("expected CreateCollection, got {other:?}"), + } + } + + #[test] + fn parse_drop_collection() { + let stmt = parse("DROP COLLECTION users").unwrap(); + assert_eq!( + stmt, + NodedbStatement::DropCollection { + name: "users".into(), + if_exists: false, + } + ); + } + + #[test] + fn parse_drop_collection_if_exists() { + let stmt = parse("DROP COLLECTION IF EXISTS users").unwrap(); + assert_eq!( + stmt, + NodedbStatement::DropCollection { + name: "users".into(), + if_exists: true, + } + ); + } + + #[test] + fn parse_show_nodes() { + assert_eq!(parse("SHOW NODES"), Some(NodedbStatement::ShowNodes)); + } + + #[test] + fn parse_show_cluster() { + assert_eq!(parse("SHOW CLUSTER"), Some(NodedbStatement::ShowCluster)); + } + + #[test] + fn parse_create_trigger() { + let stmt = parse("CREATE OR REPLACE SYNC TRIGGER on_insert ...").unwrap(); + match stmt { + NodedbStatement::CreateTrigger { + or_replace, + sync, + deferred, + .. + } => { + assert!(or_replace); + assert!(sync); + assert!(!deferred); + } + other => panic!("expected CreateTrigger, got {other:?}"), + } + } + + #[test] + fn parse_drop_index_if_exists() { + let stmt = parse("DROP INDEX IF EXISTS idx_name").unwrap(); + match stmt { + NodedbStatement::DropIndex { + name, if_exists, .. 
+ } => { + assert_eq!(name, "idx_name"); + assert!(if_exists); + } + other => panic!("expected DropIndex, got {other:?}"), + } + } + + #[test] + fn parse_analyze() { + assert_eq!( + parse("ANALYZE users"), + Some(NodedbStatement::Analyze { + collection: Some("users".into()), + }) + ); + assert_eq!( + parse("ANALYZE"), + Some(NodedbStatement::Analyze { collection: None }) + ); + } + + #[test] + fn non_ddl_returns_none() { + assert!(parse("SELECT * FROM users").is_none()); + assert!(parse("INSERT INTO users VALUES (1)").is_none()); + } + + #[test] + fn parse_grant_role() { + let stmt = parse("GRANT ROLE admin TO alice").unwrap(); + match stmt { + NodedbStatement::GrantRole { raw_sql } => { + assert!(raw_sql.contains("admin")); + } + other => panic!("expected GrantRole, got {other:?}"), + } + } + + #[test] + fn parse_create_sequence_if_not_exists() { + let stmt = parse("CREATE SEQUENCE IF NOT EXISTS my_seq START 1").unwrap(); + match stmt { + NodedbStatement::CreateSequence { + name, + if_not_exists, + .. + } => { + assert_eq!(name, "my_seq"); + assert!(if_not_exists); + } + other => panic!("expected CreateSequence, got {other:?}"), + } + } + + #[test] + fn parse_restore_dry_run() { + let stmt = parse("RESTORE TENANT 1 FROM '/tmp/backup' DRY RUN").unwrap(); + match stmt { + NodedbStatement::RestoreTenant { dry_run, .. } => { + assert!(dry_run); + } + other => panic!("expected RestoreTenant, got {other:?}"), + } + } +} diff --git a/nodedb-sql/src/ddl_ast/statement.rs b/nodedb-sql/src/ddl_ast/statement.rs new file mode 100644 index 00000000..30ee3db7 --- /dev/null +++ b/nodedb-sql/src/ddl_ast/statement.rs @@ -0,0 +1,278 @@ +//! The [`NodedbStatement`] enum — one variant per DDL command. + +/// Typed representation of every NodeDB DDL statement. +/// +/// Handlers receive a fully-parsed variant instead of raw `&[&str]` +/// parts, eliminating array-index panics and enabling exhaustive +/// match coverage for new DDL commands. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum NodedbStatement { + // ── Collection lifecycle ───────────────────────────────────── + CreateCollection { + name: String, + if_not_exists: bool, + raw_sql: String, + }, + DropCollection { + name: String, + if_exists: bool, + }, + AlterCollection { + name: String, + raw_sql: String, + }, + DescribeCollection { + name: String, + }, + ShowCollections, + + // ── Index ──────────────────────────────────────────────────── + CreateIndex { + unique: bool, + raw_sql: String, + }, + DropIndex { + name: String, + collection: Option, + if_exists: bool, + }, + ShowIndexes { + collection: Option, + }, + Reindex { + collection: String, + }, + + // ── Trigger ────────────────────────────────────────────────── + CreateTrigger { + or_replace: bool, + deferred: bool, + sync: bool, + raw_sql: String, + }, + DropTrigger { + name: String, + collection: String, + if_exists: bool, + }, + AlterTrigger { + raw_sql: String, + }, + ShowTriggers { + collection: Option, + }, + + // ── Schedule ───────────────────────────────────────────────── + CreateSchedule { + raw_sql: String, + }, + DropSchedule { + name: String, + if_exists: bool, + }, + AlterSchedule { + raw_sql: String, + }, + ShowSchedules, + ShowScheduleHistory { + name: String, + }, + + // ── Sequence ───────────────────────────────────────────────── + CreateSequence { + name: String, + if_not_exists: bool, + raw_sql: String, + }, + DropSequence { + name: String, + if_exists: bool, + }, + AlterSequence { + raw_sql: String, + }, + DescribeSequence { + name: String, + }, + ShowSequences, + + // ── Alert ──────────────────────────────────────────────────── + CreateAlert { + raw_sql: String, + }, + DropAlert { + name: String, + if_exists: bool, + }, + AlterAlert { + raw_sql: String, + }, + ShowAlerts, + ShowAlertStatus { + name: String, + }, + + // ── Retention policy ───────────────────────────────────────── + CreateRetentionPolicy { + raw_sql: String, + }, + DropRetentionPolicy { + name: 
String, + if_exists: bool, + }, + AlterRetentionPolicy { + raw_sql: String, + }, + ShowRetentionPolicies, + + // ── Change stream ──────────────────────────────────────────── + CreateChangeStream { + raw_sql: String, + }, + DropChangeStream { + name: String, + if_exists: bool, + }, + AlterChangeStream { + raw_sql: String, + }, + ShowChangeStreams, + + // ── Consumer group ─────────────────────────────────────────── + CreateConsumerGroup { + raw_sql: String, + }, + DropConsumerGroup { + name: String, + stream: String, + if_exists: bool, + }, + ShowConsumerGroups { + stream: Option, + }, + + // ── RLS policy ─────────────────────────────────────────────── + CreateRlsPolicy { + raw_sql: String, + }, + DropRlsPolicy { + name: String, + collection: String, + if_exists: bool, + }, + ShowRlsPolicies { + collection: Option, + }, + + // ── Materialized view ──────────────────────────────────────── + CreateMaterializedView { + raw_sql: String, + }, + DropMaterializedView { + name: String, + if_exists: bool, + }, + ShowMaterializedViews, + + // ── Continuous aggregate ───────────────────────────────────── + CreateContinuousAggregate { + raw_sql: String, + }, + DropContinuousAggregate { + name: String, + if_exists: bool, + }, + ShowContinuousAggregates, + + // ── Backup / restore ───────────────────────────────────────── + BackupTenant { + raw_sql: String, + }, + RestoreTenant { + dry_run: bool, + raw_sql: String, + }, + + // ── Cluster admin ──────────────────────────────────────────── + ShowNodes, + ShowNode { + node_id: String, + }, + RemoveNode { + node_id: String, + }, + ShowCluster, + ShowMigrations, + ShowRanges, + ShowRouting, + ShowSchemaVersion, + ShowPeerHealth, + Rebalance, + ShowRaftGroups, + ShowRaftGroup { + group_id: String, + }, + AlterRaftGroup { + raw_sql: String, + }, + + // ── Maintenance ────────────────────────────────────────────── + Analyze { + collection: Option, + }, + Compact { + collection: String, + }, + ShowStorage { + collection: Option, + }, + 
ShowCompactionStatus, + + // ── User / auth / grant ────────────────────────────────────── + CreateUser { + raw_sql: String, + }, + DropUser { + username: String, + }, + AlterUser { + raw_sql: String, + }, + ShowUsers, + GrantRole { + raw_sql: String, + }, + RevokeRole { + raw_sql: String, + }, + GrantPermission { + raw_sql: String, + }, + RevokePermission { + raw_sql: String, + }, + ShowPermissions { + collection: Option, + }, + ShowGrants { + username: Option, + }, + + // ── Miscellaneous ──────────────────────────────────────────── + ShowTenants, + ShowAuditLog, + ShowConstraints { + collection: String, + }, + ShowTypeGuards { + collection: String, + }, + + /// Catch-all for DDL-like commands not yet promoted to their + /// own variant. Preserves the raw SQL for the legacy dispatch + /// path so new variants can be added incrementally without + /// breaking existing handlers. + Other { + raw_sql: String, + }, +} diff --git a/nodedb-sql/src/lib.rs b/nodedb-sql/src/lib.rs index 269d8ea4..4b614fe2 100644 --- a/nodedb-sql/src/lib.rs +++ b/nodedb-sql/src/lib.rs @@ -9,6 +9,7 @@ //! ``` pub mod catalog; +pub mod ddl_ast; pub mod engine_rules; pub mod error; pub mod functions; diff --git a/nodedb-types/src/config/tuning/network.rs b/nodedb-types/src/config/tuning/network.rs index 888fa982..4fd9d956 100644 --- a/nodedb-types/src/config/tuning/network.rs +++ b/nodedb-types/src/config/tuning/network.rs @@ -223,10 +223,10 @@ fn default_raft_tick_interval_ms() -> u64 { 10 } fn default_election_timeout_min_secs() -> u64 { - 60 + 2 } fn default_election_timeout_max_secs() -> u64 { - 120 + 5 } fn default_rpc_timeout_secs() -> u64 { 5 diff --git a/nodedb/src/control/cluster/handle.rs b/nodedb/src/control/cluster/handle.rs index 3cede98e..bc8845e1 100644 --- a/nodedb/src/control/cluster/handle.rs +++ b/nodedb/src/control/cluster/handle.rs @@ -33,4 +33,8 @@ pub struct ClusterHandle { /// stays `Clone` while still guaranteeing single-transfer /// semantics at runtime. 
pub multi_raft: Mutex>, + /// Cluster catalog (redb-backed topology + routing persistence). + /// Shared with the `HealthMonitor` for persisting topology changes + /// on failure detection and recovery. + pub catalog: Arc, } diff --git a/nodedb/src/control/cluster/init.rs b/nodedb/src/control/cluster/init.rs index ef06fe4b..3315b7d8 100644 --- a/nodedb/src/control/cluster/init.rs +++ b/nodedb/src/control/cluster/init.rs @@ -38,7 +38,7 @@ pub async fn init_cluster( "cluster QUIC transport bound" ); - init_cluster_with_transport(config, transport, data_dir).await + init_cluster_with_transport(config, transport, data_dir, transport_tuning).await } /// Initialize the cluster using a pre-bound QUIC transport. @@ -56,13 +56,15 @@ pub async fn init_cluster_with_transport( config: &ClusterSettings, transport: Arc, data_dir: &std::path::Path, + transport_tuning: &ClusterTransportTuning, ) -> crate::Result { // 2. Open cluster catalog. let catalog_path = data_dir.join("cluster.redb"); - let catalog = + let catalog = Arc::new( nodedb_cluster::ClusterCatalog::open(&catalog_path).map_err(|e| crate::Error::Config { detail: format!("cluster catalog: {e}"), - })?; + })?, + ); // 3. Bootstrap, join, or restart. 
let cluster_config = nodedb_cluster::ClusterConfig { @@ -75,6 +77,12 @@ pub async fn init_cluster_with_transport( force_bootstrap: config.force_bootstrap, join_retry: join_retry_policy_from_env(), swim_udp_addr: None, + election_timeout_min: std::time::Duration::from_secs( + transport_tuning.election_timeout_min_secs, + ), + election_timeout_max: std::time::Duration::from_secs( + transport_tuning.election_timeout_max_secs, + ), }; let lifecycle = nodedb_cluster::ClusterLifecycleTracker::new(); @@ -105,6 +113,7 @@ pub async fn init_cluster_with_transport( applied_index_watcher, node_id: config.node_id, multi_raft: Mutex::new(Some(state.multi_raft)), + catalog, }) } diff --git a/nodedb/src/control/cluster/start_raft.rs b/nodedb/src/control/cluster/start_raft.rs index 1c14c57c..bc968bc4 100644 --- a/nodedb/src/control/cluster/start_raft.rs +++ b/nodedb/src/control/cluster/start_raft.rs @@ -112,7 +112,7 @@ pub fn start_raft( // Start the RPC server (accepts inbound QUIC connections). let transport_serve = handle.transport.clone(); let rl_handler = raft_loop.clone(); - let sr_serve = shutdown_rx; + let sr_serve = shutdown_rx.clone(); tokio::spawn(async move { if let Err(e) = transport_serve.serve(rl_handler, sr_serve).await { tracing::error!(error = %e, "raft RPC server failed"); @@ -138,6 +138,27 @@ pub fn start_raft( ); } + // Start the health monitor (periodic pings, failure detection, + // topology re-broadcast). Without this, topology updates are + // only propagated via the fire-and-forget broadcast during the + // join flow — if that single broadcast is lost (peer QUIC server + // not yet accepting), the peer never converges. 
+ let health_config = nodedb_cluster::HealthConfig { + ping_interval: Duration::from_secs(transport_tuning.health_ping_interval_secs), + failure_threshold: transport_tuning.health_failure_threshold, + }; + let health_monitor = Arc::new(nodedb_cluster::HealthMonitor::new( + handle.node_id, + handle.transport.clone(), + handle.topology.clone(), + handle.catalog.clone(), + health_config, + )); + let sr_health = shutdown_rx; + tokio::spawn(async move { + health_monitor.run(sr_health).await; + }); + info!(node_id = handle.node_id, "raft loop and RPC server started"); Ok(ready_rx) diff --git a/nodedb/src/control/metadata_proposer.rs b/nodedb/src/control/metadata_proposer.rs index 8a8314d5..6077a92f 100644 --- a/nodedb/src/control/metadata_proposer.rs +++ b/nodedb/src/control/metadata_proposer.rs @@ -176,6 +176,15 @@ pub fn propose_catalog_entry_with_timeout( } let payload = catalog_entry::encode(entry)?; + + // DDL transaction buffer: if a transactional DDL session is + // active on this thread (BEGIN ... COMMIT), buffer the payload + // instead of proposing immediately. The buffered entries will + // be proposed as a single MetadataEntry::Batch at COMMIT time. + if crate::control::server::pgwire::session::ddl_buffer::try_buffer(payload.clone()) { + return Ok(0); + } + let metadata_entry = MetadataEntry::CatalogDdl { payload }; let raw = encode_entry(&metadata_entry).map_err(|e| Error::Config { detail: format!("metadata entry encode: {e}"), diff --git a/nodedb/src/control/server/http/routes/health.rs b/nodedb/src/control/server/http/routes/health.rs index a97e02af..6a717d57 100644 --- a/nodedb/src/control/server/http/routes/health.rs +++ b/nodedb/src/control/server/http/routes/health.rs @@ -1,4 +1,12 @@ //! Health check endpoints. +//! +//! | Endpoint | Method | Purpose | k8s probe | +//! |-------------------|--------|-----------------------------|---------------| +//! | `/health/live` | GET | Process alive (always 200) | liveness | +//! 
| `/healthz` | GET | Ready to serve traffic | readiness | +//! | `/health` | GET | Liveness with cluster info | — | +//! | `/health/ready` | GET | WAL recovered | readiness alt | +//! | `/health/drain` | POST | Trigger graceful drain | preStop hook | use axum::extract::State; use axum::http::StatusCode; @@ -7,13 +15,36 @@ use serde_json::json; use super::super::auth::AppState; -/// GET /healthz — k8s-style readiness/liveness probe. +/// GET /health/live — unconditional liveness probe. /// -/// Returns `200 OK` when the node has reached `GatewayEnable` and is -/// serving traffic. Returns `503 Service Unavailable` during startup or if -/// startup has failed. This endpoint bypasses the startup gate middleware -/// and is always reachable, making it suitable as a k8s readiness probe. +/// Always returns 200. If this endpoint fails to respond, the +/// process is dead and should be restarted. No internal state is +/// checked — the mere ability to respond proves the event loop and +/// HTTP listener are alive. +pub async fn live() -> impl IntoResponse { + (StatusCode::OK, axum::Json(json!({ "status": "alive" }))) +} + +/// GET /healthz — k8s-style readiness probe. +/// +/// Returns `200 OK` when the node has reached `GatewayEnable`, is +/// serving traffic, and is NOT draining/decommissioned. Returns +/// `503 Service Unavailable` during startup, after startup failure, +/// or when the node is being decommissioned. pub async fn healthz(State(state): State) -> impl IntoResponse { + // Check decommission state via the cluster observer (if present). 
+ if let Some(obs) = state.shared.cluster_observer.get() { + let snap = obs.snapshot(); + let label = snap.lifecycle_label(); + if label == "draining" || label == "decommissioned" || label == "failed" { + let body = json!({ + "status": "draining", + "lifecycle": label, + "node_id": state.shared.node_id, + }); + return (StatusCode::SERVICE_UNAVAILABLE, axum::Json(body)); + } + } let health = crate::control::startup::health::observe(&state.shared.startup); let (status, body) = crate::control::startup::health::to_http_response(&health); (status, axum::Json(body)) @@ -60,3 +91,31 @@ pub async fn ready(State(state): State) -> impl IntoResponse { }); (status, axum::Json(body)) } + +/// POST /health/drain — trigger graceful connection drain. +/// +/// Signals the canonical `ShutdownWatch` so every background loop +/// begins its cooperative exit. Subsequent `/healthz` calls return +/// 503, which causes the k8s readiness probe to fail and the +/// service mesh to stop routing new connections to this node. +/// +/// Designed for use as a k8s `preStop` hook: +/// +/// ```yaml +/// lifecycle: +/// preStop: +/// httpGet: +/// path: /health/drain +/// port: http +/// ``` +pub async fn drain(State(state): State) -> impl IntoResponse { + tracing::info!(node_id = state.shared.node_id, "drain requested via HTTP"); + state.shared.shutdown.signal(); + ( + StatusCode::OK, + axum::Json(json!({ + "status": "draining", + "node_id": state.shared.node_id, + })), + ) +} diff --git a/nodedb/src/control/server/http/server.rs b/nodedb/src/control/server/http/server.rs index 1a7e8d28..33d7a11c 100644 --- a/nodedb/src/control/server/http/server.rs +++ b/nodedb/src/control/server/http/server.rs @@ -3,7 +3,9 @@ //! Endpoints: //! - GET /healthz — k8s readiness/liveness (always reachable; 503 until GatewayEnable) //! - GET /health — liveness +//! - GET /health/live — unconditional liveness probe //! - GET /health/ready — readiness (WAL recovered) +//! 
- POST /health/drain — trigger graceful drain //! - GET /metrics — Prometheus-format metrics (requires monitor role) //! - POST /query — execute DDL via HTTP (requires auth) @@ -29,7 +31,9 @@ fn build_router(state: AppState) -> Router { // /healthz is always reachable — returns 503 during startup, 200 after. .route("/healthz", get(routes::health::healthz)) .route("/health", get(routes::health::health)) + .route("/health/live", get(routes::health::live)) .route("/health/ready", get(routes::health::ready)) + .route("/health/drain", post(routes::health::drain)) .route("/metrics", get(routes::metrics::metrics)) .route("/query", post(routes::query::query)) .route("/status", get(routes::status::status)) @@ -98,8 +102,8 @@ fn build_router(state: AppState) -> Router { /// Axum middleware that gates non-health routes on [`StartupPhase::GatewayEnable`]. /// -/// `/healthz`, `/health`, and `/health/ready` are always let through so k8s -/// readiness probes can observe startup progress. All other routes receive a +/// All `/health*` paths (liveness, readiness, drain) are always let through so +/// k8s probes can observe startup progress. All other routes receive a /// `503 Service Unavailable` until the node reaches `GatewayEnable`. 
async fn startup_gate_middleware( State(app_state): State, diff --git a/nodedb/src/control/server/native/dispatch/sql.rs b/nodedb/src/control/server/native/dispatch/sql.rs index 570b3c21..7831d3cb 100644 --- a/nodedb/src/control/server/native/dispatch/sql.rs +++ b/nodedb/src/control/server/native/dispatch/sql.rs @@ -282,6 +282,9 @@ fn is_session_show(upper: &str) -> bool { && !upper.starts_with("SHOW PEER") && !upper.starts_with("SHOW NODES") && !upper.starts_with("SHOW NODE ") + && !upper.starts_with("SHOW RANGES") + && !upper.starts_with("SHOW ROUTING") + && !upper.starts_with("SHOW SCHEMA VERSION") && !upper.starts_with("SHOW COLLECTIONS") && !upper.starts_with("SHOW AUDIT") && !upper.starts_with("SHOW PERMISSIONS") diff --git a/nodedb/src/control/server/pgwire/ddl/auth_user_ddl.rs b/nodedb/src/control/server/pgwire/ddl/auth_user_ddl.rs index 0bcc576a..7228ada9 100644 --- a/nodedb/src/control/server/pgwire/ddl/auth_user_ddl.rs +++ b/nodedb/src/control/server/pgwire/ddl/auth_user_ddl.rs @@ -32,7 +32,7 @@ pub fn handle_auth_user( )); } - let upper0 = parts[0].to_uppercase(); + let upper0 = parts.first().map(|s| s.to_uppercase()).unwrap_or_default(); match upper0.as_str() { "DEACTIVATE" => deactivate_auth_user(state, identity, parts), "ALTER" => alter_auth_user_status(state, identity, parts), diff --git a/nodedb/src/control/server/pgwire/ddl/cluster/mod.rs b/nodedb/src/control/server/pgwire/ddl/cluster/mod.rs index 58509162..fbd34c9f 100644 --- a/nodedb/src/control/server/pgwire/ddl/cluster/mod.rs +++ b/nodedb/src/control/server/pgwire/ddl/cluster/mod.rs @@ -1,11 +1,17 @@ pub mod health; pub mod migration; pub mod raft; +pub mod ranges; pub mod rebalance_cmd; +pub mod routing_hint; +pub mod schema_version; pub mod topology; pub use health::show_peer_health; pub use migration::show_migrations; pub use raft::{alter_raft_group, show_raft_group, show_raft_groups}; +pub use ranges::show_ranges; pub use rebalance_cmd::rebalance; +pub use routing_hint::show_routing; +pub 
use schema_version::show_schema_version; pub use topology::{remove_node, show_cluster, show_node, show_nodes}; diff --git a/nodedb/src/control/server/pgwire/ddl/cluster/ranges.rs b/nodedb/src/control/server/pgwire/ddl/cluster/ranges.rs new file mode 100644 index 00000000..8c82e2a1 --- /dev/null +++ b/nodedb/src/control/server/pgwire/ddl/cluster/ranges.rs @@ -0,0 +1,75 @@ +//! `SHOW RANGES` — vshard distribution across the cluster. + +use std::sync::Arc; + +use futures::stream; +use pgwire::api::results::{DataRowEncoder, QueryResponse, Response}; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +use super::super::super::types::{int8_field, sqlstate_error, text_field}; + +/// SHOW RANGES — list vshards with leaseholder and replica info. +/// +/// Columns: vshard_id, group_id, leaseholder, replicas. +/// Superuser only. +pub fn show_ranges( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + if !identity.is_superuser { + return Err(sqlstate_error( + "42501", + "permission denied: only superuser can view ranges", + )); + } + + let routing = match &state.cluster_routing { + Some(r) => r, + None => { + return Err(sqlstate_error( + "55000", + "cluster mode not enabled (single-node instance)", + )); + } + }; + + let schema = Arc::new(vec![ + int8_field("vshard_id"), + int8_field("group_id"), + int8_field("leaseholder"), + text_field("replicas"), + ]); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + let rt = routing.read().unwrap_or_else(|p| p.into_inner()); + for vshard_id in 0..nodedb_cluster::routing::VSHARD_COUNT { + let group_id = rt.group_for_vshard(vshard_id).unwrap_or(0); + let (leader, replicas_str) = match rt.group_info(group_id) { + Some(info) => { + let replicas: String = info + .members + .iter() + .map(|m| m.to_string()) + .collect::>() + .join(", "); + (info.leader as i64, replicas) + } + None 
=> (0i64, String::new()), + }; + encoder.encode_field(&(vshard_id as i64))?; + encoder.encode_field(&(group_id as i64))?; + encoder.encode_field(&leader)?; + encoder.encode_field(&replicas_str)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} diff --git a/nodedb/src/control/server/pgwire/ddl/cluster/routing_hint.rs b/nodedb/src/control/server/pgwire/ddl/cluster/routing_hint.rs new file mode 100644 index 00000000..4bdb57a5 --- /dev/null +++ b/nodedb/src/control/server/pgwire/ddl/cluster/routing_hint.rs @@ -0,0 +1,72 @@ +//! `SHOW ROUTING` — expose the vshard → leaseholder → node address +//! mapping so smart clients can cache it and route writes directly +//! to the leaseholder, skipping the gateway hop. +//! +//! Result columns: `vshard_id`, `group_id`, `leaseholder_node_id`, +//! `leaseholder_addr`. + +use std::sync::Arc; + +use futures::stream; +use pgwire::api::results::{DataRowEncoder, QueryResponse, Response}; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +use super::super::super::types::{int8_field, sqlstate_error, text_field}; + +/// SHOW ROUTING — full vshard → leaseholder → address table. +/// +/// Any authenticated user may call this (smart-client libs need it). 
+pub fn show_routing( + state: &SharedState, + _identity: &AuthenticatedIdentity, +) -> PgWireResult> { + let routing = match &state.cluster_routing { + Some(r) => r, + None => { + return Err(sqlstate_error( + "55000", + "cluster mode not enabled (single-node instance)", + )); + } + }; + + let schema = Arc::new(vec![ + int8_field("vshard_id"), + int8_field("group_id"), + int8_field("leaseholder_node_id"), + text_field("leaseholder_addr"), + ]); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + let rt = routing.read().unwrap_or_else(|p| p.into_inner()); + let topo_guard = state + .cluster_topology + .as_ref() + .map(|t| t.read().unwrap_or_else(|p| p.into_inner())); + + for vshard_id in 0..nodedb_cluster::routing::VSHARD_COUNT { + let group_id = rt.group_for_vshard(vshard_id).unwrap_or(0); + let leader = rt.group_info(group_id).map(|info| info.leader).unwrap_or(0); + let addr = topo_guard + .as_ref() + .and_then(|topo| topo.get_node(leader)) + .map(|n| n.addr.clone()) + .unwrap_or_default(); + + encoder.encode_field(&(vshard_id as i64))?; + encoder.encode_field(&(group_id as i64))?; + encoder.encode_field(&(leader as i64))?; + encoder.encode_field(&addr)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} diff --git a/nodedb/src/control/server/pgwire/ddl/cluster/schema_version.rs b/nodedb/src/control/server/pgwire/ddl/cluster/schema_version.rs new file mode 100644 index 00000000..9c0e9d94 --- /dev/null +++ b/nodedb/src/control/server/pgwire/ddl/cluster/schema_version.rs @@ -0,0 +1,57 @@ +//! `SHOW SCHEMA VERSION` — current descriptor version visible on +//! this node. 
+ +use std::sync::Arc; + +use futures::stream; +use pgwire::api::results::{DataRowEncoder, QueryResponse, Response}; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +use super::super::super::types::{sqlstate_error, text_field}; + +/// SHOW SCHEMA VERSION — report the current descriptor version +/// counter and per-collection metadata if available. +pub fn show_schema_version( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + if !identity.is_superuser { + return Err(sqlstate_error( + "42501", + "permission denied: only superuser can view schema version", + )); + } + + let schema = Arc::new(vec![text_field("property"), text_field("value")]); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + let version = state.schema_version.current(); + encoder.encode_field(&"schema_version")?; + encoder.encode_field(&version.to_string())?; + rows.push(Ok(encoder.take_row())); + + let applied_index = { + let cache = state + .metadata_cache + .read() + .unwrap_or_else(|p| p.into_inner()); + cache.applied_index + }; + encoder.encode_field(&"metadata_applied_index")?; + encoder.encode_field(&applied_index.to_string())?; + rows.push(Ok(encoder.take_row())); + + encoder.encode_field(&"node_id")?; + encoder.encode_field(&state.node_id.to_string())?; + rows.push(Ok(encoder.take_row())); + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} diff --git a/nodedb/src/control/server/pgwire/ddl/grant/role.rs b/nodedb/src/control/server/pgwire/ddl/grant/role.rs index cc902b36..2eef069a 100644 --- a/nodedb/src/control/server/pgwire/ddl/grant/role.rs +++ b/nodedb/src/control/server/pgwire/ddl/grant/role.rs @@ -57,6 +57,13 @@ pub fn grant_role( ) -> PgWireResult> { require_admin(identity, "grant roles")?; + if parts.len() < 5 { + return Err(sqlstate_error( + "42601", + "syntax: GRANT ROLE TO ", + 
)); + } + let role = parse_role(parts[2]); @@ -94,6 +101,13 @@ pub fn revoke_role( ) -> PgWireResult<Vec<Response>> { require_admin(identity, "revoke roles")?; + if parts.len() < 5 { + return Err(sqlstate_error( + "42601", + "syntax: REVOKE ROLE <role> FROM <username>", + )); + } + let role = parse_role(parts[2]); if !parts[3].eq_ignore_ascii_case("FROM") { diff --git a/nodedb/src/control/server/pgwire/ddl/router/admin.rs index c85cdc9a..49ab2031 100644 --- a/nodedb/src/control/server/pgwire/ddl/router/admin.rs +++ b/nodedb/src/control/server/pgwire/ddl/router/admin.rs @@ -441,6 +441,15 @@ pub(super) async fn dispatch( if upper.starts_with("REMOVE NODE ") { return Some(super::super::cluster::remove_node(state, identity, parts)); } + if upper.starts_with("SHOW RANGES") { + return Some(super::super::cluster::show_ranges(state, identity)); + } + if upper.starts_with("SHOW ROUTING") { + return Some(super::super::cluster::show_routing(state, identity)); + } + if upper.starts_with("SHOW SCHEMA VERSION") { + return Some(super::super::cluster::show_schema_version(state, identity)); + } // Introspection. if upper.starts_with("SHOW USERS") { diff --git a/nodedb/src/control/server/pgwire/ddl/router/ast.rs new file mode 100644 index 00000000..8793d904 --- /dev/null +++ b/nodedb/src/control/server/pgwire/ddl/router/ast.rs @@ -0,0 +1,231 @@ +//! AST-based DDL dispatch — typed fast path. +//! +//! Runs before the legacy string-prefix routers. Handles +//! `IF [NOT] EXISTS` at the dispatch level so individual handlers +//! don't need to check. Falls through to legacy dispatch for +//! `Other` variants and for statements where the typed path +//! delegates to the existing handler (via `raw_sql`).
+ +use pgwire::api::results::{Response, Tag}; +use pgwire::error::PgWireResult; + +use nodedb_sql::ddl_ast::NodedbStatement; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +/// Try to dispatch a parsed `NodedbStatement`. Returns `Some` if +/// fully handled, `None` if the statement should fall through to +/// the legacy dispatch. +pub(super) fn try_dispatch( + state: &SharedState, + identity: &AuthenticatedIdentity, + stmt: &NodedbStatement, +) -> Option>> { + match stmt { + // ── IF NOT EXISTS: swallow duplicate-creation errors ────── + NodedbStatement::CreateCollection { + name, + if_not_exists: true, + .. + } => { + if collection_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("CREATE COLLECTION"))])); + } + None // fall through to legacy CREATE handler + } + + NodedbStatement::CreateSequence { + name, + if_not_exists: true, + .. + } => { + if sequence_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("CREATE SEQUENCE"))])); + } + None + } + + // ── IF EXISTS: swallow not-found errors on DROP ────────── + NodedbStatement::DropCollection { + name, + if_exists: true, + } => { + if !collection_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP COLLECTION"))])); + } + None + } + + NodedbStatement::DropIndex { + if_exists: true, .. + } => None, // legacy handler has its own check + + NodedbStatement::DropTrigger { + name, + if_exists: true, + .. 
+ } => { + if !trigger_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP TRIGGER"))])); + } + None + } + + NodedbStatement::DropSchedule { + name, + if_exists: true, + } => { + if !schedule_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP SCHEDULE"))])); + } + None + } + + NodedbStatement::DropSequence { + name, + if_exists: true, + } => { + if !sequence_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP SEQUENCE"))])); + } + None + } + + NodedbStatement::DropAlert { + name, + if_exists: true, + } => { + if !alert_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP ALERT"))])); + } + None + } + + NodedbStatement::DropRetentionPolicy { + name, + if_exists: true, + } => { + if !retention_policy_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new( + "DROP RETENTION POLICY", + ))])); + } + None + } + + NodedbStatement::DropChangeStream { + name, + if_exists: true, + } => { + if !change_stream_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new( + "DROP CHANGE STREAM", + ))])); + } + None + } + + NodedbStatement::DropMaterializedView { + name, + if_exists: true, + } => { + if !materialized_view_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new( + "DROP MATERIALIZED VIEW", + ))])); + } + None + } + + NodedbStatement::DropContinuousAggregate { + name, + if_exists: true, + } => { + if !continuous_aggregate_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new( + "DROP CONTINUOUS AGGREGATE", + ))])); + } + None + } + + NodedbStatement::DropRlsPolicy { + if_exists: true, .. + } => { + // RLS policy existence check would need collection context; + // fall through to legacy handler which already handles this. + None + } + + NodedbStatement::DropConsumerGroup { + if_exists: true, .. 
+ } => None, // legacy handler + + // All other variants fall through to legacy dispatch. + _ => None, + } +} + +fn collection_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let Some(catalog) = state.credentials.catalog() else { + return false; + }; + let tid = identity.tenant_id.as_u32(); + matches!(catalog.get_collection(tid, name), Ok(Some(_))) +} + +fn trigger_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let Some(catalog) = state.credentials.catalog() else { + return false; + }; + let tid = identity.tenant_id.as_u32(); + matches!(catalog.get_trigger(tid, name), Ok(Some(_))) +} + +fn schedule_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let tid = identity.tenant_id.as_u32(); + state.schedule_registry.get(tid, name).is_some() +} + +fn sequence_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let tid = identity.tenant_id.as_u32(); + state.sequence_registry.exists(tid, name) +} + +fn alert_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let tid = identity.tenant_id.as_u32(); + state.alert_registry.get(tid, name).is_some() +} + +fn retention_policy_exists( + state: &SharedState, + identity: &AuthenticatedIdentity, + name: &str, +) -> bool { + let tid = identity.tenant_id.as_u32(); + state.retention_policy_registry.get(tid, name).is_some() +} + +fn change_stream_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let tid = identity.tenant_id.as_u32(); + state.stream_registry.get(tid, name).is_some() +} + +fn materialized_view_exists( + state: &SharedState, + identity: &AuthenticatedIdentity, + name: &str, +) -> bool { + let tid = identity.tenant_id.as_u32(); + state.mv_registry.get_def(tid, name).is_some() +} + +fn continuous_aggregate_exists( + state: &SharedState, + identity: &AuthenticatedIdentity, + name: &str, +) -> bool { + let tid = 
identity.tenant_id.as_u32(); + state.mv_registry.get_def(tid, name).is_some() +} diff --git a/nodedb/src/control/server/pgwire/ddl/router/mod.rs b/nodedb/src/control/server/pgwire/ddl/router/mod.rs index 73de64f0..905bb53f 100644 --- a/nodedb/src/control/server/pgwire/ddl/router/mod.rs +++ b/nodedb/src/control/server/pgwire/ddl/router/mod.rs @@ -1,4 +1,5 @@ mod admin; +mod ast; mod auth; mod collaborative; mod dsl; @@ -26,6 +27,18 @@ pub async fn dispatch( identity: &AuthenticatedIdentity, sql: &str, ) -> Option>> { + // AST-typed fast path: parse once, handle IF [NOT] EXISTS at the + // dispatch level, then fall through to legacy handlers for the + // actual execution. This is the incremental migration path — + // once every legacy handler has been ported to accept a typed + // NodedbStatement, the string-prefix routers below can be + // removed entirely. + if let Some(stmt) = nodedb_sql::ddl_ast::parse(sql) + && let Some(r) = ast::try_dispatch(state, identity, &stmt) + { + return Some(r); + } + let upper = sql.to_uppercase(); let parts: Vec<&str> = sql.split_whitespace().collect(); diff --git a/nodedb/src/control/server/pgwire/handler/retry.rs b/nodedb/src/control/server/pgwire/handler/retry.rs index 3e793ad9..bf536bf2 100644 --- a/nodedb/src/control/server/pgwire/handler/retry.rs +++ b/nodedb/src/control/server/pgwire/handler/retry.rs @@ -18,8 +18,8 @@ //! //! ## Retry budget //! -//! Three attempts total with 50ms, 100ms, 200ms backoff between -//! them — roughly 350ms of tolerance for a drain to complete. +//! Five attempts total with 50/100/200/400 ms backoff between +//! them — roughly 750ms of tolerance for a drain to complete. //! The `DEFAULT_DRAIN_TIMEOUT` from `metadata_proposer` is 35s, //! so in practice either drain completes within our retry budget //! (the proposer is actively draining and is probably close to @@ -31,13 +31,17 @@ use std::time::Duration; use crate::error::Error; /// Maximum number of attempts (including the initial call). 
-const MAX_ATTEMPTS: usize = 3; +const MAX_ATTEMPTS: usize = 5; /// Backoff durations BETWEEN attempts. `BACKOFFS[i]` is the sleep /// duration before attempt `i + 1`. Length must be /// `MAX_ATTEMPTS - 1`. -const BACKOFFS: [Duration; MAX_ATTEMPTS - 1] = - [Duration::from_millis(50), Duration::from_millis(100)]; +const BACKOFFS: [Duration; MAX_ATTEMPTS - 1] = [ + Duration::from_millis(50), + Duration::from_millis(100), + Duration::from_millis(200), + Duration::from_millis(400), +]; /// Run `op` up to `MAX_ATTEMPTS` times. Retries only on /// `Error::RetryableSchemaChanged`. Any other error (including diff --git a/nodedb/src/control/server/pgwire/handler/session_cmds.rs b/nodedb/src/control/server/pgwire/handler/session_cmds.rs index e3848f2e..ac7baa64 100644 --- a/nodedb/src/control/server/pgwire/handler/session_cmds.rs +++ b/nodedb/src/control/server/pgwire/handler/session_cmds.rs @@ -67,6 +67,19 @@ impl NodeDbPgHandler { } } + if key == super::super::session::read_consistency::PARAM_KEY + && super::super::session::read_consistency::parse_value(&value).is_none() + { + return Err(PgWireError::UserError(Box::new(ErrorInfo::new( + "ERROR".to_owned(), + "22023".to_owned(), + format!( + "invalid value for {}: '{value}'. 
Valid: strong, bounded_staleness:<secs>, eventual", + super::super::session::read_consistency::PARAM_KEY + ), + )))); + } + if key == "nodedb.tenant_id" && value.parse::<u32>().is_err() { + return Err(PgWireError::UserError(Box::new(ErrorInfo::new( + "ERROR".to_owned(), diff --git a/nodedb/src/control/server/pgwire/handler/sql_exec.rs index ef469e04..69936c8e 100644 --- a/nodedb/src/control/server/pgwire/handler/sql_exec.rs +++ b/nodedb/src/control/server/pgwire/handler/sql_exec.rs @@ -217,6 +217,9 @@ impl NodeDbPgHandler { && !upper.starts_with("SHOW PEER") && !upper.starts_with("SHOW NODES") && !upper.starts_with("SHOW NODE ") + && !upper.starts_with("SHOW RANGES") + && !upper.starts_with("SHOW ROUTING") + && !upper.starts_with("SHOW SCHEMA VERSION") && !upper.starts_with("SHOW COLLECTIONS") && !upper.starts_with("SHOW AUDIT") && !upper.starts_with("SHOW PERMISSIONS") @@ -283,6 +286,13 @@ impl NodeDbPgHandler { ); } + // pg_catalog virtual tables — intercept before the normal planner.
+ if let Some(result) = + super::super::pg_catalog::try_pg_catalog(&self.state, identity, &upper) + { + return result; + } + if let Some(result) = super::super::ddl::dispatch(&self.state, identity, sql_trimmed).await { return result; diff --git a/nodedb/src/control/server/pgwire/handler/transaction_cmds.rs index 63ea8026..0da593bb 100644 --- a/nodedb/src/control/server/pgwire/handler/transaction_cmds.rs +++ b/nodedb/src/control/server/pgwire/handler/transaction_cmds.rs @@ -18,6 +18,7 @@ impl NodeDbPgHandler { let next = self.state.wal.next_lsn(); crate::types::Lsn::new(next.as_u64().saturating_sub(1)) }; + crate::control::server::pgwire::session::ddl_buffer::activate(); self.sessions.begin(addr, snapshot_lsn).map_err(|msg| { PgWireError::UserError(Box::new(ErrorInfo::new( "ERROR".to_owned(), @@ -171,6 +172,35 @@ impl NodeDbPgHandler { } } + // Flush any buffered DDL entries as a single atomic batch. + if let Some(payloads) = crate::control::server::pgwire::session::ddl_buffer::take() + && !payloads.is_empty() + { + use nodedb_cluster::{MetadataEntry, encode_entry}; + let sub_entries: Vec<MetadataEntry> = payloads + .into_iter() + .map(|p| MetadataEntry::CatalogDdl { payload: p }) + .collect(); + let batch = MetadataEntry::Batch { + entries: sub_entries, + }; + if let Some(handle) = self.state.metadata_raft.get() { + let raw = encode_entry(&batch).map_err(|e| { + PgWireError::UserError(Box::new(ErrorInfo::new( + "ERROR".to_owned(), + "XX000".to_owned(), + format!("DDL batch encode: {e}"), + ))) + })?; + handle.propose(raw).map_err(|e| { + PgWireError::UserError(Box::new(ErrorInfo::new( + "ERROR".to_owned(), + "XX000".to_owned(), + format!("DDL batch propose: {e}"), + ))) + })?; + } + } // Close non-WITH-HOLD cursors on transaction end.
self.sessions.close_non_hold_cursors(addr); Ok(vec![Response::Execution(Tag::new("COMMIT"))]) @@ -182,6 +212,7 @@ impl NodeDbPgHandler { identity: &AuthenticatedIdentity, addr: &std::net::SocketAddr, ) -> PgWireResult> { + crate::control::server::pgwire::session::ddl_buffer::discard(); let reservations = self.sessions.rollback(addr).unwrap_or_default(); for handle in &reservations { let key = &handle.sequence_key; diff --git a/nodedb/src/control/server/pgwire/mod.rs b/nodedb/src/control/server/pgwire/mod.rs index 21cb8ac2..c2c90ed1 100644 --- a/nodedb/src/control/server/pgwire/mod.rs +++ b/nodedb/src/control/server/pgwire/mod.rs @@ -2,5 +2,6 @@ pub mod ddl; pub mod factory; pub mod handler; pub mod listener; +pub mod pg_catalog; pub mod session; pub mod types; diff --git a/nodedb/src/control/server/pgwire/pg_catalog/dispatch.rs b/nodedb/src/control/server/pgwire/pg_catalog/dispatch.rs new file mode 100644 index 00000000..86d481a1 --- /dev/null +++ b/nodedb/src/control/server/pgwire/pg_catalog/dispatch.rs @@ -0,0 +1,95 @@ +//! pg_catalog query interception and dispatch. + +use pgwire::api::results::Response; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +use super::tables; + +/// Try to handle a SQL query as a pg_catalog virtual-table lookup. +/// +/// Returns `Some(Ok(response))` if the query targets a known +/// pg_catalog table, `None` if the query should fall through to the +/// normal planner. The `upper` argument is the uppercased SQL. 
+pub fn try_pg_catalog(
+    state: &SharedState,
+    identity: &AuthenticatedIdentity,
+    upper: &str,
+) -> Option<PgWireResult<Vec<Response>>> {
+    let table = extract_pg_catalog_table(upper)?;
+    let result = match table {
+        "pg_database" => tables::pg_database(),
+        "pg_namespace" => tables::pg_namespace(),
+        "pg_type" => tables::pg_type(),
+        "pg_class" => tables::pg_class(state, identity),
+        "pg_attribute" => tables::pg_attribute(state, identity),
+        "pg_index" => tables::pg_index(),
+        "pg_authid" => tables::pg_authid(state, identity),
+        _ => return None,
+    };
+    Some(result)
+}
+
+/// Extract the first `pg_catalog.<table>` or bare `pg_<table>
` +/// reference from a FROM clause. Returns the lowercase table name +/// if found. +fn extract_pg_catalog_table(upper: &str) -> Option<&'static str> { + let known = [ + "pg_database", + "pg_namespace", + "pg_type", + "pg_class", + "pg_attribute", + "pg_index", + "pg_authid", + ]; + for table in &known { + let qualified = format!("PG_CATALOG.{}", table.to_uppercase()); + let bare = table.to_uppercase(); + if upper.contains(&qualified) || upper.contains(&bare) { + return Some(table); + } + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn extracts_qualified_table() { + let sql = "SELECT * FROM pg_catalog.pg_class WHERE relkind = 'r'"; + assert_eq!( + extract_pg_catalog_table(&sql.to_uppercase()), + Some("pg_class") + ); + } + + #[test] + fn extracts_bare_table() { + let sql = "SELECT oid, typname FROM pg_type"; + assert_eq!( + extract_pg_catalog_table(&sql.to_uppercase()), + Some("pg_type") + ); + } + + #[test] + fn no_match_for_regular_query() { + let sql = "SELECT * FROM users WHERE id = 1"; + assert_eq!(extract_pg_catalog_table(&sql.to_uppercase()), None); + } + + #[test] + fn handles_join_with_pg_catalog() { + let sql = + "SELECT c.oid FROM pg_class c JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid"; + assert_eq!( + extract_pg_catalog_table(&sql.to_uppercase()), + Some("pg_namespace") + ); + } +} diff --git a/nodedb/src/control/server/pgwire/pg_catalog/mod.rs b/nodedb/src/control/server/pgwire/pg_catalog/mod.rs new file mode 100644 index 00000000..01e74e69 --- /dev/null +++ b/nodedb/src/control/server/pgwire/pg_catalog/mod.rs @@ -0,0 +1,19 @@ +//! Minimal `pg_catalog` virtual-table emulation. +//! +//! Generic Postgres clients (DBeaver, pgAdmin, SQLAlchemy, psql's +//! `\dt`) issue `SELECT` queries against `pg_catalog.*` tables to +//! discover schemas, types, and tables. Without a response they +//! either error out or show an empty catalog. This module intercepts +//! 
those queries and returns rows synthesised from NodeDB's own +//! `SystemCatalog` and credential store. +//! +//! The interception is pattern-based: we extract the first +//! `pg_catalog.
<table>` (or bare `pg_<table>
`) reference from the +//! `FROM` clause and delegate to the matching virtual table handler. +//! The result always returns ALL rows with a fixed column schema — +//! clients that send `WHERE` clauses filter client-side. + +pub mod dispatch; +pub mod tables; + +pub use dispatch::try_pg_catalog; diff --git a/nodedb/src/control/server/pgwire/pg_catalog/tables.rs b/nodedb/src/control/server/pgwire/pg_catalog/tables.rs new file mode 100644 index 00000000..6e46e63e --- /dev/null +++ b/nodedb/src/control/server/pgwire/pg_catalog/tables.rs @@ -0,0 +1,270 @@ +//! Virtual table row generators for each pg_catalog table. + +use std::sync::Arc; + +use futures::stream; +use pgwire::api::results::{DataRowEncoder, QueryResponse, Response}; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::server::pgwire::types::{bool_field, int4_field, int8_field, text_field}; +use crate::control::state::SharedState; + +/// `pg_database` — one row: the current database. +pub fn pg_database() -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("datname"), + text_field("datdba"), + text_field("encoding"), + ]); + let mut encoder = DataRowEncoder::new(schema.clone()); + encoder.encode_field(&1i64)?; + encoder.encode_field(&"nodedb")?; + encoder.encode_field(&"nodedb")?; + encoder.encode_field(&"UTF8")?; + let rows = vec![Ok(encoder.take_row())]; + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_namespace` — schemas: `public` + `pg_catalog`. 
+pub fn pg_namespace() -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("nspname"), + int8_field("nspowner"), + ]); + let mut encoder = DataRowEncoder::new(schema.clone()); + let mut rows = Vec::new(); + + encoder.encode_field(&11i64)?; + encoder.encode_field(&"pg_catalog")?; + encoder.encode_field(&10i64)?; + rows.push(Ok(encoder.take_row())); + + encoder.encode_field(&2200i64)?; + encoder.encode_field(&"public")?; + encoder.encode_field(&10i64)?; + rows.push(Ok(encoder.take_row())); + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_type` — common Postgres type OIDs that client drivers need. +pub fn pg_type() -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("typname"), + int8_field("typnamespace"), + int4_field("typlen"), + text_field("typtype"), + ]); + + let types: &[(i64, &str, i32, &str)] = &[ + (16, "bool", 1, "b"), + (20, "int8", 8, "b"), + (21, "int2", 2, "b"), + (23, "int4", 4, "b"), + (25, "text", -1, "b"), + (114, "json", -1, "b"), + (700, "float4", 4, "b"), + (701, "float8", 8, "b"), + (1043, "varchar", -1, "b"), + (1082, "date", 4, "b"), + (1114, "timestamp", 8, "b"), + (1184, "timestamptz", 8, "b"), + (2950, "uuid", 16, "b"), + (3802, "jsonb", -1, "b"), + ]; + + let mut rows = Vec::with_capacity(types.len()); + let mut encoder = DataRowEncoder::new(schema.clone()); + + for &(oid, name, len, typtype) in types { + encoder.encode_field(&oid)?; + encoder.encode_field(&name)?; + encoder.encode_field(&11i64)?; + encoder.encode_field(&len)?; + encoder.encode_field(&typtype)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_class` — one row per active collection (mapped as relation). 
+pub fn pg_class( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("relname"), + int8_field("relnamespace"), + text_field("relkind"), + int8_field("relowner"), + ]); + + let collections = load_collections(state, identity); + + let mut rows = Vec::with_capacity(collections.len()); + let mut encoder = DataRowEncoder::new(schema.clone()); + + for (i, coll) in collections.iter().enumerate() { + let oid = 16384i64 + i as i64; + encoder.encode_field(&oid)?; + encoder.encode_field(&coll.name.as_str())?; + encoder.encode_field(&2200i64)?; + encoder.encode_field(&"r")?; + encoder.encode_field(&10i64)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_attribute` — one row per field in strict-schema collections. +pub fn pg_attribute( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("attrelid"), + text_field("attname"), + int8_field("atttypid"), + int4_field("attnum"), + int4_field("attlen"), + bool_field("attnotnull"), + ]); + + let collections = load_collections(state, identity); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + for (i, coll) in collections.iter().enumerate() { + let rel_oid = 16384i64 + i as i64; + for (col_num, (field_name, field_type)) in coll.fields.iter().enumerate() { + let type_oid = field_type_to_oid(field_type); + encoder.encode_field(&rel_oid)?; + encoder.encode_field(&field_name.as_str())?; + encoder.encode_field(&type_oid)?; + encoder.encode_field(&((col_num + 1) as i32))?; + encoder.encode_field(&(-1i32))?; + encoder.encode_field(&false)?; + rows.push(Ok(encoder.take_row())); + } + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_index` — secondary indexes. 
+/// +/// Returns an empty result set with the correct schema. Structured +/// index metadata is not yet surfaced through `StoredCollection`; +/// once it is, this function will take `(state, identity)` and +/// populate rows from the catalog. +pub fn pg_index() -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("indexrelid"), + int8_field("indrelid"), + bool_field("indisunique"), + bool_field("indisprimary"), + ]); + + let rows: Vec> = Vec::new(); + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_authid` — users / roles. +pub fn pg_authid( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("rolname"), + bool_field("rolsuper"), + bool_field("rolcanlogin"), + ]); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + let users = state.credentials.list_users(); + for (i, user) in users.iter().enumerate() { + let oid = 10i64 + i as i64; + let is_super = identity.is_superuser && user == &identity.username; + encoder.encode_field(&oid)?; + encoder.encode_field(&user.as_str())?; + encoder.encode_field(&is_super)?; + encoder.encode_field(&true)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +fn load_collections( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> Vec { + let Some(catalog) = state.credentials.catalog() else { + return Vec::new(); + }; + if identity.is_superuser { + catalog + .load_all_collections() + .unwrap_or_default() + .into_iter() + .filter(|c| c.is_active) + .collect() + } else { + catalog + .load_collections_for_tenant(identity.tenant_id.as_u32()) + .unwrap_or_default() + } +} + +fn field_type_to_oid(field_type: &str) -> i64 { + match field_type.to_lowercase().as_str() { + "bool" | "boolean" => 16, + "int" | "integer" | "int4" => 23, + "bigint" | "int8" => 20, 
+        "smallint" | "int2" => 21,
+        "float" | "float4" | "real" => 700,
+        "double" | "float8" => 701,
+        "text" | "string" => 25,
+        "varchar" => 1043,
+        "json" => 114,
+        "jsonb" => 3802,
+        "uuid" => 2950,
+        "date" => 1082,
+        "timestamp" => 1114,
+        "timestamptz" => 1184,
+        _ => 25,
+    }
+}
diff --git a/nodedb/src/control/server/pgwire/session/ddl_buffer.rs b/nodedb/src/control/server/pgwire/session/ddl_buffer.rs
new file mode 100644
index 00000000..59d8ae26
--- /dev/null
+++ b/nodedb/src/control/server/pgwire/session/ddl_buffer.rs
@@ -0,0 +1,119 @@
+//! Per-session DDL transaction buffer.
+//!
+//! When a pgwire session is inside a `BEGIN` block and executes DDL
+//! statements (CREATE, DROP, ALTER), the `propose_catalog_entry`
+//! path checks this buffer. If the buffer is active (non-None), the
+//! entry is pushed into it instead of being proposed immediately.
+//!
+//! On `COMMIT`, the buffer is flushed as a single
+//! `MetadataEntry::Batch`, so either all DDL in the transaction
+//! commits atomically or none does.
+//!
+//! On `ROLLBACK`, the buffer is cleared without proposing.
+
+use std::cell::RefCell;
+
+/// Encoded DDL payloads buffered during a transaction. Each entry
+/// is a serialized `CatalogEntry` ready for
+/// `MetadataEntry::CatalogDdl { payload }`.
+pub type DdlBuffer = Vec<Vec<u8>>;
+
+thread_local! {
+    /// Thread-local flag: when `Some`, `propose_catalog_entry` pushes
+    /// into this buffer instead of proposing through raft. Set by
+    /// `activate` before DDL dispatch, cleared by `take`.
+    ///
+    /// Thread-local is safe here because pgwire DDL handlers run
+    /// synchronously via `block_in_place` — the buffer is set and
+    /// read on the same OS thread within a single handler call.
+    static ACTIVE_BUFFER: RefCell<Option<DdlBuffer>> = const { RefCell::new(None) };
+}
+
+/// Activate the DDL buffer for the current thread. Any subsequent
+/// call to `try_buffer` will push into this buffer instead of
+/// returning `None`.
+pub fn activate() {
+    ACTIVE_BUFFER.with(|b| {
+        let mut guard = b.borrow_mut();
+        if guard.is_none() {
+            *guard = Some(Vec::new());
+        }
+    });
+}
+
+/// Try to buffer a DDL payload. Returns `true` if the buffer is
+/// active and the payload was pushed. Returns `false` if no buffer
+/// is active (caller should propose normally).
+pub fn try_buffer(payload: Vec<u8>) -> bool {
+    ACTIVE_BUFFER.with(|b| {
+        let mut guard = b.borrow_mut();
+        if let Some(buf) = guard.as_mut() {
+            buf.push(payload);
+            true
+        } else {
+            false
+        }
+    })
+}
+
+/// Take the accumulated buffer contents and deactivate. Returns
+/// `None` if the buffer was never activated.
+pub fn take() -> Option<DdlBuffer> {
+    ACTIVE_BUFFER.with(|b| b.borrow_mut().take())
+}
+
+/// Deactivate and discard the buffer without returning its contents.
+pub fn discard() {
+    ACTIVE_BUFFER.with(|b| {
+        let _ = b.borrow_mut().take();
+    });
+}
+
+/// Returns `true` if a DDL buffer is currently active on this thread.
+pub fn is_active() -> bool {
+    ACTIVE_BUFFER.with(|b| b.borrow().is_some())
+}
+
+/// Number of DDL statements buffered in the current thread's
+/// active transaction. Returns 0 if no buffer is active.
+pub fn buffer_len() -> usize { + ACTIVE_BUFFER.with(|b| b.borrow().as_ref().map(|v| v.len()).unwrap_or(0)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn inactive_buffer_does_not_capture() { + discard(); // ensure clean state + assert!(!try_buffer(vec![1, 2, 3])); + assert!(!is_active()); + } + + #[test] + fn active_buffer_captures() { + activate(); + assert!(is_active()); + assert!(try_buffer(vec![1])); + assert!(try_buffer(vec![2])); + let buf = take().unwrap(); + assert_eq!(buf.len(), 2); + assert!(!is_active()); + } + + #[test] + fn discard_clears_buffer() { + activate(); + try_buffer(vec![1]); + discard(); + assert!(!is_active()); + assert!(take().is_none()); + } + + #[test] + fn take_on_inactive_returns_none() { + discard(); + assert!(take().is_none()); + } +} diff --git a/nodedb/src/control/server/pgwire/session/mod.rs b/nodedb/src/control/server/pgwire/session/mod.rs index 7ea726b5..8fad3e7a 100644 --- a/nodedb/src/control/server/pgwire/session/mod.rs +++ b/nodedb/src/control/server/pgwire/session/mod.rs @@ -1,7 +1,9 @@ mod cursor; pub mod cursor_spill; +pub mod ddl_buffer; mod live; mod params; +pub mod read_consistency; mod state; mod store; pub mod temp_tables; diff --git a/nodedb/src/control/server/pgwire/session/read_consistency.rs b/nodedb/src/control/server/pgwire/session/read_consistency.rs new file mode 100644 index 00000000..526034aa --- /dev/null +++ b/nodedb/src/control/server/pgwire/session/read_consistency.rs @@ -0,0 +1,155 @@ +//! Session-level `ReadConsistency` — wire `SET` / `SHOW` for the +//! `default_read_consistency` session parameter. +//! +//! Accepted values: +//! +//! - `'strong'` +//! - `'bounded_staleness:'` or `'bounded_staleness:s'` +//! - `'eventual'` +//! +//! The value is stored as a plain string in the session parameter +//! map. This module provides the typed parse + accessor. 
+
+use std::net::SocketAddr;
+use std::time::Duration;
+
+use crate::types::ReadConsistency;
+
+use super::store::SessionStore;
+
+/// Session parameter key.
+pub const PARAM_KEY: &str = "default_read_consistency";
+
+/// Parse a user-supplied string into a `ReadConsistency`. Returns
+/// `None` on unrecognised input so the caller can return a helpful
+/// error message.
+pub fn parse_value(value: &str) -> Option<ReadConsistency> {
+    let lower = value.trim().to_lowercase();
+    match lower.as_str() {
+        "strong" => Some(ReadConsistency::Strong),
+        "eventual" => Some(ReadConsistency::Eventual),
+        _ => {
+            let stripped = lower.strip_prefix("bounded_staleness:")?;
+            let secs_str = stripped.trim_end_matches('s').trim();
+            let secs: f64 = secs_str.parse().ok()?;
+            if secs <= 0.0 {
+                return None;
+            }
+            Some(ReadConsistency::BoundedStaleness(Duration::from_secs_f64(
+                secs,
+            )))
+        }
+    }
+}
+
+/// Format a `ReadConsistency` back into the canonical string form
+/// so `SHOW default_read_consistency` returns something parseable.
+pub fn format_value(rc: &ReadConsistency) -> String {
+    match rc {
+        ReadConsistency::Strong => "strong".to_string(),
+        ReadConsistency::Eventual => "eventual".to_string(),
+        ReadConsistency::BoundedStaleness(d) => {
+            format!("bounded_staleness:{}s", d.as_secs_f64())
+        }
+    }
+}
+
+impl SessionStore {
+    /// Resolve the effective `ReadConsistency` for a session. Falls
+    /// back to `Strong` if the parameter is unset or unparseable.
+ pub fn read_consistency(&self, addr: &SocketAddr) -> ReadConsistency { + self.get_parameter(addr, PARAM_KEY) + .and_then(|v| parse_value(&v)) + .unwrap_or_default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_strong() { + assert_eq!(parse_value("strong"), Some(ReadConsistency::Strong)); + assert_eq!(parse_value("STRONG"), Some(ReadConsistency::Strong)); + } + + #[test] + fn parse_eventual() { + assert_eq!(parse_value("eventual"), Some(ReadConsistency::Eventual)); + } + + #[test] + fn parse_bounded_staleness_seconds() { + let rc = parse_value("bounded_staleness:5").unwrap(); + assert_eq!( + rc, + ReadConsistency::BoundedStaleness(Duration::from_secs(5)) + ); + } + + #[test] + fn parse_bounded_staleness_with_s_suffix() { + let rc = parse_value("bounded_staleness:5s").unwrap(); + assert_eq!( + rc, + ReadConsistency::BoundedStaleness(Duration::from_secs(5)) + ); + } + + #[test] + fn parse_bounded_staleness_fractional() { + let rc = parse_value("bounded_staleness:0.5s").unwrap(); + assert_eq!( + rc, + ReadConsistency::BoundedStaleness(Duration::from_millis(500)) + ); + } + + #[test] + fn parse_rejects_zero_staleness() { + assert!(parse_value("bounded_staleness:0").is_none()); + } + + #[test] + fn parse_rejects_garbage() { + assert!(parse_value("foobar").is_none()); + assert!(parse_value("").is_none()); + } + + #[test] + fn format_roundtrip_strong() { + let s = format_value(&ReadConsistency::Strong); + assert_eq!(parse_value(&s), Some(ReadConsistency::Strong)); + } + + #[test] + fn format_roundtrip_bounded() { + let rc = ReadConsistency::BoundedStaleness(Duration::from_secs(10)); + let s = format_value(&rc); + assert_eq!(parse_value(&s), Some(rc)); + } + + #[test] + fn format_roundtrip_eventual() { + let s = format_value(&ReadConsistency::Eventual); + assert_eq!(parse_value(&s), Some(ReadConsistency::Eventual)); + } + + #[test] + fn session_store_defaults_to_strong() { + let store = SessionStore::new(); + let addr: SocketAddr = 
"127.0.0.1:5432".parse().unwrap(); + store.ensure_session(addr); + assert_eq!(store.read_consistency(&addr), ReadConsistency::Strong); + } + + #[test] + fn session_store_reads_set_value() { + let store = SessionStore::new(); + let addr: SocketAddr = "127.0.0.1:5432".parse().unwrap(); + store.ensure_session(addr); + store.set_parameter(&addr, PARAM_KEY.to_string(), "eventual".to_string()); + assert_eq!(store.read_consistency(&addr), ReadConsistency::Eventual); + } +} diff --git a/nodedb/tests/cluster_execute_request.rs b/nodedb/tests/cluster_execute_request.rs index bc02383c..453d2b15 100644 --- a/nodedb/tests/cluster_execute_request.rs +++ b/nodedb/tests/cluster_execute_request.rs @@ -162,8 +162,19 @@ async fn execute_request_cross_node_dispatch() { .await .expect("create collection"); - // Give the metadata applier on all nodes a moment to replicate. - tokio::time::sleep(Duration::from_millis(400)).await; + // Wait for the collection to be visible on every node. + common::cluster_harness::wait_for( + "cross_node_kv visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() >= 1) + }, + ) + .await; // Node 2 sends the request; node 1 (bootstrap leader) receives it. let sender_transport = cluster.nodes[1] diff --git a/nodedb/tests/common/cluster_harness/cluster.rs b/nodedb/tests/common/cluster_harness/cluster.rs index 5c8f7b25..ae769141 100644 --- a/nodedb/tests/common/cluster_harness/cluster.rs +++ b/nodedb/tests/common/cluster_harness/cluster.rs @@ -17,7 +17,18 @@ impl TestCluster { /// via node 1's pre-bound address. Waits until every node sees /// topology_size == 3 (10s deadline). 
pub async fn spawn_three() -> Result> { - Self::spawn_three_with_tuning(ClusterTransportTuning::default()).await + Self::spawn_three_with_tuning(ClusterTransportTuning { + // Fast health pings so the HealthMonitor re-broadcasts + // topology within ~1s if the initial join broadcast was missed. + health_ping_interval_secs: 1, + // Fast election timeouts so the metadata Raft group elects a + // leader well within the 10s convergence deadline, even under + // heavy parallel test load. + election_timeout_min_secs: 1, + election_timeout_max_secs: 2, + ..ClusterTransportTuning::default() + }) + .await } /// Spawn a 3-node cluster with a custom `ClusterTransportTuning`. @@ -29,12 +40,34 @@ impl TestCluster { ) -> Result> { let node1 = TestClusterNode::spawn_with_tuning(1, vec![], tuning.clone()).await?; - // Give node 1's transport + raft loop a moment to start - // accepting before peers dial in. - tokio::time::sleep(Duration::from_millis(200)).await; + // Wait until node 1 has bootstrapped (topology shows itself) + // before peers try to join. The old fixed 200ms sleep was too + // short under heavy host load (e.g. 500+ parallel unit tests + // sharing the same CPU pool), causing peers to dial before + // node 1's transport was ready — failing topology convergence. + let deadline = std::time::Instant::now() + Duration::from_secs(10); + while node1.topology_size() < 1 { + if std::time::Instant::now() >= deadline { + return Err("node 1 failed to bootstrap within 10s".into()); + } + tokio::time::sleep(Duration::from_millis(20)).await; + } let seeds = vec![node1.listen_addr]; let node2 = TestClusterNode::spawn_with_tuning(2, seeds.clone(), tuning.clone()).await?; + + // Wait for node 2's join to be reflected before spawning node 3. + // Under load, spawning both peers simultaneously can overwhelm the + // bootstrap leader's join handler, causing neither join to complete + // within the topology convergence deadline. 
+ let deadline = std::time::Instant::now() + Duration::from_secs(10); + while node1.topology_size() < 2 { + if std::time::Instant::now() >= deadline { + return Err("node 2 failed to join within 10s".into()); + } + tokio::time::sleep(Duration::from_millis(20)).await; + } + let node3 = TestClusterNode::spawn_with_tuning(3, seeds, tuning).await?; let cluster = Self { @@ -44,7 +77,7 @@ impl TestCluster { wait_for( "all 3 nodes report topology_size == 3", Duration::from_secs(10), - Duration::from_millis(100), + Duration::from_millis(50), || cluster.nodes.iter().all(|n| n.topology_size() == 3), ) .await; diff --git a/nodedb/tests/common/cluster_harness/node.rs b/nodedb/tests/common/cluster_harness/node.rs index db4c84af..e08461fd 100644 --- a/nodedb/tests/common/cluster_harness/node.rs +++ b/nodedb/tests/common/cluster_harness/node.rs @@ -129,6 +129,7 @@ impl TestClusterNode { &cluster_settings, transport.clone(), &data_dir_path, + &tuning, ) .await?; diff --git a/nodedb/tests/descriptor_lease_planner_integration.rs b/nodedb/tests/descriptor_lease_planner_integration.rs index a87f15a8..550434c7 100644 --- a/nodedb/tests/descriptor_lease_planner_integration.rs +++ b/nodedb/tests/descriptor_lease_planner_integration.rs @@ -16,7 +16,7 @@ use common::cluster_harness::{TestCluster, wait_for}; use nodedb_cluster::{DescriptorId, DescriptorKind}; const TENANT: u32 = 1; -const WAIT_BUDGET: Duration = Duration::from_secs(3); +const WAIT_BUDGET: Duration = Duration::from_secs(10); const POLL: Duration = Duration::from_millis(20); fn coll_id(name: &str) -> DescriptorId { diff --git a/nodedb/tests/sql_ddl_cluster.rs b/nodedb/tests/sql_ddl_cluster.rs new file mode 100644 index 00000000..640b6ae3 --- /dev/null +++ b/nodedb/tests/sql_ddl_cluster.rs @@ -0,0 +1,265 @@ +//! DDL replication correctness matrix. +//! +//! For every DDL variant that flows through the replicated metadata +//! path, this file tests: +//! +//! 1. Execute DDL on the leader → visible on every follower. +//! 2. 
Execute the inverse DDL → removal visible on every node. +//! 3. `IF NOT EXISTS` / `IF EXISTS` branches handled without error. +//! +//! Uses the 3-node `TestCluster` harness from `common/cluster_harness`. + +mod common; + +use std::time::Duration; + +use common::cluster_harness::{TestCluster, wait_for}; + +// ── Collection ─────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_drop_collection_replicates() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE COLLECTION ddl_test_coll") + .await + .expect("create"); + wait_for( + "collection visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() >= 1) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader("DROP COLLECTION ddl_test_coll") + .await + .expect("drop"); + wait_for( + "collection removed on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() == 0) + }, + ) + .await; + cluster.shutdown().await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_collection_if_not_exists() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE COLLECTION ine_coll") + .await + .expect("first create"); + wait_for( + "collection visible", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() >= 1) + }, + ) + .await; + // Second CREATE IF NOT EXISTS must succeed without error. 
+ cluster + .exec_ddl_on_any_leader("CREATE COLLECTION IF NOT EXISTS ine_coll") + .await + .expect("if not exists must not error"); + cluster.shutdown().await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_drop_collection_if_exists_missing() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + // DROP IF EXISTS on a nonexistent collection must succeed. + cluster + .exec_ddl_on_any_leader("DROP COLLECTION IF EXISTS no_such_coll") + .await + .expect("if exists on missing must not error"); + cluster.shutdown().await; +} + +// ── Sequence ───────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_drop_sequence_replicates() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE SEQUENCE ddl_test_seq START 1") + .await + .expect("create seq"); + wait_for( + "sequence visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.has_sequence(1, "ddl_test_seq")) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader("DROP SEQUENCE ddl_test_seq") + .await + .expect("drop seq"); + wait_for( + "sequence removed on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || cluster.nodes.iter().all(|n| n.sequence_count(1) == 0), + ) + .await; + cluster.shutdown().await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_sequence_if_not_exists() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE SEQUENCE ine_seq START 1") + .await + .expect("first create"); + wait_for( + "seq visible", + Duration::from_secs(10), + Duration::from_millis(50), + || cluster.nodes.iter().all(|n| n.has_sequence(1, "ine_seq")), + ) + .await; + cluster + .exec_ddl_on_any_leader("CREATE SEQUENCE IF NOT EXISTS ine_seq START 1") + .await + 
.expect("if not exists must not error"); + cluster.shutdown().await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_drop_sequence_if_exists_missing() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("DROP SEQUENCE IF EXISTS no_such_seq") + .await + .expect("if exists on missing must not error"); + cluster.shutdown().await; +} + +// ── Trigger ────────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_drop_trigger_replicates() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE COLLECTION trig_coll") + .await + .expect("create coll for trigger"); + wait_for( + "coll visible", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() >= 1) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader( + "CREATE TRIGGER ddl_test_trig AFTER INSERT ON trig_coll FOR EACH ROW BEGIN RETURN 1; END", + ) + .await + .expect("create trigger"); + wait_for( + "trigger visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.has_trigger(1, "ddl_test_trig")) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader("DROP TRIGGER ddl_test_trig ON trig_coll") + .await + .expect("drop trigger"); + wait_for( + "trigger removed on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| !n.has_trigger(1, "ddl_test_trig")) + }, + ) + .await; + cluster.shutdown().await; +} + +// ── Schedule ───────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_drop_schedule_replicates() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader( + "CREATE 
SCHEDULE ddl_test_sched CRON '0 0 * * *' AS BEGIN RETURN 1; END", + ) + .await + .expect("create schedule"); + wait_for( + "schedule visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.has_schedule(1, "ddl_test_sched")) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader("DROP SCHEDULE ddl_test_sched") + .await + .expect("drop schedule"); + wait_for( + "schedule removed on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| !n.has_schedule(1, "ddl_test_sched")) + }, + ) + .await; + cluster.shutdown().await; +}