diff --git a/Cargo.lock b/Cargo.lock index b17b653b..69390359 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3645,7 +3645,7 @@ dependencies = [ [[package]] name = "nodedb" -version = "0.0.3" +version = "0.0.4" dependencies = [ "aes-gcm", "anyhow", @@ -3735,7 +3735,7 @@ dependencies = [ [[package]] name = "nodedb-bridge" -version = "0.0.3" +version = "0.0.4" dependencies = [ "fluxbench", "libc", @@ -3747,7 +3747,7 @@ dependencies = [ [[package]] name = "nodedb-client" -version = "0.0.3" +version = "0.0.4" dependencies = [ "async-trait", "nodedb-types", @@ -3765,7 +3765,7 @@ dependencies = [ [[package]] name = "nodedb-cluster" -version = "0.0.3" +version = "0.0.4" dependencies = [ "async-trait", "crc32c", @@ -3792,7 +3792,7 @@ dependencies = [ [[package]] name = "nodedb-codec" -version = "0.0.3" +version = "0.0.4" dependencies = [ "lz4_flex 0.11.6", "pco", @@ -3808,7 +3808,7 @@ dependencies = [ [[package]] name = "nodedb-columnar" -version = "0.0.3" +version = "0.0.4" dependencies = [ "crc32c", "nodedb-codec", @@ -3825,7 +3825,7 @@ dependencies = [ [[package]] name = "nodedb-crdt" -version = "0.0.3" +version = "0.0.4" dependencies = [ "hmac 0.12.1", "loro", @@ -3838,7 +3838,7 @@ dependencies = [ [[package]] name = "nodedb-fts" -version = "0.0.3" +version = "0.0.4" dependencies = [ "icu_segmenter", "lindera", @@ -3853,7 +3853,7 @@ dependencies = [ [[package]] name = "nodedb-graph" -version = "0.0.3" +version = "0.0.4" dependencies = [ "nodedb-types", "rkyv 0.8.15", @@ -3867,7 +3867,7 @@ dependencies = [ [[package]] name = "nodedb-mem" -version = "0.0.3" +version = "0.0.4" dependencies = [ "fluxbench", "libc", @@ -3882,7 +3882,7 @@ dependencies = [ [[package]] name = "nodedb-query" -version = "0.0.3" +version = "0.0.4" dependencies = [ "nodedb-fts", "nodedb-spatial", @@ -3898,7 +3898,7 @@ dependencies = [ [[package]] name = "nodedb-raft" -version = "0.0.3" +version = "0.0.4" dependencies = [ "rand 0.9.4", "rkyv 0.8.15", @@ -3914,7 +3914,7 @@ dependencies = [ [[package]] 
name = "nodedb-spatial" -version = "0.0.3" +version = "0.0.4" dependencies = [ "h3o", "nodedb-types", @@ -3929,7 +3929,7 @@ dependencies = [ [[package]] name = "nodedb-sql" -version = "0.0.3" +version = "0.0.4" dependencies = [ "nodedb-query", "nodedb-types", @@ -3939,7 +3939,7 @@ dependencies = [ [[package]] name = "nodedb-strict" -version = "0.0.3" +version = "0.0.4" dependencies = [ "arrow", "nodedb-types", @@ -3953,7 +3953,7 @@ dependencies = [ [[package]] name = "nodedb-types" -version = "0.0.3" +version = "0.0.4" dependencies = [ "nanoid", "nodedb-codec", @@ -3972,7 +3972,7 @@ dependencies = [ [[package]] name = "nodedb-vector" -version = "0.0.3" +version = "0.0.4" dependencies = [ "libc", "memmap2", @@ -3989,7 +3989,7 @@ dependencies = [ [[package]] name = "nodedb-wal" -version = "0.0.3" +version = "0.0.4" dependencies = [ "aes-gcm", "crc32c", diff --git a/Cargo.toml b/Cargo.toml index 34822136..5d18c12c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.0.3" +version = "0.0.4" edition = "2024" rust-version = "1.94" license = "BUSL-1.1" diff --git a/nodedb-cluster/src/bootstrap/bootstrap_fn.rs b/nodedb-cluster/src/bootstrap/bootstrap_fn.rs index a09b2e2c..6bb20f2b 100644 --- a/nodedb-cluster/src/bootstrap/bootstrap_fn.rs +++ b/nodedb-cluster/src/bootstrap/bootstrap_fn.rs @@ -35,7 +35,8 @@ pub(super) fn bootstrap(config: &ClusterConfig, catalog: &ClusterCatalog) -> Res ); // Create MultiRaft with all groups (single-node, no peers). 
- let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()); + let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()) + .with_election_timeout(config.election_timeout_min, config.election_timeout_max); for group_id in routing.group_ids() { multi_raft.add_group(group_id, vec![])?; } @@ -81,6 +82,7 @@ fn generate_cluster_id() -> u64 { mod tests { use super::*; use crate::catalog::ClusterCatalog; + use std::time::Duration; fn temp_catalog() -> (tempfile::TempDir, ClusterCatalog) { let dir = tempfile::tempdir().unwrap(); @@ -102,6 +104,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), }; let state = bootstrap(&config, &catalog).unwrap(); diff --git a/nodedb-cluster/src/bootstrap/config.rs b/nodedb-cluster/src/bootstrap/config.rs index 933198c5..3b42b1ee 100644 --- a/nodedb-cluster/src/bootstrap/config.rs +++ b/nodedb-cluster/src/bootstrap/config.rs @@ -91,6 +91,10 @@ pub struct ClusterConfig { /// [`crate::spawn_swim`] after the cluster is up and feed the /// seed list from `seed_nodes`. pub swim_udp_addr: Option, + /// Raft election timeout range. Controls how long a follower waits + /// before starting an election after losing contact with the leader. + pub election_timeout_min: Duration, + pub election_timeout_max: Duration, } /// Result of cluster startup — everything needed to run the Raft loop. diff --git a/nodedb-cluster/src/bootstrap/join.rs b/nodedb-cluster/src/bootstrap/join.rs index afe6ad7a..67e79854 100644 --- a/nodedb-cluster/src/bootstrap/join.rs +++ b/nodedb-cluster/src/bootstrap/join.rs @@ -288,7 +288,8 @@ fn apply_join_response( // learners). A learner-started group boots in the `Learner` // role and will not run an election until a subsequent // `PromoteLearner` conf change is applied. 
- let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()); + let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()) + .with_election_timeout(config.election_timeout_min, config.election_timeout_max); for g in &resp.groups { let is_voter = g.members.contains(&config.node_id); let is_learner = g.learners.contains(&config.node_id); @@ -450,6 +451,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), }; let state1 = bootstrap(&config1, &catalog1).unwrap(); @@ -499,6 +502,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), }; let lifecycle = ClusterLifecycleTracker::new(); diff --git a/nodedb-cluster/src/bootstrap/probe.rs b/nodedb-cluster/src/bootstrap/probe.rs index 1688c87d..4df5838b 100644 --- a/nodedb-cluster/src/bootstrap/probe.rs +++ b/nodedb-cluster/src/bootstrap/probe.rs @@ -223,6 +223,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), } } diff --git a/nodedb-cluster/src/bootstrap/restart.rs b/nodedb-cluster/src/bootstrap/restart.rs index 3306142a..1c18186f 100644 --- a/nodedb-cluster/src/bootstrap/restart.rs +++ b/nodedb-cluster/src/bootstrap/restart.rs @@ -35,7 +35,8 @@ pub(super) fn restart( // as a learner on restart; dropping the group entirely would // leave the node permanently without any copy of it and // silently broken. 
- let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()); + let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()) + .with_election_timeout(config.election_timeout_min, config.election_timeout_max); for (group_id, info) in routing.group_members() { let is_voter = info.members.contains(&config.node_id); let is_learner = info.learners.contains(&config.node_id); @@ -91,6 +92,7 @@ mod tests { use super::super::bootstrap_fn::bootstrap; use super::*; use crate::catalog::ClusterCatalog; + use std::time::Duration; fn temp_catalog() -> (tempfile::TempDir, ClusterCatalog) { let dir = tempfile::tempdir().unwrap(); @@ -112,6 +114,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), }; // Bootstrap first. diff --git a/nodedb-cluster/src/circuit_breaker.rs b/nodedb-cluster/src/circuit_breaker.rs index 3b02d4e5..3c5992f1 100644 --- a/nodedb-cluster/src/circuit_breaker.rs +++ b/nodedb-cluster/src/circuit_breaker.rs @@ -145,6 +145,26 @@ impl CircuitBreaker { .unwrap_or(CircuitState::Closed) } + /// Return the ids of every peer whose breaker is currently Open. + /// + /// Used by the reachability driver to find peers that need an + /// active probe — without a periodic poke these peers never + /// transition back to HalfOpen (no traffic → no `check()` call + /// → no cooldown re-evaluation). + pub fn open_peers(&self) -> Vec { + let peers = self.peers.read().unwrap_or_else(|p| p.into_inner()); + peers + .iter() + .filter_map(|(id, b)| { + if b.state == CircuitState::Open { + Some(*id) + } else { + None + } + }) + .collect() + } + /// Get consecutive failure count for a peer. 
pub fn failure_count(&self, peer: u64) -> u32 { let peers = self.peers.read().unwrap_or_else(|p| p.into_inner()); diff --git a/nodedb-cluster/src/closed_timestamp.rs b/nodedb-cluster/src/closed_timestamp.rs new file mode 100644 index 00000000..550fb9f2 --- /dev/null +++ b/nodedb-cluster/src/closed_timestamp.rs @@ -0,0 +1,123 @@ +//! Per-group closed-timestamp tracker. +//! +//! Every time a Raft group applies a committed entry, the applier +//! records the wall-clock instant as that group's "closed timestamp". +//! A follower whose closed timestamp for a group is within the +//! caller's staleness bound can serve reads locally — no gateway hop +//! to the leader. +//! +//! The tracker is intentionally simple: one `Instant` per group, +//! updated monotonically. There is no HLC or cross-node coordination +//! here — the closed timestamp is local to this node. Safety comes +//! from the fact that a follower's applied index can only advance +//! (Raft guarantees), so a read served at a given closed timestamp +//! sees a consistent prefix of the log. + +use std::collections::HashMap; +use std::sync::RwLock; +use std::time::{Duration, Instant}; + +/// Tracks the most recent apply instant per Raft group. +pub struct ClosedTimestampTracker { + groups: RwLock>, +} + +impl ClosedTimestampTracker { + pub fn new() -> Self { + Self { + groups: RwLock::new(HashMap::new()), + } + } + + /// Record that `group_id` just applied one or more entries. + /// Called by the raft-loop applier after each apply batch. + pub fn mark_applied(&self, group_id: u64) { + let mut g = self.groups.write().unwrap_or_else(|p| p.into_inner()); + g.insert(group_id, Instant::now()); + } + + /// Record that `group_id` just applied, using a caller-supplied + /// instant. Exposed for deterministic testing with paused time. 
+ pub fn mark_applied_at(&self, group_id: u64, at: Instant) { + let mut g = self.groups.write().unwrap_or_else(|p| p.into_inner()); + g.insert(group_id, at); + } + + /// Check whether this node's replica of `group_id` has applied + /// recently enough that a read with `max_staleness` can be + /// served locally. + /// + /// Returns `false` if the group has never applied on this node + /// (no closed timestamp recorded). + pub fn is_fresh_enough(&self, group_id: u64, max_staleness: Duration) -> bool { + let g = self.groups.read().unwrap_or_else(|p| p.into_inner()); + match g.get(&group_id) { + Some(last) => last.elapsed() <= max_staleness, + None => false, + } + } + + /// Return the age of the closed timestamp for a group, or `None` + /// if the group has never applied on this node. Useful for + /// observability (metrics, SHOW commands). + pub fn staleness(&self, group_id: u64) -> Option { + let g = self.groups.read().unwrap_or_else(|p| p.into_inner()); + g.get(&group_id).map(|last| last.elapsed()) + } +} + +impl Default for ClosedTimestampTracker { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn unknown_group_is_not_fresh() { + let tracker = ClosedTimestampTracker::new(); + assert!(!tracker.is_fresh_enough(99, Duration::from_secs(10))); + } + + #[test] + fn recently_applied_is_fresh() { + let tracker = ClosedTimestampTracker::new(); + tracker.mark_applied(1); + assert!(tracker.is_fresh_enough(1, Duration::from_secs(5))); + } + + #[test] + fn stale_group_is_not_fresh() { + let tracker = ClosedTimestampTracker::new(); + let old = Instant::now() - Duration::from_secs(30); + tracker.mark_applied_at(1, old); + assert!(!tracker.is_fresh_enough(1, Duration::from_secs(5))); + } + + #[test] + fn staleness_returns_none_for_unknown() { + let tracker = ClosedTimestampTracker::new(); + assert!(tracker.staleness(42).is_none()); + } + + #[test] + fn staleness_returns_age_for_known() { + let tracker = 
ClosedTimestampTracker::new(); + tracker.mark_applied(1); + let s = tracker.staleness(1).unwrap(); + assert!(s < Duration::from_millis(100)); + } + + #[test] + fn mark_applied_updates_monotonically() { + let tracker = ClosedTimestampTracker::new(); + let old = Instant::now() - Duration::from_secs(10); + tracker.mark_applied_at(1, old); + assert!(!tracker.is_fresh_enough(1, Duration::from_secs(5))); + tracker.mark_applied(1); + assert!(tracker.is_fresh_enough(1, Duration::from_secs(5))); + } +} diff --git a/nodedb-cluster/src/decommission/coordinator.rs b/nodedb-cluster/src/decommission/coordinator.rs new file mode 100644 index 00000000..4f62c4aa --- /dev/null +++ b/nodedb-cluster/src/decommission/coordinator.rs @@ -0,0 +1,222 @@ +//! `DecommissionCoordinator` — drives a [`DecommissionPlan`] through +//! the metadata Raft group one entry at a time. +//! +//! The coordinator is a stateless-looking actor: it owns the plan, +//! a [`MetadataProposer`] (the injection seam for tests and for +//! whichever Raft driver is wired up at runtime), and an index +//! counter. On every call to [`DecommissionCoordinator::run`] it +//! proposes each entry in order, waiting for each to commit before +//! advancing. A proposer failure aborts the run at the failed step — +//! the caller can retry by constructing a fresh coordinator from +//! the same plan, because every step is idempotent at the metadata +//! layer (the cache and live-state appliers skip already-applied +//! indexes). +//! +//! The coordinator does not own a timer or a shutdown channel — it +//! is a one-shot sequence. Higher-level supervisors handle retries +//! and cancellation. + +use async_trait::async_trait; +use tracing::{debug, info}; + +use crate::error::Result; +use crate::metadata_group::MetadataEntry; + +use super::flow::DecommissionPlan; + +/// Injection seam: proposes a single metadata entry through the +/// metadata Raft group and waits for it to commit. 
Returns the +/// applied index on success so the coordinator can tell it apart +/// from older commits. +#[async_trait] +pub trait MetadataProposer: Send + Sync { + async fn propose_and_wait(&self, entry: MetadataEntry) -> Result; +} + +// Blanket impl so callers can pass `Arc` wherever a `MetadataProposer` +// is required without having to write a forwarding impl for every +// wrapper type. Defined here (rather than in the consumer crate) to +// avoid orphan-rule issues for downstream test impls. +#[async_trait] +impl MetadataProposer for std::sync::Arc { + async fn propose_and_wait(&self, entry: MetadataEntry) -> Result { + (**self).propose_and_wait(entry).await + } +} + +/// Drives a [`DecommissionPlan`] to completion. +pub struct DecommissionCoordinator { + plan: DecommissionPlan, + proposer: P, +} + +/// Outcome of a successful coordinator run. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DecommissionRunResult { + pub node_id: u64, + pub entries_committed: usize, + pub last_applied_index: u64, +} + +impl DecommissionCoordinator

{ + pub fn new(plan: DecommissionPlan, proposer: P) -> Self { + Self { plan, proposer } + } + + /// Propose every entry in the plan sequentially, waiting for + /// each commit. Returns the total number of entries committed + /// and the final applied index. + pub async fn run(self) -> Result { + let node_id = self.plan.node_id; + let total = self.plan.entries.len(); + info!(node_id, steps = total, "decommission coordinator starting"); + let mut last_applied = 0u64; + for (step, entry) in self.plan.entries.into_iter().enumerate() { + debug!(node_id, step, total, "proposing decommission entry"); + last_applied = self.proposer.propose_and_wait(entry).await?; + } + info!( + node_id, + entries_committed = total, + last_applied, + "decommission coordinator finished" + ); + Ok(DecommissionRunResult { + node_id, + entries_committed: total, + last_applied_index: last_applied, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::decommission::flow::plan_full_decommission; + use crate::error::ClusterError; + use crate::metadata_group::{RoutingChange, TopologyChange}; + use crate::routing::RoutingTable; + use crate::topology::{ClusterTopology, NodeInfo, NodeState}; + use std::net::SocketAddr; + use std::sync::atomic::{AtomicU64, Ordering}; + use std::sync::{Arc, Mutex}; + + struct RecordingProposer { + committed: Mutex>, + counter: AtomicU64, + } + + impl RecordingProposer { + fn new() -> Arc { + Arc::new(Self { + committed: Mutex::new(Vec::new()), + counter: AtomicU64::new(0), + }) + } + } + + #[async_trait] + impl MetadataProposer for RecordingProposer { + async fn propose_and_wait(&self, entry: MetadataEntry) -> Result { + let idx = self.counter.fetch_add(1, Ordering::SeqCst) + 1; + self.committed.lock().unwrap().push(entry); + Ok(idx) + } + } + + struct FailingProposer { + fail_after: usize, + counter: AtomicU64, + } + + #[async_trait] + impl MetadataProposer for FailingProposer { + async fn propose_and_wait(&self, _entry: MetadataEntry) -> Result { + let 
n = self.counter.fetch_add(1, Ordering::SeqCst); + if n as usize >= self.fail_after { + return Err(ClusterError::Transport { + detail: "injected failure".into(), + }); + } + Ok(n + 1) + } + } + + fn three_node_plan() -> DecommissionPlan { + let mut t = ClusterTopology::new(); + for (i, id) in [1u64, 2, 3].iter().enumerate() { + let a: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + let routing = RoutingTable::uniform(2, &[1, 2, 3], 3); + plan_full_decommission(1, &t, &routing, 2).unwrap() + } + + #[tokio::test] + async fn coordinator_proposes_every_entry_in_order() { + let plan = three_node_plan(); + let expected = plan.entries.clone(); + let proposer = RecordingProposer::new(); + let coord = DecommissionCoordinator::new(plan, proposer.clone()); + let result = coord.run().await.unwrap(); + + assert_eq!(result.node_id, 1); + assert_eq!(result.entries_committed, expected.len()); + let committed = proposer.committed.lock().unwrap().clone(); + assert_eq!(committed, expected); + } + + #[tokio::test] + async fn coordinator_aborts_on_proposer_error() { + let plan = three_node_plan(); + let proposer = FailingProposer { + fail_after: 2, + counter: AtomicU64::new(0), + }; + let coord = DecommissionCoordinator::new(plan, proposer); + let err = coord.run().await.unwrap_err(); + assert!(err.to_string().contains("injected failure")); + } + + #[tokio::test] + async fn coordinator_reports_last_applied_index() { + let plan = three_node_plan(); + let proposer = RecordingProposer::new(); + let coord = DecommissionCoordinator::new(plan, proposer.clone()); + let result = coord.run().await.unwrap(); + // The recording proposer returns monotonically increasing + // indexes starting from 1; the last one equals the total + // entry count. 
+ assert_eq!(result.last_applied_index, result.entries_committed as u64); + } + + /// Sanity: the plan's shape is preserved end to end — the + /// recording proposer sees the same `StartDecommission` / + /// `FinishDecommission` / `Leave` bookends. + #[tokio::test] + async fn coordinator_preserves_bookends() { + let plan = three_node_plan(); + let proposer = RecordingProposer::new(); + let coord = DecommissionCoordinator::new(plan, proposer.clone()); + coord.run().await.unwrap(); + + let committed = proposer.committed.lock().unwrap().clone(); + assert!(matches!( + committed.first(), + Some(MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id: 1 } + )) + )); + assert!(matches!( + committed.last(), + Some(MetadataEntry::TopologyChange(TopologyChange::Leave { + node_id: 1 + })) + )); + // At least one RemoveMember for the target. + assert!(committed.iter().any(|e| matches!( + e, + MetadataEntry::RoutingChange(RoutingChange::RemoveMember { node_id: 1, .. }) + ))); + } +} diff --git a/nodedb-cluster/src/decommission/flow.rs b/nodedb-cluster/src/decommission/flow.rs new file mode 100644 index 00000000..aa2c30c3 --- /dev/null +++ b/nodedb-cluster/src/decommission/flow.rs @@ -0,0 +1,227 @@ +//! Decommission flow — emit the full ordered sequence of metadata +//! entries that move a node from `Active` to fully removed. +//! +//! [`plan_full_decommission`] is pure: given a snapshot of topology +//! and routing, it returns the exact list of +//! [`MetadataEntry`](crate::metadata_group::MetadataEntry) values the +//! coordinator will propose through the metadata Raft group, in the +//! order they must commit. The flow is deterministic — two nodes +//! looking at the same snapshot produce byte-identical plans, which +//! means a failed coordinator can be resumed from any consistent +//! snapshot without needing per-plan state to be replicated. 
+ +use crate::error::Result; +use crate::metadata_group::{MetadataEntry, RoutingChange, TopologyChange}; +use crate::routing::RoutingTable; +use crate::topology::ClusterTopology; + +use super::safety::check_can_decommission; + +/// Output of [`plan_full_decommission`] — the caller proposes +/// `entries` in order, waiting for each to commit before moving on. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DecommissionPlan { + pub node_id: u64, + pub entries: Vec, +} + +/// Build the complete decommission plan for `node_id`. +/// +/// Steps (in the order they appear in the returned `entries`): +/// +/// 1. `TopologyChange::StartDecommission` — flip the target to +/// `Draining`. +/// 2. `RoutingChange::LeadershipTransfer` — for every group the +/// target currently leads, hand leadership to another voter. +/// 3. `RoutingChange::RemoveMember` — strip the target out of every +/// group's member (and learner) list. +/// 4. `TopologyChange::FinishDecommission` — flip the target to +/// `Decommissioned`. +/// 5. `TopologyChange::Leave` — remove the target from topology +/// entirely so future peer lookups return `NodeNotFound`. +/// +/// The safety gate in [`check_can_decommission`] runs first and +/// returns an error without producing a plan if any group would drop +/// below the configured replication factor. +pub fn plan_full_decommission( + node_id: u64, + topology: &ClusterTopology, + routing: &RoutingTable, + replication_factor: usize, +) -> Result { + check_can_decommission(node_id, topology, routing, replication_factor)?; + + let mut entries = Vec::new(); + entries.push(MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id }, + )); + + // Collect a stable, sorted group_id ordering so the plan is + // reproducible across HashMap iterations. 
+ let mut group_ids: Vec = routing + .group_members() + .iter() + .filter(|(_, info)| info.members.contains(&node_id) || info.learners.contains(&node_id)) + .map(|(gid, _)| *gid) + .collect(); + group_ids.sort_unstable(); + + // 2. Leadership transfers for every group the target currently leads. + for gid in &group_ids { + let info = routing + .group_info(*gid) + .expect("group id came from routing snapshot"); + if info.leader != node_id { + continue; + } + if let Some(&new_leader) = info.members.iter().find(|&&m| m != node_id) { + entries.push(MetadataEntry::RoutingChange( + RoutingChange::LeadershipTransfer { + group_id: *gid, + new_leader_node_id: new_leader, + }, + )); + } + } + + // 3. Remove the target from every group's member and learner sets. + for gid in &group_ids { + entries.push(MetadataEntry::RoutingChange(RoutingChange::RemoveMember { + group_id: *gid, + node_id, + })); + } + + // 4. Finish decommission (topology state → Decommissioned). + entries.push(MetadataEntry::TopologyChange( + TopologyChange::FinishDecommission { node_id }, + )); + + // 5. Leave — remove from topology entirely. + entries.push(MetadataEntry::TopologyChange(TopologyChange::Leave { + node_id, + })); + + Ok(DecommissionPlan { node_id, entries }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::{NodeInfo, NodeState}; + use std::net::SocketAddr; + + fn topo(nodes: &[u64]) -> ClusterTopology { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let addr: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, addr, NodeState::Active)); + } + t + } + + #[test] + fn plan_shape_matches_spec() { + let t = topo(&[1, 2, 3]); + // 2 groups, RF=3 (each group has all 3 nodes). Decommission + // 1 with RF=2 (the surviving quorum). 
+ let routing = RoutingTable::uniform(2, &[1, 2, 3], 3); + let plan = plan_full_decommission(1, &t, &routing, 2).unwrap(); + assert_eq!(plan.node_id, 1); + + // First entry: StartDecommission. + assert!(matches!( + plan.entries.first(), + Some(MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id: 1 } + )) + )); + + // Last two entries: FinishDecommission, Leave. + let n = plan.entries.len(); + assert!(matches!( + plan.entries[n - 2], + MetadataEntry::TopologyChange(TopologyChange::FinishDecommission { node_id: 1 }) + )); + assert!(matches!( + plan.entries[n - 1], + MetadataEntry::TopologyChange(TopologyChange::Leave { node_id: 1 }) + )); + + // Every group the target is in must get a RemoveMember. + let remove_count = plan + .entries + .iter() + .filter(|e| { + matches!( + e, + MetadataEntry::RoutingChange(RoutingChange::RemoveMember { node_id: 1, .. }) + ) + }) + .count(); + assert_eq!(remove_count, 2); + } + + #[test] + fn plan_emits_leadership_transfer_when_target_leads() { + let t = topo(&[1, 2, 3]); + let mut routing = RoutingTable::uniform(2, &[1, 2, 3], 3); + routing.set_leader(0, 1); + routing.set_leader(1, 2); + let plan = plan_full_decommission(1, &t, &routing, 2).unwrap(); + // Exactly one LeadershipTransfer for group 0. 
+ let transfers: Vec<_> = plan + .entries + .iter() + .filter_map(|e| match e { + MetadataEntry::RoutingChange(RoutingChange::LeadershipTransfer { + group_id, + new_leader_node_id, + }) => Some((*group_id, *new_leader_node_id)), + _ => None, + }) + .collect(); + assert_eq!(transfers.len(), 1); + assert_eq!(transfers[0].0, 0); + assert_ne!(transfers[0].1, 1, "new leader must not be the target"); + } + + #[test] + fn plan_is_deterministic() { + let t = topo(&[1, 2, 3]); + let routing = RoutingTable::uniform(4, &[1, 2, 3], 3); + let p1 = plan_full_decommission(2, &t, &routing, 2).unwrap(); + let p2 = plan_full_decommission(2, &t, &routing, 2).unwrap(); + assert_eq!(p1.entries, p2.entries); + } + + #[test] + fn plan_rejected_when_safety_fails() { + let t = topo(&[1, 2]); + let routing = RoutingTable::uniform(2, &[1, 2], 2); + let err = plan_full_decommission(1, &t, &routing, 2).unwrap_err(); + assert!(err.to_string().contains("replication factor")); + } + + #[test] + fn plan_skips_groups_target_is_not_in() { + let t = topo(&[1, 2, 3]); + let mut routing = RoutingTable::uniform(4, &[1, 2, 3], 3); + routing.set_group_members(0, vec![2, 3]); + routing.set_group_members(1, vec![2, 3]); + routing.set_group_members(2, vec![1, 2, 3]); + routing.set_group_members(3, vec![1, 2, 3]); + let plan = plan_full_decommission(1, &t, &routing, 2).unwrap(); + let removes: Vec = plan + .entries + .iter() + .filter_map(|e| match e { + MetadataEntry::RoutingChange(RoutingChange::RemoveMember { group_id, .. }) => { + Some(*group_id) + } + _ => None, + }) + .collect(); + assert_eq!(removes, vec![2, 3]); + } +} diff --git a/nodedb-cluster/src/decommission/mod.rs b/nodedb-cluster/src/decommission/mod.rs new file mode 100644 index 00000000..3b0bd5c4 --- /dev/null +++ b/nodedb-cluster/src/decommission/mod.rs @@ -0,0 +1,35 @@ +//! Decommission flow — graceful removal of a node from the cluster. +//! +//! Decommission is a multi-step, metadata-raft-replicated process: +//! +//! 1. 
**Safety gate** — [`safety::check_can_decommission`] refuses the +//! decommission if any Raft group the target is in would drop below +//! the configured replication factor after its removal. This is +//! the only correctness-critical check — once it passes, every +//! subsequent step is just routing/topology bookkeeping. +//! 2. **Plan** — [`flow::plan_full_decommission`] emits the full ordered +//! sequence of [`MetadataEntry`](crate::metadata_group::MetadataEntry) +//! values the coordinator will propose: `StartDecommission`, any +//! required leadership transfers, a `RemoveMember` per group, then +//! `FinishDecommission` and `Leave`. +//! 3. **Propose** (future batch: `coordinator.rs`) — stateful actor +//! proposes each entry in order through a `MetadataProposer` trait, +//! waiting for the applied index to advance past each commit before +//! advancing its own state. +//! 4. **Observe** (future batch: `observer.rs`) — the target node +//! watches its own topology state and fires a cooperative shutdown +//! signal when it transitions to `Decommissioned`. +//! +//! This sub-batch ships steps 1 and 2 as pure, side-effect-free +//! functions so the flow can be exhaustively unit-tested before the +//! stateful coordinator is wired up. + +pub mod coordinator; +pub mod flow; +pub mod observer; +pub mod safety; + +pub use coordinator::{DecommissionCoordinator, DecommissionRunResult, MetadataProposer}; +pub use flow::{DecommissionPlan, plan_full_decommission}; +pub use observer::DecommissionObserver; +pub use safety::{DecommissionSafetyError, check_can_decommission}; diff --git a/nodedb-cluster/src/decommission/observer.rs b/nodedb-cluster/src/decommission/observer.rs new file mode 100644 index 00000000..d3034c80 --- /dev/null +++ b/nodedb-cluster/src/decommission/observer.rs @@ -0,0 +1,196 @@ +//! `DecommissionObserver` — local-node self-shutdown signal. +//! +//! The coordinator proposes a full decommission plan through the +//! metadata Raft group. 
Every node (including the target itself) +//! applies the resulting entries through `CacheApplier`, which, when +//! attached with [`CacheApplier::with_live_state`](crate::metadata_group::CacheApplier::with_live_state), +//! cascades topology state transitions into the live +//! `Arc>` handle. +//! +//! The observer polls that handle for the *local* node id. Once the +//! node's own state reaches `Decommissioned` — or the node has been +//! removed from topology entirely by a committed `Leave` — the +//! observer flips a `tokio::sync::watch` channel to `true`, which is +//! the cooperative shutdown signal every long-lived background task +//! on this node is already listening on. +//! +//! This is the last link in the decommission chain: once the watch +//! is flipped, the raft loops, SWIM detector, reachability driver, +//! and transport accept loops all drain and exit on their own. + +use std::sync::{Arc, RwLock}; +use std::time::Duration; + +use tokio::sync::watch; +use tokio::time::interval; +use tracing::{info, warn}; + +use crate::topology::{ClusterTopology, NodeState}; + +/// Periodically checks the local node's topology state and fires a +/// shutdown signal on `Decommissioned` or removal. +pub struct DecommissionObserver { + topology: Arc>, + local_node_id: u64, + shutdown_tx: watch::Sender, + poll_interval: Duration, +} + +impl DecommissionObserver { + /// Build an observer and return it alongside the receiver half of + /// its shutdown watch channel. Every subsystem that wants to + /// cooperatively drain on decommission can call + /// [`watch::Receiver::clone`] on the returned receiver. + pub fn new( + topology: Arc>, + local_node_id: u64, + poll_interval: Duration, + ) -> (Self, watch::Receiver) { + let (shutdown_tx, shutdown_rx) = watch::channel(false); + ( + Self { + topology, + local_node_id, + shutdown_tx, + poll_interval, + }, + shutdown_rx, + ) + } + + /// Single check. 
Returns `true` iff the observer fired the + /// shutdown signal during this call (or had already fired it + /// previously — the watch is level-triggered, not edge). + pub fn check_once(&self) -> bool { + if *self.shutdown_tx.borrow() { + return true; + } + let topo = self.topology.read().unwrap_or_else(|p| p.into_inner()); + let should_fire = match topo.get_node(self.local_node_id) { + Some(node) => node.state == NodeState::Decommissioned, + // Node is gone from topology — either a committed `Leave` + // (post-decommission) or manual removal. Either way, we + // are no longer part of the cluster. + None => true, + }; + if should_fire { + info!( + local_node_id = self.local_node_id, + "decommission observer firing local shutdown signal" + ); + if let Err(e) = self.shutdown_tx.send(true) { + warn!(error = %e, "shutdown watch receivers all dropped"); + } + return true; + } + false + } + + /// Run the observer's poll loop until `cancel` flips to `true`. + /// Exits immediately after firing its own shutdown signal — + /// there is nothing more to watch. + pub async fn run(self, mut cancel: watch::Receiver) { + let mut tick = interval(self.poll_interval); + tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + loop { + tokio::select! 
{ + biased; + changed = cancel.changed() => { + if changed.is_ok() && *cancel.borrow() { + return; + } + } + _ = tick.tick() => { + if self.check_once() { + return; + } + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::NodeInfo; + use std::net::SocketAddr; + + fn topo_with(node_id: u64, state: NodeState) -> Arc> { + let mut t = ClusterTopology::new(); + let addr: SocketAddr = "127.0.0.1:9000".parse().unwrap(); + t.add_node(NodeInfo::new(node_id, addr, state)); + Arc::new(RwLock::new(t)) + } + + #[test] + fn check_once_does_not_fire_while_active() { + let topo = topo_with(5, NodeState::Active); + let (obs, _rx) = DecommissionObserver::new(topo, 5, Duration::from_millis(10)); + assert!(!obs.check_once()); + } + + #[test] + fn check_once_fires_on_decommissioned_state() { + let topo = topo_with(5, NodeState::Active); + let (obs, mut rx) = DecommissionObserver::new(topo.clone(), 5, Duration::from_millis(10)); + assert!(!obs.check_once()); + topo.write() + .unwrap() + .set_state(5, NodeState::Decommissioned); + assert!(obs.check_once()); + assert!(*rx.borrow_and_update()); + } + + #[test] + fn check_once_fires_when_node_removed_from_topology() { + let topo = topo_with(5, NodeState::Active); + let (obs, _rx) = DecommissionObserver::new(topo.clone(), 5, Duration::from_millis(10)); + topo.write().unwrap().remove_node(5); + assert!(obs.check_once()); + } + + #[test] + fn check_once_is_idempotent_after_firing() { + let topo = topo_with(5, NodeState::Decommissioned); + let (obs, _rx) = DecommissionObserver::new(topo, 5, Duration::from_millis(10)); + assert!(obs.check_once()); + // Second call sees the fired signal and reports true again. 
+ assert!(obs.check_once()); + } + + #[tokio::test(start_paused = true)] + async fn run_loop_fires_shutdown_and_exits() { + let topo = topo_with(5, NodeState::Active); + let (obs, mut rx) = DecommissionObserver::new(topo.clone(), 5, Duration::from_millis(50)); + let (_cancel_tx, cancel_rx) = watch::channel(false); + let handle = tokio::spawn(async move { obs.run(cancel_rx).await }); + + // Advance twice — first tick = no-op, then flip state. + tokio::time::advance(Duration::from_millis(60)).await; + tokio::task::yield_now().await; + topo.write() + .unwrap() + .set_state(5, NodeState::Decommissioned); + tokio::time::advance(Duration::from_millis(60)).await; + tokio::task::yield_now().await; + + let _ = tokio::time::timeout(Duration::from_millis(500), handle) + .await + .expect("observer run loop did not exit"); + assert!(*rx.borrow_and_update()); + } + + #[tokio::test(start_paused = true)] + async fn run_loop_exits_on_cancel_without_firing() { + let topo = topo_with(5, NodeState::Active); + let (obs, rx) = DecommissionObserver::new(topo, 5, Duration::from_millis(50)); + let (cancel_tx, cancel_rx) = watch::channel(false); + let handle = tokio::spawn(async move { obs.run(cancel_rx).await }); + let _ = cancel_tx.send(true); + let _ = tokio::time::timeout(Duration::from_millis(500), handle) + .await + .expect("cancel did not end run loop"); + assert!(!*rx.borrow()); + } +} diff --git a/nodedb-cluster/src/decommission/safety.rs b/nodedb-cluster/src/decommission/safety.rs new file mode 100644 index 00000000..91533a34 --- /dev/null +++ b/nodedb-cluster/src/decommission/safety.rs @@ -0,0 +1,172 @@ +//! Decommission safety gate. +//! +//! Before the coordinator proposes a single metadata entry, it must +//! prove that removing the target node from every Raft group it +//! belongs to will leave each group with at least `replication_factor` +//! voting members. Dropping below RF silently is a data-loss bug — +//! this module is the only place that decision is made. 
+ +use crate::error::{ClusterError, Result}; +use crate::routing::RoutingTable; +use crate::topology::{ClusterTopology, NodeState}; + +/// Why a decommission request was rejected. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum DecommissionSafetyError { + /// The target node id does not exist in the topology. + NodeNotFound { node_id: u64 }, + /// The node is already past the point of decommission. + AlreadyDecommissioned { node_id: u64 }, + /// Removing the node would leave this group below `replication_factor` + /// voters. The decommission must wait until a new voter has been + /// added to the group (via rebalance / migration executor). + WouldViolateReplicationFactor { + node_id: u64, + group_id: u64, + current_voters: usize, + replication_factor: usize, + }, +} + +impl std::fmt::Display for DecommissionSafetyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NodeNotFound { node_id } => { + write!(f, "node {node_id} not found in topology") + } + Self::AlreadyDecommissioned { node_id } => { + write!(f, "node {node_id} is already decommissioned") + } + Self::WouldViolateReplicationFactor { + node_id, + group_id, + current_voters, + replication_factor, + } => write!( + f, + "removing node {node_id} from group {group_id} \ + would leave {} voter(s), below replication factor {replication_factor}", + current_voters.saturating_sub(1) + ), + } + } +} + +impl std::error::Error for DecommissionSafetyError {} + +impl From for ClusterError { + fn from(value: DecommissionSafetyError) -> Self { + ClusterError::Transport { + detail: value.to_string(), + } + } +} + +/// Verify that node `node_id` can be safely stripped out of every +/// group it participates in without dropping any group below +/// `replication_factor` voters. +/// +/// This check is purely structural — it looks at the current routing +/// table, not the live cluster. 
Callers must re-run it immediately +/// before proposing each step if the topology may have shifted since +/// the plan was computed. +pub fn check_can_decommission( + node_id: u64, + topology: &ClusterTopology, + routing: &RoutingTable, + replication_factor: usize, +) -> Result<()> { + let node = topology + .get_node(node_id) + .ok_or(DecommissionSafetyError::NodeNotFound { node_id })?; + + if node.state == NodeState::Decommissioned { + return Err(DecommissionSafetyError::AlreadyDecommissioned { node_id }.into()); + } + + for (group_id, info) in routing.group_members() { + if !info.members.contains(&node_id) { + continue; + } + let current_voters = info.members.len(); + // After removal the group would have `current_voters - 1` + // voters. Require that to be at least `replication_factor`. + if current_voters.saturating_sub(1) < replication_factor { + return Err(DecommissionSafetyError::WouldViolateReplicationFactor { + node_id, + group_id: *group_id, + current_voters, + replication_factor, + } + .into()); + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::NodeInfo; + use std::net::SocketAddr; + + fn topo(nodes: &[u64]) -> ClusterTopology { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let addr: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, addr, NodeState::Active)); + } + t + } + + #[test] + fn rejects_unknown_node() { + let t = topo(&[1, 2, 3]); + let r = RoutingTable::uniform(2, &[1, 2, 3], 3); + let err = check_can_decommission(99, &t, &r, 2).unwrap_err(); + assert!(err.to_string().contains("99")); + } + + #[test] + fn rejects_already_decommissioned() { + let mut t = topo(&[1, 2, 3]); + t.set_state(1, NodeState::Decommissioned); + let r = RoutingTable::uniform(2, &[1, 2, 3], 3); + let err = check_can_decommission(1, &t, &r, 2).unwrap_err(); + assert!(err.to_string().contains("already decommissioned")); + } + + #[test] + fn 
rejects_when_rf_would_be_violated() { + let t = topo(&[1, 2]); + // RF=2 with only 2 nodes → every group has exactly 2 voters. + // Removing either one would leave 1 voter (< RF=2). + let r = RoutingTable::uniform(2, &[1, 2], 2); + let err = check_can_decommission(1, &t, &r, 2).unwrap_err(); + assert!(err.to_string().contains("replication factor")); + } + + #[test] + fn accepts_when_extra_voter_available() { + let t = topo(&[1, 2, 3]); + // 3 nodes × RF=2 means each group has 2 voters but the third + // node is a candidate replacement. The safety check doesn't + // know about replacements — it only checks current state, + // so we need RF=1 for this to pass without a prior rebalance. + let r = RoutingTable::uniform(2, &[1, 2, 3], 3); + check_can_decommission(1, &t, &r, 2).unwrap(); + } + + #[test] + fn skips_groups_target_is_not_member_of() { + let t = topo(&[1, 2, 3]); + // Node 1 is only in group 0, node 2 is only in group 1. + let mut r = RoutingTable::uniform(2, &[1, 2, 3], 3); + r.set_group_members(0, vec![1, 3]); + r.set_group_members(1, vec![2, 3]); + // Decommission 1 with RF=1 → group 0 drops to [3], group 1 + // untouched. + check_can_decommission(1, &t, &r, 1).unwrap(); + } +} diff --git a/nodedb-cluster/src/follower_read.rs b/nodedb-cluster/src/follower_read.rs new file mode 100644 index 00000000..16d0886e --- /dev/null +++ b/nodedb-cluster/src/follower_read.rs @@ -0,0 +1,130 @@ +//! Follower-read decision gate. +//! +//! [`FollowerReadGate`] answers a single question: "given the +//! session's `ReadConsistency` and the local node's role + closed +//! timestamp for the target Raft group, can this read be served +//! locally without forwarding to the leader?" +//! +//! ## Decision table +//! +//! | Consistency | Local role | Closed TS fresh? | Serve locally? | +//! |-----------------------|-------------|------------------|----------------| +//! | Strong | * | * | Only if leader | +//! | BoundedStaleness(d) | Follower | ≤ d | Yes | +//! 
| BoundedStaleness(d) | Follower | > d | No → forward | +//! | BoundedStaleness(d) | Leader | * | Yes | +//! | Eventual | * | * | Yes | +//! +//! The gate is stateless — it reads from shared handles to the +//! closed-timestamp tracker and the raft-status provider. + +use std::sync::Arc; +use std::time::Duration; + +use crate::closed_timestamp::ClosedTimestampTracker; + +/// Consistency level for a single read — mirrors the `ReadConsistency` +/// enum in the `nodedb` crate without coupling `nodedb-cluster` to it. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ReadLevel { + Strong, + BoundedStaleness(Duration), + Eventual, +} + +/// Answers "can this read be served locally?" +pub struct FollowerReadGate { + closed_ts: Arc, + /// Type-erased function that returns true if this node is the + /// leader for the given group. Injection seam — production wraps + /// `MultiRaft::group_statuses`, tests supply a closure. + is_leader_fn: Box bool + Send + Sync>, +} + +impl FollowerReadGate { + pub fn new( + closed_ts: Arc, + is_leader_fn: Box bool + Send + Sync>, + ) -> Self { + Self { + closed_ts, + is_leader_fn, + } + } + + /// Returns `true` if the read can be served from this node's + /// local replica without forwarding to the leader. 
+ pub fn can_serve_locally(&self, group_id: u64, level: ReadLevel) -> bool { + match level { + ReadLevel::Strong => (self.is_leader_fn)(group_id), + ReadLevel::Eventual => true, + ReadLevel::BoundedStaleness(max) => { + if (self.is_leader_fn)(group_id) { + return true; + } + self.closed_ts.is_fresh_enough(group_id, max) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn gate(leader_groups: &'static [u64]) -> FollowerReadGate { + FollowerReadGate::new( + Arc::new(ClosedTimestampTracker::new()), + Box::new(move |gid| leader_groups.contains(&gid)), + ) + } + + fn gate_with_tracker( + leader_groups: &'static [u64], + tracker: Arc, + ) -> FollowerReadGate { + FollowerReadGate::new(tracker, Box::new(move |gid| leader_groups.contains(&gid))) + } + + #[test] + fn strong_requires_leader() { + let g = gate(&[1]); + assert!(g.can_serve_locally(1, ReadLevel::Strong)); + assert!(!g.can_serve_locally(2, ReadLevel::Strong)); + } + + #[test] + fn eventual_always_local() { + let g = gate(&[]); + assert!(g.can_serve_locally(99, ReadLevel::Eventual)); + } + + #[test] + fn bounded_staleness_leader_always_local() { + let g = gate(&[1]); + assert!(g.can_serve_locally(1, ReadLevel::BoundedStaleness(Duration::from_secs(5)))); + } + + #[test] + fn bounded_staleness_follower_fresh_enough() { + let tracker = Arc::new(ClosedTimestampTracker::new()); + tracker.mark_applied(2); + let g = gate_with_tracker(&[], tracker); + assert!(g.can_serve_locally(2, ReadLevel::BoundedStaleness(Duration::from_secs(5)))); + } + + #[test] + fn bounded_staleness_follower_too_stale() { + let tracker = Arc::new(ClosedTimestampTracker::new()); + let old = std::time::Instant::now() - Duration::from_secs(30); + tracker.mark_applied_at(2, old); + let g = gate_with_tracker(&[], tracker); + assert!(!g.can_serve_locally(2, ReadLevel::BoundedStaleness(Duration::from_secs(5)))); + } + + #[test] + fn bounded_staleness_unknown_group_not_local() { + let g = gate(&[]); + assert!(!g.can_serve_locally(99, 
ReadLevel::BoundedStaleness(Duration::from_secs(5)))); + } +} diff --git a/nodedb-cluster/src/health.rs b/nodedb-cluster/src/health.rs index e27e8137..654b4f56 100644 --- a/nodedb-cluster/src/health.rs +++ b/nodedb-cluster/src/health.rs @@ -151,14 +151,38 @@ impl HealthMonitor { } } - /// Handle a successful pong — reset failure count, mark node Active if needed. - fn handle_pong(&self, peer_id: u64, _pong: &PongResponse) -> bool { + /// Handle a successful pong — reset failure count, mark node Active + /// if needed, and push topology if the peer is behind. + fn handle_pong(&self, peer_id: u64, pong: &PongResponse) -> bool { // Reset failure count. { let mut failures = self.ping_failures.lock().unwrap_or_else(|p| p.into_inner()); failures.remove(&peer_id); } + // Push topology to peers with a stale version. This closes + // the convergence gap when the fire-and-forget broadcast + // during the join flow is lost (e.g. peer QUIC server not + // yet accepting at that instant). + let our_version = { + let topo = self.topology.read().unwrap_or_else(|p| p.into_inner()); + topo.version() + }; + if pong.topology_version < our_version { + debug!( + peer_id, + peer_version = pong.topology_version, + our_version, + "peer has stale topology, pushing update" + ); + let transport = self.transport.clone(); + let topology = self.topology.clone(); + let self_id = self.node_id; + tokio::spawn(async move { + broadcast_topology_to_peer(self_id, peer_id, &topology, &transport).await; + }); + } + // If node was not Active, mark it Active. let mut topo = self.topology.write().unwrap_or_else(|p| p.into_inner()); if let Some(node) = topo.get_node(peer_id) @@ -264,6 +288,34 @@ pub fn broadcast_topology( } } +/// Send a topology update to a single peer that has a stale version. 
+async fn broadcast_topology_to_peer( + _self_node_id: u64, + peer_id: u64, + topology: &RwLock, + transport: &NexarTransport, +) { + let update = { + let topo = topology.read().unwrap_or_else(|p| p.into_inner()); + RaftRpc::TopologyUpdate(TopologyUpdate { + version: topo.version(), + nodes: topo + .all_nodes() + .map(|n| JoinNodeInfo { + node_id: n.node_id, + addr: n.addr.clone(), + state: n.state.as_u8(), + raft_groups: n.raft_groups.clone(), + wire_version: n.wire_version, + }) + .collect(), + }) + }; + if let Err(e) = transport.send_rpc(peer_id, update).await { + debug!(peer_id, error = %e, "targeted topology push failed"); + } +} + /// Handle an incoming Ping RPC — return a Pong with our topology version. pub fn handle_ping(node_id: u64, topology_version: u64, _req: &PingRequest) -> RaftRpc { RaftRpc::Pong(PongResponse { diff --git a/nodedb-cluster/src/lib.rs b/nodedb-cluster/src/lib.rs index ea340b52..8909a79d 100644 --- a/nodedb-cluster/src/lib.rs +++ b/nodedb-cluster/src/lib.rs @@ -1,9 +1,11 @@ pub mod bootstrap; pub mod catalog; pub mod circuit_breaker; +pub mod closed_timestamp; pub mod cluster_info; pub mod conf_change; pub mod cross_shard_txn; +pub mod decommission; pub mod distributed_document; pub mod distributed_graph; pub mod distributed_join; @@ -11,6 +13,7 @@ pub mod distributed_spatial; pub mod distributed_timeseries; pub mod distributed_vector; pub mod error; +pub mod follower_read; pub mod forward; pub mod ghost; pub mod ghost_sweeper; @@ -25,10 +28,13 @@ pub mod quic_transport; pub mod raft_loop; pub mod raft_storage; pub mod rdma_transport; +pub mod reachability; pub mod readiness; pub mod rebalance; pub mod rebalance_scheduler; +pub mod rebalancer; pub mod routing; +pub mod routing_liveness; pub mod rpc_codec; pub mod shard_split; pub mod swim; @@ -39,11 +45,17 @@ pub mod wire; pub use bootstrap::{ClusterConfig, ClusterState, JoinRetryPolicy, start_cluster}; pub use catalog::ClusterCatalog; +pub use closed_timestamp::ClosedTimestampTracker; 
pub use cluster_info::{ ClusterInfoSnapshot, ClusterObserver, GroupSnapshot, GroupStatusProvider, PeerSnapshot, }; pub use conf_change::{ConfChange, ConfChangeType}; +pub use decommission::{ + DecommissionCoordinator, DecommissionObserver, DecommissionPlan, DecommissionRunResult, + DecommissionSafetyError, MetadataProposer, check_can_decommission, plan_full_decommission, +}; pub use error::{ClusterError, Result}; +pub use follower_read::{FollowerReadGate, ReadLevel}; pub use forward::{NoopPlanExecutor, PlanExecutor}; pub use ghost::{GhostStub, GhostTable}; pub use health::{HealthConfig, HealthMonitor}; @@ -54,8 +66,17 @@ pub use migration_executor::{ }; pub use multi_raft::{GroupStatus, MultiRaft}; pub use raft_loop::{CommitApplier, RaftLoop, VShardEnvelopeHandler}; +pub use reachability::{ + NoopProber, ReachabilityDriver, ReachabilityDriverConfig, ReachabilityProber, TransportProber, +}; pub use rebalance::{RebalancePlan, compute_plan, plan_to_requests}; +pub use rebalancer::{ + AlwaysReadyGate, ElectionGate, LoadMetrics, LoadMetricsProvider, LoadWeights, + MigrationDispatcher, RebalancerKickHook, RebalancerLoop, RebalancerLoopConfig, + RebalancerPlanConfig, compute_load_based_plan, normalized_score, +}; pub use routing::RoutingTable; +pub use routing_liveness::{NodeIdResolver, RoutingLivenessHook}; pub use rpc_codec::RaftRpc; pub use topology::{ClusterTopology, NodeInfo, NodeState}; pub use transport::{NexarTransport, RaftRpcHandler}; @@ -78,7 +99,8 @@ pub use lifecycle::{ pub use rdma_transport::{RdmaConfig, RdmaTransport}; pub use rebalance_scheduler::{NodeMetrics, RebalanceScheduler, RebalanceTrigger, SchedulerConfig}; pub use shard_split::{SplitPlan, SplitStrategy, plan_graph_split, plan_vector_split}; +pub use swim::bootstrap::spawn_with_subscribers as spawn_swim_with_subscribers; pub use swim::{ - Incarnation, Member, MemberState, MembershipList, SwimConfig, SwimError, SwimHandle, - UdpTransport, spawn as spawn_swim, + Incarnation, Member, MemberState, 
MembershipList, MembershipSubscriber, SwimConfig, SwimError, + SwimHandle, UdpTransport, spawn as spawn_swim, }; diff --git a/nodedb-cluster/src/lifecycle.rs b/nodedb-cluster/src/lifecycle.rs index 28dd6bd2..43966b8c 100644 --- a/nodedb-cluster/src/lifecycle.rs +++ b/nodedb-cluster/src/lifecycle.rs @@ -15,7 +15,7 @@ use tracing::{info, warn}; use crate::error::{ClusterError, Result}; -use crate::metadata_group::{MetadataEntry, RoutingChange, TopologyChange}; +use crate::metadata_group::{MetadataEntry, TopologyChange}; use crate::routing::RoutingTable; use crate::topology::{ClusterTopology, NodeInfo, NodeState}; @@ -27,55 +27,34 @@ pub struct DecommissionResult { pub completed: bool, } -/// Plan a node decommission: compute which vShards to migrate and where. -/// -/// Produces a sequence of [`MetadataEntry`] values to be proposed against -/// the metadata Raft group in order. Steps: -/// 1. Start decommission (topology transition). -/// 2. Transfer leadership of all Raft groups led by this node. +/// Plan a node decommission — thin wrapper over +/// [`crate::decommission::plan_full_decommission`] that returns the +/// full ordered sequence of metadata entries. Kept as a public +/// convenience for older call sites; new code should use the +/// `decommission` module directly. pub fn plan_decommission( node_id: u64, topology: &ClusterTopology, routing: &RoutingTable, ) -> Result> { - let node = topology.get_node(node_id).ok_or(ClusterError::Transport { - detail: format!("node {node_id} not found in topology"), - })?; - - if node.state == NodeState::Decommissioned { - return Err(ClusterError::Transport { - detail: format!("node {node_id} is already decommissioned"), - }); - } - - let mut entries = Vec::new(); - - // Step 1: Start decommission. - entries.push(MetadataEntry::TopologyChange( - TopologyChange::StartDecommission { node_id }, - )); - - // Step 2: Leadership transfers for groups led by this node. 
- for group_id in routing.group_ids() { - if let Some(info) = routing.group_info(group_id) - && info.leader == node_id - && let Some(&new_leader) = info.members.iter().find(|&&m| m != node_id) - { - entries.push(MetadataEntry::RoutingChange( - RoutingChange::LeadershipTransfer { - group_id, - new_leader_node_id: new_leader, - }, - )); - } - } - + // Historical callers assumed the full-cluster RF; derive a safe + // lower bound from the smallest existing group so the check is + // never stricter than the cluster is already running under. + let rf = routing + .group_members() + .values() + .map(|info| info.members.len()) + .min() + .unwrap_or(1) + .saturating_sub(1) + .max(1); + let plan = crate::decommission::plan_full_decommission(node_id, topology, routing, rf)?; info!( node_id, - metadata_entries = entries.len(), + metadata_entries = plan.entries.len(), "decommission plan computed" ); - Ok(entries) + Ok(plan.entries) } /// Check if a node can be safely removed from the cluster. diff --git a/nodedb-cluster/src/metadata_group/applier.rs b/nodedb-cluster/src/metadata_group/applier.rs index fd169f04..46d59549 100644 --- a/nodedb-cluster/src/metadata_group/applier.rs +++ b/nodedb-cluster/src/metadata_group/applier.rs @@ -1,12 +1,16 @@ //! [`MetadataApplier`] trait: the contract raft_loop uses to dispatch //! committed entries on the metadata group (group 0). +use std::net::SocketAddr; use std::sync::{Arc, RwLock}; use tracing::warn; use crate::metadata_group::cache::MetadataCache; use crate::metadata_group::codec::decode_entry; +use crate::metadata_group::entry::{MetadataEntry, RoutingChange, TopologyChange}; +use crate::routing::RoutingTable; +use crate::topology::{ClusterTopology, NodeInfo, NodeState}; /// Applies committed metadata entries to local state. /// @@ -29,16 +33,123 @@ pub trait MetadataApplier: Send + Sync + 'static { #[derive(Clone)] pub struct CacheApplier { cache: Arc>, + /// Optional live topology handle. 
When set, committed + /// `TopologyChange` entries mutate this handle in place so the + /// rest of the process sees the new state immediately — decommission + /// state transitions, joiner promotion, and `Leave` removal all + /// flow through here. + live_topology: Option>>, + /// Optional live routing table handle. When set, committed + /// `RoutingChange` entries (leadership transfer, member removal, + /// vshard reassignment) mutate this handle in place. + live_routing: Option>>, } impl CacheApplier { pub fn new(cache: Arc>) -> Self { - Self { cache } + Self { + cache, + live_topology: None, + live_routing: None, + } + } + + /// Extend this applier with live topology/routing handles. When + /// set, committed `TopologyChange` and `RoutingChange` entries + /// mutate the handles in place in addition to the in-memory + /// history log kept in `MetadataCache`. Backward-compatible: + /// existing callers that don't attach handles see no behaviour + /// change. + pub fn with_live_state( + mut self, + topology: Arc>, + routing: Arc>, + ) -> Self { + self.live_topology = Some(topology); + self.live_routing = Some(routing); + self } pub fn cache(&self) -> Arc> { self.cache.clone() } + + /// Mutate the live topology handle (if attached) in response to + /// a committed `TopologyChange`. Silent no-op when no handle is + /// set — backward-compatible with older test wiring. 
+ fn apply_topology_change(&self, change: &TopologyChange) { + let Some(live) = &self.live_topology else { + return; + }; + let mut topo = live.write().unwrap_or_else(|p| p.into_inner()); + match change { + TopologyChange::Join { node_id, addr } => { + if topo.contains(*node_id) { + return; + } + let parsed: SocketAddr = addr.parse().unwrap_or_else(|_| { + warn!(node_id, addr, "join: invalid address, using placeholder"); + SocketAddr::from(([0, 0, 0, 0], 0)) + }); + topo.join_as_learner(NodeInfo::new(*node_id, parsed, NodeState::Joining)); + } + TopologyChange::PromoteToVoter { node_id } => { + topo.promote_to_voter(*node_id); + } + TopologyChange::StartDecommission { node_id } => { + topo.set_state(*node_id, NodeState::Draining); + } + TopologyChange::FinishDecommission { node_id } => { + topo.set_state(*node_id, NodeState::Decommissioned); + } + TopologyChange::Leave { node_id } => { + topo.remove_node(*node_id); + } + } + } + + /// Cascade live-state mutations for a committed entry. Handles + /// `Batch` by recursing into each sub-entry. + fn cascade_live_state(&self, entry: &MetadataEntry) { + match entry { + MetadataEntry::TopologyChange(change) => self.apply_topology_change(change), + MetadataEntry::RoutingChange(change) => self.apply_routing_change(change), + MetadataEntry::Batch { entries } => { + for sub in entries { + self.cascade_live_state(sub); + } + } + _ => {} + } + } + + /// Mutate the live routing handle (if attached) in response to + /// a committed `RoutingChange`. 
+ fn apply_routing_change(&self, change: &RoutingChange) { + let Some(live) = &self.live_routing else { + return; + }; + let mut rt = live.write().unwrap_or_else(|p| p.into_inner()); + match change { + RoutingChange::ReassignVShard { + vshard_id, + new_group_id, + new_leaseholder_node_id, + } => { + rt.reassign_vshard(*vshard_id, *new_group_id); + rt.set_leader(*new_group_id, *new_leaseholder_node_id); + } + RoutingChange::LeadershipTransfer { + group_id, + new_leader_node_id, + } => { + rt.set_leader(*group_id, *new_leader_node_id); + } + RoutingChange::RemoveMember { group_id, node_id } => { + rt.remove_group_member(*group_id, *node_id); + } + } + } } impl MetadataApplier for CacheApplier { @@ -54,7 +165,10 @@ impl MetadataApplier for CacheApplier { continue; } match decode_entry(data) { - Ok(entry) => guard.apply(*index, &entry), + Ok(entry) => { + guard.apply(*index, &entry); + self.cascade_live_state(&entry); + } Err(e) => warn!(index = *index, error = %e, "metadata decode failed"), } } @@ -120,6 +234,72 @@ mod tests { assert_eq!(guard.catalog_entries_applied, 1); } + #[test] + fn cache_applier_mutates_live_topology_on_start_decommission() { + use crate::topology::{ClusterTopology, NodeInfo, NodeState}; + use std::net::SocketAddr; + + let cache = Arc::new(RwLock::new(MetadataCache::new())); + let mut t = ClusterTopology::new(); + let addr: SocketAddr = "127.0.0.1:9000".parse().unwrap(); + t.add_node(NodeInfo::new(7, addr, NodeState::Active)); + let topology = Arc::new(RwLock::new(t)); + let routing = Arc::new(RwLock::new(crate::routing::RoutingTable::uniform( + 1, + &[7], + 1, + ))); + let applier = + CacheApplier::new(cache.clone()).with_live_state(topology.clone(), routing.clone()); + + let bytes = encode_entry(&MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id: 7 }, + )) + .unwrap(); + applier.apply(&[(1, bytes)]); + + let topo = topology.read().unwrap(); + assert_eq!(topo.get_node(7).unwrap().state, NodeState::Draining); + } + + 
#[test] + fn cache_applier_mutates_live_routing_on_remove_member() { + use crate::metadata_group::entry::RoutingChange; + + let cache = Arc::new(RwLock::new(MetadataCache::new())); + let topology = Arc::new(RwLock::new(crate::topology::ClusterTopology::new())); + let routing = Arc::new(RwLock::new(crate::routing::RoutingTable::uniform( + 1, + &[1, 2, 3], + 3, + ))); + let applier = + CacheApplier::new(cache.clone()).with_live_state(topology.clone(), routing.clone()); + + let bytes = encode_entry(&MetadataEntry::RoutingChange(RoutingChange::RemoveMember { + group_id: 0, + node_id: 2, + })) + .unwrap(); + applier.apply(&[(1, bytes)]); + + let rt = routing.read().unwrap(); + assert!(!rt.group_info(0).unwrap().members.contains(&2)); + } + + #[test] + fn cache_applier_without_live_state_stays_log_only() { + let cache = Arc::new(RwLock::new(MetadataCache::new())); + let applier = CacheApplier::new(cache.clone()); + let bytes = encode_entry(&MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id: 5 }, + )) + .unwrap(); + // Must not panic and must still advance the applied index. + let last = applier.apply(&[(1, bytes)]); + assert_eq!(last, 1); + } + #[test] fn noop_applier_advances_watermark() { let noop = NoopMetadataApplier; diff --git a/nodedb-cluster/src/metadata_group/cache.rs b/nodedb-cluster/src/metadata_group/cache.rs index 24f7a4ba..23ae959c 100644 --- a/nodedb-cluster/src/metadata_group/cache.rs +++ b/nodedb-cluster/src/metadata_group/cache.rs @@ -106,6 +106,11 @@ impl MetadataCache { } } MetadataEntry::DescriptorDrainEnd { .. 
} => {} + MetadataEntry::Batch { entries } => { + for sub in entries { + self.apply(index, sub); + } + } } } } diff --git a/nodedb-cluster/src/metadata_group/entry.rs b/nodedb-cluster/src/metadata_group/entry.rs index 301a9bf8..8c8c18a2 100644 --- a/nodedb-cluster/src/metadata_group/entry.rs +++ b/nodedb-cluster/src/metadata_group/entry.rs @@ -39,6 +39,14 @@ pub enum MetadataEntry { payload: Vec, }, + /// Atomic batch of metadata entries proposed by a transactional + /// DDL session (`BEGIN; CREATE ...; CREATE ...; COMMIT;`). The + /// applier unpacks and applies each sub-entry in order at a + /// single raft log index, so either all commit or none do. + Batch { + entries: Vec, + }, + // ── Topology / routing ───────────────────────────────────────────── TopologyChange(TopologyChange), RoutingChange(RoutingChange), @@ -123,4 +131,11 @@ pub enum RoutingChange { group_id: u64, new_leader_node_id: u64, }, + /// Remove a node from a Raft group's member and learner sets. + /// + /// Used by the decommission flow to strip a draining node out of + /// every group it belongs to. Proposing this is only safe once + /// `safety::check_can_decommission` has confirmed the group will + /// still satisfy the configured replication factor. 
+ RemoveMember { group_id: u64, node_id: u64 }, } diff --git a/nodedb-cluster/src/migration_executor.rs b/nodedb-cluster/src/migration_executor.rs index 9caeee80..18ea6b18 100644 --- a/nodedb-cluster/src/migration_executor.rs +++ b/nodedb-cluster/src/migration_executor.rs @@ -16,8 +16,10 @@ use std::time::Duration; use tracing::{debug, info}; use crate::conf_change::{ConfChange, ConfChangeType}; +use crate::decommission::MetadataProposer; use crate::error::{ClusterError, Result}; use crate::ghost::{GhostStub, GhostTable}; +use crate::metadata_group::{MetadataEntry, RoutingChange}; use crate::migration::{MigrationPhase, MigrationState}; use crate::multi_raft::MultiRaft; use crate::routing::RoutingTable; @@ -65,6 +67,13 @@ pub struct MigrationExecutor { topology: Arc>, transport: Arc, ghost_table: Arc>, + /// Optional metadata proposer for replicated routing updates. + /// When set, Phase 3 cut-over proposes a `RoutingChange` through + /// the metadata Raft group so every node applies the routing + /// update atomically on commit. When `None`, falls back to + /// local-only routing mutation (used by tests that don't stand + /// up a metadata group). + metadata_proposer: Option>, } impl MigrationExecutor { @@ -80,9 +89,17 @@ impl MigrationExecutor { topology, transport, ghost_table: Arc::new(Mutex::new(GhostTable::new())), + metadata_proposer: None, } } + /// Attach a metadata proposer for replicated Phase 3 cut-over. + /// Production wiring calls this; tests may omit it for simplicity. + pub fn with_metadata_proposer(mut self, proposer: Arc) -> Self { + self.metadata_proposer = Some(proposer); + self + } + /// Access the ghost table (for scatter-gather resolution). pub fn ghost_table(&self) -> &Arc> { &self.ghost_table @@ -180,9 +197,11 @@ impl MigrationExecutor { "phase 1: adding target to raft group" ); - // Add target node as a voter to the Raft group via ConfChange. 
+ // Add target node as a LEARNER so it can catch up via Raft + // replication without participating in elections or voting. + // Promotion to voter happens after Phase 2 confirms catch-up. let change = ConfChange { - change_type: ConfChangeType::AddNode, + change_type: ConfChangeType::AddLearner, node_id: req.target_node, }; @@ -202,12 +221,13 @@ impl MigrationExecutor { // The ConfChange will be replicated and applied. The target node // receives the full log through Raft's normal replication. - // Mark base copy as complete immediately — Raft handles the transfer. + // Mark base copy as complete — Raft replication is now in + // progress; the real progress signal is match_index in Phase 2. state.update_base_copy(committed); debug!( vshard = req.vshard_id, - "phase 1 complete: target added to raft group" + "phase 1 complete: target added as learner to raft group" ); Ok(()) @@ -313,9 +333,21 @@ impl MigrationExecutor { state.update_wal_catchup(leader_commit, target_match); if state.is_catchup_ready() { + // Learner has caught up — promote to voter so the + // group has enough replicas for a safe cut-over. + let promote = ConfChange { + change_type: ConfChangeType::PromoteLearner, + node_id: req.target_node, + }; + { + let mut mr = self.multi_raft.lock().unwrap_or_else(|p| p.into_inner()); + mr.propose_conf_change(group_id, &promote)?; + } debug!( vshard = req.vshard_id, - leader_commit, target_match, "phase 2 complete: target caught up" + leader_commit, + target_match, + "phase 2 complete: target caught up and promoted to voter" ); return Ok(()); } @@ -331,15 +363,20 @@ impl MigrationExecutor { } } - /// Phase 3: Atomic routing table update via Raft. + /// Phase 3: Atomic routing table update. + /// + /// When a [`MetadataProposer`] is attached, the cut-over proposes + /// a `LeadershipTransfer` through the metadata Raft group so + /// every node applies the routing update atomically on commit. 
+ /// Without a proposer (tests), falls back to a local-only + /// mutation. async fn phase3_cutover( &self, state: &mut MigrationState, group_id: u64, req: &MigrationRequest, ) -> Result<()> { - // Estimate pause (time to propose + commit the routing update). - let estimated_pause_us = 10_000; // ~10ms estimate for Raft round-trip. + let estimated_pause_us = 10_000; state.start_cutover(estimated_pause_us).map_err(|e| { state.fail(format!("cutover rejected: {e}")); @@ -353,28 +390,23 @@ impl MigrationExecutor { estimated_pause_us, "phase 3: atomic cut-over" ); - // Propose the routing update as a Raft entry so all nodes apply it - // atomically when committed. The entry is serialized as a ConfChange - // with a special routing marker that the applier interprets. - let routing_change = ConfChange { - change_type: ConfChangeType::AddNode, - node_id: req.target_node, - }; - { - let mut mr = self.multi_raft.lock().unwrap_or_else(|p| p.into_inner()); - mr.propose_conf_change(group_id, &routing_change)?; - } - - // Update the local routing table. Other nodes update theirs when they - // apply the committed entry through their own applier. - { + // Propose the routing change. With a metadata proposer the + // `CacheApplier::with_live_state` on every node handles the + // actual routing mutation when the entry commits; without a + // proposer we mutate locally for backward-compat. + if let Some(proposer) = &self.metadata_proposer { + let entry = MetadataEntry::RoutingChange(RoutingChange::LeadershipTransfer { + group_id, + new_leader_node_id: req.target_node, + }); + proposer.propose_and_wait(entry).await?; + } else { let mut routing = self.routing.write().unwrap_or_else(|p| p.into_inner()); - routing.reassign_vshard(req.vshard_id, group_id); + routing.set_leader(group_id, req.target_node); } - // Install ghost stub on source so scatter-gather queries that arrive - // before the client refreshes its routing table are transparently - // forwarded to the new owner. 
+ // Ghost stub so in-flight scatter-gather queries that still + // target the old leader are transparently forwarded. { let mut ghosts = self.ghost_table.lock().unwrap_or_else(|p| p.into_inner()); ghosts.insert(GhostStub { @@ -387,18 +419,13 @@ impl MigrationExecutor { .as_millis() as u64, }); } - debug!( - vshard = req.vshard_id, - target = req.target_node, - "ghost stub registered for transparent forwarding" - ); let actual_pause_us = cutover_start.elapsed().as_micros() as u64; state.complete(actual_pause_us); debug!( vshard = req.vshard_id, - actual_pause_us, "phase 3 complete: routing updated via raft" + actual_pause_us, "phase 3 complete: routing updated" ); Ok(()) @@ -521,14 +548,14 @@ mod tests { write_pause_budget_us: 500_000, }; - // Phase 1 should succeed (adds node 2 to group 0). + // Phase 1 should succeed (adds node 2 as learner to group 0). executor .phase1_base_copy(&mut state, 0, &req) .await .unwrap(); - // Verify: the ConfChange was proposed (it's in the Raft log). - // The actual application happens when committed, which requires tick(). + // Verify: the ConfChange (AddLearner) was proposed in the Raft log. + // Application happens on next tick/commit cycle. 
} #[test] diff --git a/nodedb-cluster/src/multi_raft/core.rs b/nodedb-cluster/src/multi_raft/core.rs index 9aa60bc4..72029096 100644 --- a/nodedb-cluster/src/multi_raft/core.rs +++ b/nodedb-cluster/src/multi_raft/core.rs @@ -77,8 +77,8 @@ impl MultiRaft { node_id, groups: HashMap::new(), routing, - election_timeout_min: Duration::from_millis(150), - election_timeout_max: Duration::from_millis(300), + election_timeout_min: Duration::from_secs(2), + election_timeout_max: Duration::from_secs(5), heartbeat_interval: Duration::from_millis(50), data_dir, } diff --git a/nodedb-cluster/src/raft_loop/loop_core.rs b/nodedb-cluster/src/raft_loop/loop_core.rs index e73787dc..ed1ccd98 100644 --- a/nodedb-cluster/src/raft_loop/loop_core.rs +++ b/nodedb-cluster/src/raft_loop/loop_core.rs @@ -580,35 +580,38 @@ mod tests { let sr1h = shutdown_tx.subscribe(); tokio::spawn(async move { t1.serve(rl1_h, sr1h).await }); - tokio::time::sleep(Duration::from_millis(200)).await; - - assert!( - rl1.applier.count() >= 1, - "node 1 should have committed at least the no-op, got {}", - rl1.applier.count() - ); + // Poll until node 1 commits at least the no-op (election done). 
+ let deadline = tokio::time::Instant::now() + Duration::from_secs(5); + loop { + if rl1.applier.count() >= 1 { + break; + } + assert!( + tokio::time::Instant::now() < deadline, + "node 1 should have committed at least the no-op, got {}", + rl1.applier.count() + ); + tokio::time::sleep(Duration::from_millis(20)).await; + } let (_gid, idx) = rl1.propose(0, b"distributed-cmd".to_vec()).unwrap(); assert!(idx >= 2); - tokio::time::sleep(Duration::from_millis(200)).await; - - assert!( - rl1.applier.count() >= 2, - "node 1: expected >= 2 applied, got {}", - rl1.applier.count() - ); - - assert!( - rl2.applier.count() >= 1, - "node 2: expected >= 1 applied, got {}", - rl2.applier.count() - ); - assert!( - rl3.applier.count() >= 1, - "node 3: expected >= 1 applied, got {}", - rl3.applier.count() - ); + // Poll until all nodes replicate the proposed command. + let deadline = tokio::time::Instant::now() + Duration::from_secs(5); + loop { + if rl1.applier.count() >= 2 && rl2.applier.count() >= 1 && rl3.applier.count() >= 1 { + break; + } + assert!( + tokio::time::Instant::now() < deadline, + "replication timed out: n1={}, n2={}, n3={}", + rl1.applier.count(), + rl2.applier.count(), + rl3.applier.count() + ); + tokio::time::sleep(Duration::from_millis(20)).await; + } shutdown_tx.send(true).unwrap(); } diff --git a/nodedb-cluster/src/reachability/driver.rs b/nodedb-cluster/src/reachability/driver.rs new file mode 100644 index 00000000..b677ba0c --- /dev/null +++ b/nodedb-cluster/src/reachability/driver.rs @@ -0,0 +1,220 @@ +//! [`ReachabilityDriver`] — periodic open-breaker probe loop. +//! +//! Every `interval`, the driver asks the shared [`CircuitBreaker`] +//! for its currently-Open peer set and fires a probe at each via the +//! injected [`ReachabilityProber`]. Probes run in parallel via +//! `tokio::spawn` so a slow peer never blocks the next one. Probe +//! results are intentionally ignored: the production `TransportProber` +//! 
routes through `NexarTransport::send_rpc`, which already walks the +//! circuit breaker's `check → record_success|record_failure` path, so +//! the driver does not need to bookkeep anything itself. +//! +//! Shutdown is cooperative via `tokio::sync::watch`. On `true` the +//! run loop breaks at the next tick or immediately if it is waiting. + +use std::sync::Arc; +use std::time::Duration; + +use tokio::sync::watch; +use tokio::time::{MissedTickBehavior, interval}; +use tracing::{debug, trace}; + +use crate::circuit_breaker::CircuitBreaker; + +use super::prober::ReachabilityProber; + +/// Configuration for the reachability driver. +#[derive(Debug, Clone)] +pub struct ReachabilityDriverConfig { + /// Period between open-peer sweeps. Defaults to 30 s in + /// production; tests override to milliseconds. + pub interval: Duration, +} + +impl Default for ReachabilityDriverConfig { + fn default() -> Self { + Self { + interval: Duration::from_secs(30), + } + } +} + +/// Drives periodic reachability probes against every Open-state peer. +pub struct ReachabilityDriver { + breaker: Arc, + prober: Arc, + cfg: ReachabilityDriverConfig, +} + +impl ReachabilityDriver { + pub fn new( + breaker: Arc, + prober: Arc, + cfg: ReachabilityDriverConfig, + ) -> Self { + Self { + breaker, + prober, + cfg, + } + } + + /// Run the driver until `shutdown` flips to `true`. + pub async fn run(self: Arc, mut shutdown: watch::Receiver) { + let mut tick = interval(self.cfg.interval); + // Skip the immediate first tick so the first probe fires one + // full interval after start. Otherwise every process restart + // would stampede every open breaker at once. + tick.set_missed_tick_behavior(MissedTickBehavior::Delay); + tick.tick().await; + loop { + tokio::select! 
{ + biased; + changed = shutdown.changed() => { + if changed.is_ok() && *shutdown.borrow() { + break; + } + } + _ = tick.tick() => { + self.sweep_once().await; + } + } + } + debug!("reachability driver shutting down"); + } + + /// Single sweep — exposed for tests that drive the loop manually. + pub async fn sweep_once(&self) { + let open = self.breaker.open_peers(); + if open.is_empty() { + return; + } + trace!(count = open.len(), "reachability sweep: probing open peers"); + for peer in open { + let prober = Arc::clone(&self.prober); + tokio::spawn(async move { + let _ = prober.probe(peer).await; + }); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::circuit_breaker::CircuitBreakerConfig; + use async_trait::async_trait; + use std::sync::Mutex; + + struct RecordingProber { + calls: Mutex>, + } + + impl RecordingProber { + fn new() -> Arc { + Arc::new(Self { + calls: Mutex::new(Vec::new()), + }) + } + fn take(&self) -> Vec { + let mut g = self.calls.lock().unwrap(); + let out = g.clone(); + g.clear(); + out + } + } + + #[async_trait] + impl ReachabilityProber for RecordingProber { + async fn probe(&self, peer: u64) -> Result<(), crate::error::ClusterError> { + self.calls.lock().unwrap().push(peer); + Ok(()) + } + } + + fn open_breaker() -> Arc { + Arc::new(CircuitBreaker::new(CircuitBreakerConfig { + failure_threshold: 1, + cooldown: Duration::from_secs(60), + })) + } + + #[tokio::test] + async fn sweep_probes_every_open_peer() { + let breaker = open_breaker(); + breaker.record_failure(1); + breaker.record_failure(2); + breaker.record_failure(3); + + let prober = RecordingProber::new(); + let driver = Arc::new(ReachabilityDriver::new( + Arc::clone(&breaker), + prober.clone() as Arc, + ReachabilityDriverConfig { + interval: Duration::from_millis(50), + }, + )); + driver.sweep_once().await; + // Let spawned probe tasks run. 
+ for _ in 0..8 { + tokio::task::yield_now().await; + } + let mut calls = prober.take(); + calls.sort_unstable(); + assert_eq!(calls, vec![1, 2, 3]); + } + + #[tokio::test] + async fn sweep_skips_closed_peers() { + let breaker = open_breaker(); + breaker.record_success(1); // Registers 1 as Closed. + breaker.record_failure(2); // Opens 2. + let prober = RecordingProber::new(); + let driver = Arc::new(ReachabilityDriver::new( + Arc::clone(&breaker), + prober.clone() as Arc, + ReachabilityDriverConfig::default(), + )); + driver.sweep_once().await; + for _ in 0..8 { + tokio::task::yield_now().await; + } + assert_eq!(prober.take(), vec![2]); + } + + #[tokio::test(start_paused = true)] + async fn run_loop_fires_sweeps_on_interval_and_shuts_down() { + let breaker = open_breaker(); + breaker.record_failure(7); + let prober = RecordingProber::new(); + let driver = Arc::new(ReachabilityDriver::new( + Arc::clone(&breaker), + prober.clone() as Arc, + ReachabilityDriverConfig { + interval: Duration::from_millis(100), + }, + )); + let (tx, rx) = watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&driver); + async move { d.run(rx).await } + }); + + // First tick is skipped, second delivers a sweep. + tokio::time::advance(Duration::from_millis(120)).await; + tokio::task::yield_now().await; + tokio::time::advance(Duration::from_millis(120)).await; + tokio::task::yield_now().await; + for _ in 0..16 { + tokio::task::yield_now().await; + } + + assert!( + !prober.take().is_empty(), + "driver never probed in run-loop mode" + ); + + let _ = tx.send(true); + let _ = tokio::time::timeout(Duration::from_millis(500), handle).await; + } +} diff --git a/nodedb-cluster/src/reachability/mod.rs b/nodedb-cluster/src/reachability/mod.rs new file mode 100644 index 00000000..6423b087 --- /dev/null +++ b/nodedb-cluster/src/reachability/mod.rs @@ -0,0 +1,22 @@ +//! Reachability driver — the active half of circuit-breaker recovery. +//! +//! 
`CircuitBreaker` transitions `Open → HalfOpen` only on the next +//! `check()` call. Without periodic traffic to an offline peer, that +//! check never happens and the breaker stays `Open` forever even after +//! the peer has recovered. This module closes that blind spot: +//! +//! - [`ReachabilityDriver`] periodically walks the breaker's open set +//! and sends a lightweight probe RPC to each peer via the existing +//! `send_rpc` path, which drives the normal HalfOpen → Closed / +//! HalfOpen → Open transitions. +//! - [`ReachabilityProber`] is the injection seam: production wraps +//! [`crate::transport::NexarTransport`], tests use a mock. +//! +//! The driver is shutdown-aware (watch channel) and bounded — one +//! probe per open peer per tick, fire-and-forget. + +pub mod driver; +pub mod prober; + +pub use driver::{ReachabilityDriver, ReachabilityDriverConfig}; +pub use prober::{NoopProber, ReachabilityProber, TransportProber}; diff --git a/nodedb-cluster/src/reachability/prober.rs b/nodedb-cluster/src/reachability/prober.rs new file mode 100644 index 00000000..47607e1d --- /dev/null +++ b/nodedb-cluster/src/reachability/prober.rs @@ -0,0 +1,68 @@ +//! [`ReachabilityProber`] — the injection seam for reachability probes. +//! +//! Implementations: +//! +//! - [`TransportProber`] wraps an `Arc` and sends a +//! `RaftRpc::Ping` to the peer. `send_rpc` already handles the +//! circuit-breaker check, the QUIC dial, retries, and +//! `record_success` / `record_failure` — the prober is a one-line +//! adapter. +//! - [`NoopProber`] always succeeds. Useful for tests that only want +//! to verify the loop's tick cadence and shutdown. +//! +//! Tests that want deterministic open→closed transitions construct +//! their own trait impls; see `tests/reachability_loop.rs`. 
+ +use std::sync::Arc; + +use async_trait::async_trait; + +use crate::error::Result; +use crate::rpc_codec::{PingRequest, RaftRpc}; +use crate::transport::NexarTransport; + +/// Abstract probe operation over a single peer. +#[async_trait] +pub trait ReachabilityProber: Send + Sync { + /// Send one probe to `peer`. Returns `Ok(())` iff the probe + /// completed successfully (implying the peer is reachable). + async fn probe(&self, peer: u64) -> Result<()>; +} + +/// Production prober: sends a `Ping` via the live transport. The +/// transport's internal circuit breaker records success/failure +/// automatically — the driver does not need to bookkeep anything. +pub struct TransportProber { + transport: Arc, + self_node_id: u64, +} + +impl TransportProber { + pub fn new(transport: Arc, self_node_id: u64) -> Self { + Self { + transport, + self_node_id, + } + } +} + +#[async_trait] +impl ReachabilityProber for TransportProber { + async fn probe(&self, peer: u64) -> Result<()> { + let rpc = RaftRpc::Ping(PingRequest { + sender_id: self.self_node_id, + topology_version: 0, + }); + self.transport.send_rpc(peer, rpc).await.map(|_| ()) + } +} + +/// Always-succeeds prober for cadence/shutdown tests. +pub struct NoopProber; + +#[async_trait] +impl ReachabilityProber for NoopProber { + async fn probe(&self, _peer: u64) -> Result<()> { + Ok(()) + } +} diff --git a/nodedb-cluster/src/rebalancer/driver.rs b/nodedb-cluster/src/rebalancer/driver.rs new file mode 100644 index 00000000..2150a474 --- /dev/null +++ b/nodedb-cluster/src/rebalancer/driver.rs @@ -0,0 +1,427 @@ +//! Rebalancer driver loop. +//! +//! [`RebalancerLoop`] is the active half of the load-based rebalancer. +//! Every `interval` it walks this sequence: +//! +//! 1. Ask the injected `ElectionGate` whether any raft group is +//! currently mid-election. If so, skip this tick entirely — +//! moves during an election race with the new leader's log and +//! are almost guaranteed to be wasted work. +//! 2. 
Ask the injected [`LoadMetricsProvider`] for a snapshot of +//! every node's current load metrics. +//! 3. Call [`compute_load_based_plan`] against the live routing + +//! topology with the configured plan config. If the plan is +//! empty (cluster within threshold, or no cold candidates), do +//! nothing. +//! 4. Dispatch each planned move through the injected +//! [`MigrationDispatcher`], fire-and-forget. The dispatcher is +//! where the bridge to the production `MigrationExecutor` lives +//! — tests use a mock that records the calls. +//! +//! The loop holds no state of its own; the dispatcher tracks +//! in-flight work and the breaker/scheduler state is on the +//! underlying subsystems. This keeps the driver trivially +//! restartable: crash mid-tick, respawn, resume. + +use std::sync::{Arc, RwLock}; +use std::time::Duration; + +use async_trait::async_trait; +use tokio::sync::{Notify, watch}; +use tokio::time::{MissedTickBehavior, interval}; +use tracing::{debug, info, warn}; + +use crate::error::Result; +use crate::rebalance::PlannedMove; +use crate::routing::RoutingTable; +use crate::topology::ClusterTopology; + +use super::metrics::LoadMetricsProvider; +use super::plan::{RebalancerPlanConfig, compute_load_based_plan}; + +/// Injection seam: tells the driver whether it's safe to dispatch +/// moves. Production wraps a `MultiRaft` status probe; tests return +/// a constant boolean. +#[async_trait] +pub trait ElectionGate: Send + Sync { + /// Return `true` if **any** raft group is currently holding an + /// election (no stable leader). The driver skips its tick when + /// this is `true`. + async fn any_group_electing(&self) -> bool; +} + +/// Permissive gate that never blocks the driver. Useful in tests +/// and in single-node clusters where elections are instantaneous. 
+pub struct AlwaysReadyGate; + +#[async_trait] +impl ElectionGate for AlwaysReadyGate { + async fn any_group_electing(&self) -> bool { + false + } +} + +/// Injection seam: executes a single planned move. Production +/// wraps `MigrationExecutor::execute` and reports success/failure +/// via logging + the tracker; tests record the move. +#[async_trait] +pub trait MigrationDispatcher: Send + Sync { + async fn dispatch(&self, mv: PlannedMove) -> Result<()>; +} + +/// Configuration for [`RebalancerLoop`]. +#[derive(Debug, Clone)] +pub struct RebalancerLoopConfig { + /// Period between rebalance sweeps. Defaults to 30 s. + pub interval: Duration, + /// Plan computation config propagated to + /// [`compute_load_based_plan`] on every tick. + pub plan: RebalancerPlanConfig, + /// CPU utilization threshold (0.0–1.0) above which the + /// rebalancer pauses to avoid amplifying load. If ANY node in + /// the metrics snapshot exceeds this value, the sweep is skipped + /// and a STATUS event is logged. Default 0.80 (80%). + pub backpressure_cpu_threshold: f64, +} + +impl Default for RebalancerLoopConfig { + fn default() -> Self { + Self { + interval: Duration::from_secs(30), + plan: RebalancerPlanConfig::default(), + backpressure_cpu_threshold: 0.80, + } + } +} + +/// The driver itself. +pub struct RebalancerLoop { + cfg: RebalancerLoopConfig, + metrics: Arc, + dispatcher: Arc, + gate: Arc, + routing: Arc>, + topology: Arc>, + /// Membership-change notification. When any caller (a SWIM + /// subscriber, a manual admin trigger, etc.) calls + /// [`notify`](Notify::notify_one) on this handle, the run loop + /// wakes up immediately and runs an extra sweep instead of + /// waiting for the next 30 s tick. 
+ kick: Arc, +} + +impl RebalancerLoop { + pub fn new( + cfg: RebalancerLoopConfig, + metrics: Arc, + dispatcher: Arc, + gate: Arc, + routing: Arc>, + topology: Arc>, + ) -> Self { + Self { + cfg, + metrics, + dispatcher, + gate, + routing, + topology, + kick: Arc::new(Notify::new()), + } + } + + /// Return a handle that callers can use to trigger an immediate + /// sweep. Cloning the `Arc` is cheap; every clone + /// shares the same waker. + pub fn kick_handle(&self) -> Arc { + Arc::clone(&self.kick) + } + + /// Run the driver until `shutdown` flips to `true`. + pub async fn run(self: Arc, mut shutdown: watch::Receiver) { + let mut tick = interval(self.cfg.interval); + tick.set_missed_tick_behavior(MissedTickBehavior::Delay); + // Consume the immediate first tick so the first sweep fires + // a full interval after start. Prevents start-up stampedes + // when many nodes restart together. + tick.tick().await; + loop { + tokio::select! { + biased; + changed = shutdown.changed() => { + if changed.is_ok() && *shutdown.borrow() { + break; + } + } + _ = tick.tick() => { + self.sweep_once().await; + } + _ = self.kick.notified() => { + debug!("rebalancer: membership-change kick received"); + self.sweep_once().await; + } + } + } + debug!("rebalancer loop shutting down"); + } + + /// Run a single sweep. Exposed for tests that drive the loop + /// manually rather than through `run`. 
+ pub async fn sweep_once(&self) { + if self.gate.any_group_electing().await { + debug!("rebalancer: raft election in progress, skipping tick"); + return; + } + let metrics = match self.metrics.snapshot().await { + Ok(m) => m, + Err(e) => { + warn!(error = %e, "rebalancer: failed to collect metrics"); + return; + } + }; + if let Some(hot) = metrics + .iter() + .find(|m| m.cpu_utilization > self.cfg.backpressure_cpu_threshold) + { + info!( + node_id = hot.node_id, + cpu = format!("{:.0}%", hot.cpu_utilization * 100.0), + threshold = format!("{:.0}%", self.cfg.backpressure_cpu_threshold * 100.0), + "rebalancer: back-pressure — cluster under load, skipping sweep" + ); + return; + } + let plan = { + let routing = self.routing.read().unwrap_or_else(|p| p.into_inner()); + let topo = self.topology.read().unwrap_or_else(|p| p.into_inner()); + compute_load_based_plan(&metrics, &routing, &topo, &self.cfg.plan) + }; + if plan.is_empty() { + debug!("rebalancer: no moves needed this tick"); + return; + } + info!( + move_count = plan.len(), + "rebalancer: dispatching planned moves" + ); + for mv in plan { + let dispatcher = Arc::clone(&self.dispatcher); + tokio::spawn(async move { + if let Err(e) = dispatcher.dispatch(mv).await { + warn!(error = %e, "rebalancer: dispatch failed"); + } + }); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::rebalancer::metrics::LoadMetrics; + use crate::topology::{NodeInfo, NodeState}; + use std::net::SocketAddr; + use std::sync::Mutex; + + struct StaticMetrics(Vec); + + #[async_trait] + impl LoadMetricsProvider for StaticMetrics { + async fn snapshot(&self) -> Result> { + Ok(self.0.clone()) + } + } + + struct RecordingDispatcher { + calls: Mutex>, + } + + impl RecordingDispatcher { + fn new() -> Arc { + Arc::new(Self { + calls: Mutex::new(Vec::new()), + }) + } + fn take(&self) -> Vec { + let mut g = self.calls.lock().unwrap(); + let out = g.clone(); + g.clear(); + out + } + } + + #[async_trait] + impl MigrationDispatcher for 
RecordingDispatcher { + async fn dispatch(&self, mv: PlannedMove) -> Result<()> { + self.calls.lock().unwrap().push(mv); + Ok(()) + } + } + + struct BlockingGate(bool); + + #[async_trait] + impl ElectionGate for BlockingGate { + async fn any_group_electing(&self) -> bool { + self.0 + } + } + + fn topo(nodes: &[u64]) -> Arc> { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let a: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + Arc::new(RwLock::new(t)) + } + + fn routing_hot_on(node: u64) -> Arc> { + let mut r = RoutingTable::uniform(6, &[1, 2, 3], 1); + for gid in 0..6 { + r.set_leader(gid, node); + } + Arc::new(RwLock::new(r)) + } + + fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + cpu_utilization: 0.0, + } + } + + fn hot_cluster_loop( + gate: Arc, + ) -> (Arc, Arc) { + let metrics: Arc = Arc::new(StaticMetrics(vec![ + lm(1, 500, 5000, 200.0, 200.0), + lm(2, 5, 5, 5.0, 5.0), + lm(3, 5, 5, 5.0, 5.0), + ])); + let dispatcher = RecordingDispatcher::new(); + let disp_dyn: Arc = dispatcher.clone(); + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig { + interval: Duration::from_millis(50), + ..Default::default() + }, + metrics, + disp_dyn, + gate, + routing_hot_on(1), + topo(&[1, 2, 3]), + )); + (rloop, dispatcher) + } + + #[tokio::test] + async fn sweep_dispatches_moves_when_imbalanced() { + let (rloop, dispatcher) = hot_cluster_loop(Arc::new(AlwaysReadyGate)); + rloop.sweep_once().await; + for _ in 0..16 { + tokio::task::yield_now().await; + } + let calls = dispatcher.take(); + assert!(!calls.is_empty()); + for c in &calls { + assert_eq!(c.source_node, 1); + } + } + + #[tokio::test] + async fn sweep_skipped_during_election() { + let (rloop, dispatcher) = 
hot_cluster_loop(Arc::new(BlockingGate(true))); + rloop.sweep_once().await; + for _ in 0..8 { + tokio::task::yield_now().await; + } + assert!(dispatcher.take().is_empty()); + } + + #[tokio::test] + async fn sweep_noop_on_balanced_cluster() { + let metrics: Arc = Arc::new(StaticMetrics(vec![ + lm(1, 50, 500, 100.0, 100.0), + lm(2, 50, 500, 100.0, 100.0), + lm(3, 50, 500, 100.0, 100.0), + ])); + let dispatcher = RecordingDispatcher::new(); + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig::default(), + metrics, + dispatcher.clone() as Arc, + Arc::new(AlwaysReadyGate), + routing_hot_on(1), + topo(&[1, 2, 3]), + )); + rloop.sweep_once().await; + for _ in 0..8 { + tokio::task::yield_now().await; + } + assert!(dispatcher.take().is_empty()); + } + + #[tokio::test(start_paused = true)] + async fn run_loop_fires_sweeps_and_shuts_down() { + let (rloop, dispatcher) = hot_cluster_loop(Arc::new(AlwaysReadyGate)); + let (tx, rx) = watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&rloop); + async move { d.run(rx).await } + }); + // First tick consumed immediately by run(); advance past a + // couple of real intervals with interleaved yields so the + // run-loop's select + spawned dispatch tasks all get to poll. 
+ for _ in 0..4 { + tokio::time::advance(Duration::from_millis(80)).await; + for _ in 0..16 { + tokio::task::yield_now().await; + } + } + assert!(!dispatcher.take().is_empty()); + + let _ = tx.send(true); + let _ = tokio::time::timeout(Duration::from_millis(500), handle).await; + } + + #[tokio::test] + async fn sweep_skipped_under_cpu_backpressure() { + let metrics: Arc = Arc::new(StaticMetrics(vec![ + LoadMetrics { + node_id: 1, + vshards_led: 500, + bytes_stored: 5000 * 1_048_576, + writes_per_sec: 200.0, + reads_per_sec: 200.0, + cpu_utilization: 0.95, // above 80% threshold + }, + lm(2, 5, 5, 5.0, 5.0), + lm(3, 5, 5, 5.0, 5.0), + ])); + let dispatcher = RecordingDispatcher::new(); + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig { + interval: Duration::from_millis(50), + ..Default::default() + }, + metrics, + dispatcher.clone() as Arc, + Arc::new(AlwaysReadyGate), + routing_hot_on(1), + topo(&[1, 2, 3]), + )); + rloop.sweep_once().await; + for _ in 0..8 { + tokio::task::yield_now().await; + } + assert!( + dispatcher.take().is_empty(), + "dispatcher should not fire when cluster is under CPU backpressure" + ); + } +} diff --git a/nodedb-cluster/src/rebalancer/elastic.rs b/nodedb-cluster/src/rebalancer/elastic.rs new file mode 100644 index 00000000..36903741 --- /dev/null +++ b/nodedb-cluster/src/rebalancer/elastic.rs @@ -0,0 +1,147 @@ +//! Elastic scaling glue — ties SWIM membership transitions to the +//! rebalancer loop so new/departing nodes trigger an immediate sweep +//! instead of waiting for the next 30 s tick. +//! +//! ## Add-node path +//! +//! 1. Node joins via the existing bootstrap/join RPC path. +//! 2. `CacheApplier` with live state applies `TopologyChange::Join` +//! + `PromoteToVoter`, adding the node to the live topology. +//! 3. SWIM detects the new node as `Alive` through gossip. +//! 4. [`RebalancerKickHook`] (a [`MembershipSubscriber`]) fires +//! [`Notify::notify_one`] on the rebalancer loop's kick handle. +//! 5. 
The loop wakes, collects metrics (including the new node's +//! low load score), and dispatches moves to the new node. +//! +//! ## Remove-node path +//! +//! 1. Operator runs `cluster decommission N` (Phase E.4). +//! 2. The decommission flow strips the node from all groups and +//! removes it from topology. +//! 3. SWIM detects the node as `Dead` / `Left`. +//! 4. The same kick hook wakes the rebalancer so it re-evaluates +//! whether the remaining nodes are balanced. +//! +//! No new data types or traits — just a [`MembershipSubscriber`] +//! impl holding a shared `Arc`. + +use std::sync::Arc; + +use nodedb_types::NodeId; +use tokio::sync::Notify; +use tracing::debug; + +use crate::swim::member::MemberState; +use crate::swim::subscriber::MembershipSubscriber; + +/// SWIM [`MembershipSubscriber`] that triggers an immediate +/// rebalancer sweep on membership-relevant transitions. +/// +/// Relevant transitions are: +/// - `None → Alive` (first time a new node is seen — add path) +/// - `_ → Dead` / `_ → Left` (node departure — remove path) +/// - `_ → Alive` after `Dead`/`Left` (node recovery) +/// +/// All other transitions (Alive → Suspect, Suspect → Alive) are +/// transient and do not change the set of Active nodes, so they +/// are ignored. +pub struct RebalancerKickHook { + kick: Arc, +} + +impl RebalancerKickHook { + pub fn new(kick: Arc) -> Self { + Self { kick } + } +} + +impl MembershipSubscriber for RebalancerKickHook { + fn on_state_change(&self, node_id: &NodeId, old: Option, new: MemberState) { + let relevant = match (old, new) { + // First-time insert as Alive (new node joined). + (None, MemberState::Alive) => true, + // Node died or left. + (_, MemberState::Dead) | (_, MemberState::Left) => true, + // Node recovered from Dead/Left back to Alive. 
+ (Some(MemberState::Dead), MemberState::Alive) + | (Some(MemberState::Left), MemberState::Alive) => true, + _ => false, + }; + if relevant { + debug!(?node_id, ?old, ?new, "rebalancer kick: membership change"); + self.kick.notify_one(); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::{AtomicU32, Ordering}; + + fn counting_notify() -> (Arc, Arc, tokio::task::JoinHandle<()>) { + let notify = Arc::new(Notify::new()); + let counter = Arc::new(AtomicU32::new(0)); + let n = notify.clone(); + let c = counter.clone(); + let handle = tokio::spawn(async move { + loop { + n.notified().await; + c.fetch_add(1, Ordering::SeqCst); + } + }); + (notify, counter, handle) + } + + #[tokio::test] + async fn kick_fires_on_new_node_alive() { + let (notify, counter, handle) = counting_notify(); + let hook = RebalancerKickHook::new(notify); + hook.on_state_change(&NodeId::new("new"), None, MemberState::Alive); + tokio::task::yield_now().await; + tokio::task::yield_now().await; + assert!(counter.load(Ordering::SeqCst) >= 1); + handle.abort(); + } + + #[tokio::test] + async fn kick_fires_on_dead() { + let (notify, counter, handle) = counting_notify(); + let hook = RebalancerKickHook::new(notify); + hook.on_state_change( + &NodeId::new("x"), + Some(MemberState::Alive), + MemberState::Dead, + ); + tokio::task::yield_now().await; + tokio::task::yield_now().await; + assert!(counter.load(Ordering::SeqCst) >= 1); + handle.abort(); + } + + #[tokio::test] + async fn kick_fires_on_left() { + let (notify, counter, handle) = counting_notify(); + let hook = RebalancerKickHook::new(notify); + hook.on_state_change( + &NodeId::new("x"), + Some(MemberState::Alive), + MemberState::Left, + ); + tokio::task::yield_now().await; + tokio::task::yield_now().await; + assert!(counter.load(Ordering::SeqCst) >= 1); + handle.abort(); + } + + #[test] + fn kick_does_not_fire_on_suspect() { + let notify = Arc::new(Notify::new()); + let hook = RebalancerKickHook::new(notify); + 
hook.on_state_change( + &NodeId::new("x"), + Some(MemberState::Alive), + MemberState::Suspect, + ); + } +} diff --git a/nodedb-cluster/src/rebalancer/metrics.rs b/nodedb-cluster/src/rebalancer/metrics.rs new file mode 100644 index 00000000..b9c9f5b5 --- /dev/null +++ b/nodedb-cluster/src/rebalancer/metrics.rs @@ -0,0 +1,147 @@ +//! Per-node load metrics and scoring. +//! +//! `LoadMetrics` is the raw per-node observation the rebalancer loop +//! consumes. `normalized_score` folds a `LoadMetrics` plus a set of +//! `LoadWeights` into a single `f64` so different nodes can be +//! compared on one axis — the hotter the score, the more work the +//! node is doing relative to the cluster. +//! +//! Weights are configurable because different workloads care about +//! different dimensions: a write-heavy OLTP cluster wants high +//! `writes` weight, an analytical cluster wants high `bytes` +//! weight, and a very uniform vshard layout wants high `vshards` +//! weight. The defaults (1.0 each) are a balanced starting point. + +use async_trait::async_trait; + +use crate::error::Result; + +/// Raw load observation for a single node. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct LoadMetrics { + pub node_id: u64, + /// Count of vshards this node is currently leading. + pub vshards_led: u32, + /// Total bytes stored across all vshards on this node. + pub bytes_stored: u64, + /// Writes per second (rolling average, caller-defined window). + pub writes_per_sec: f64, + /// Reads per second (rolling average, caller-defined window). + pub reads_per_sec: f64, + /// Per-core CPU utilization (0.0–1.0). Used by the + /// back-pressure gate to pause the rebalancer when the cluster + /// is already stressed. + pub cpu_utilization: f64, +} + +/// Relative weights for the four load dimensions. Scaled linearly; +/// the absolute values don't matter, only their ratios. 
+#[derive(Debug, Clone, Copy)] +pub struct LoadWeights { + pub vshards: f64, + pub bytes: f64, + pub writes: f64, + pub reads: f64, +} + +impl Default for LoadWeights { + fn default() -> Self { + Self { + vshards: 1.0, + bytes: 1.0, + writes: 1.0, + reads: 1.0, + } + } +} + +/// Collapse a `LoadMetrics` observation into a single scalar score +/// using `weights`. Higher = hotter. +/// +/// The implementation is a straightforward weighted sum — each field +/// is scaled by its weight and added. Bytes are divided by a +/// reasonable unit (1 MiB) so the float stays in a comparable range +/// to the per-second rates; otherwise a moderately-sized dataset +/// would swamp the qps signal entirely. +pub fn normalized_score(m: &LoadMetrics, weights: &LoadWeights) -> f64 { + const BYTES_UNIT: f64 = 1_048_576.0; // 1 MiB + weights.vshards * m.vshards_led as f64 + + weights.bytes * (m.bytes_stored as f64 / BYTES_UNIT) + + weights.writes * m.writes_per_sec + + weights.reads * m.reads_per_sec +} + +/// Injection seam for collecting load metrics from every node in the +/// cluster. Production impls talk to the metrics endpoint via the +/// transport; tests inject synthetic values. +#[async_trait] +pub trait LoadMetricsProvider: Send + Sync { + /// Return a snapshot of every known node's current load metrics. + /// The returned slice may be in any order — the rebalancer plan + /// sorts internally for determinism. 
+ async fn snapshot(&self) -> Result>; +} + +#[cfg(test)] +mod tests { + use super::*; + + fn m(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + cpu_utilization: 0.0, + } + } + + #[test] + fn default_weights_are_uniform() { + let w = LoadWeights::default(); + assert_eq!(w.vshards, 1.0); + assert_eq!(w.bytes, 1.0); + assert_eq!(w.writes, 1.0); + assert_eq!(w.reads, 1.0); + } + + #[test] + fn zero_metrics_score_zero() { + let metrics = m(1, 0, 0, 0.0, 0.0); + assert_eq!(normalized_score(&metrics, &LoadWeights::default()), 0.0); + } + + #[test] + fn score_sums_all_dimensions_with_default_weights() { + // 4 vshards + 8 MiB + 2 wps + 3 rps = 17.0 + let metrics = m(1, 4, 8, 2.0, 3.0); + let score = normalized_score(&metrics, &LoadWeights::default()); + assert!((score - 17.0).abs() < 1e-9); + } + + #[test] + fn weights_scale_dimensions_independently() { + let metrics = m(1, 10, 0, 0.0, 0.0); + let w = LoadWeights { + vshards: 5.0, + ..Default::default() + }; + assert!((normalized_score(&metrics, &w) - 50.0).abs() < 1e-9); + } + + #[test] + fn hotter_node_has_higher_score() { + let cold = m(1, 1, 1, 1.0, 1.0); + let hot = m(2, 10, 100, 100.0, 100.0); + let w = LoadWeights::default(); + assert!(normalized_score(&hot, &w) > normalized_score(&cold, &w)); + } + + #[test] + fn bytes_scale_via_mib_unit() { + // 1 MiB with bytes weight = 1.0 contributes 1.0, not 1_048_576. + let metrics = m(1, 0, 1, 0.0, 0.0); + assert!((normalized_score(&metrics, &LoadWeights::default()) - 1.0).abs() < 1e-9); + } +} diff --git a/nodedb-cluster/src/rebalancer/mod.rs b/nodedb-cluster/src/rebalancer/mod.rs new file mode 100644 index 00000000..3374ca06 --- /dev/null +++ b/nodedb-cluster/src/rebalancer/mod.rs @@ -0,0 +1,34 @@ +//! Load-based automatic rebalancer. +//! +//! This module is the *signal* side of the rebalancer: given a +//! 
per-node snapshot of load metrics (vshards led, bytes stored, +//! writes/sec, reads/sec) it computes whether the cluster is +//! imbalanced enough to warrant moves, and emits a bounded plan of +//! vshard migrations from the hottest nodes to the coldest ones. +//! +//! The actual driver loop (`loop_driver.rs`) and the bridge to +//! `MigrationExecutor` land in a follow-up sub-batch. Everything +//! shipped here is pure, side-effect-free, and fully deterministic +//! so it can be unit-tested exhaustively before any tokio task is +//! spawned against it. +//! +//! ## Why a new module +//! +//! The existing [`crate::rebalance_scheduler::RebalanceScheduler`] +//! triggers on CPU utilization, SPSC queue pressure, and shard-count +//! ratio. Those are fast-path overload signals and belong where they +//! are. This module is a distinct, storage-shape-driven rebalancer +//! (bytes + qps + vshard count) with bounded in-flight moves and a +//! 30 s cadence, complementing the overload path. + +pub mod driver; +pub mod elastic; +pub mod metrics; +pub mod plan; + +pub use driver::{ + AlwaysReadyGate, ElectionGate, MigrationDispatcher, RebalancerLoop, RebalancerLoopConfig, +}; +pub use elastic::RebalancerKickHook; +pub use metrics::{LoadMetrics, LoadMetricsProvider, LoadWeights, normalized_score}; +pub use plan::{RebalancerPlanConfig, compute_load_based_plan}; diff --git a/nodedb-cluster/src/rebalancer/plan.rs b/nodedb-cluster/src/rebalancer/plan.rs new file mode 100644 index 00000000..ae68712a --- /dev/null +++ b/nodedb-cluster/src/rebalancer/plan.rs @@ -0,0 +1,366 @@ +//! Load-imbalance plan computation. +//! +//! Given a snapshot of per-node `LoadMetrics` and the current routing +//! table, decide whether the cluster is imbalanced enough to justify +//! moves and, if so, emit a bounded list of `PlannedMove`s from the +//! hottest nodes to the coldest ones. +//! +//! ## Trigger +//! +//! The rebalancer fires when, after normalizing every node's score: +//! +//! 
> `max - min > threshold_pct / 100 * mean` +//! +//! ...i.e. the hottest node is more than `threshold_pct`% above the +//! cluster mean relative to the coldest one. This is intentionally +//! not a per-node check: single-hot-node scenarios below the +//! cluster mean delta are handled by the separate +//! `rebalance_scheduler` CPU/queue triggers. +//! +//! ## Move selection +//! +//! For each hot→cold pair, the planner walks the routing table in +//! stable (sorted by group_id, then vshard_id) order and picks +//! vshards the hot node is currently leading. It caps moves at +//! `max_moves_per_group` moves from any single group (so one +//! over-replicated group can't consume the entire in-flight budget) +//! and at `max_moves_total` across the whole plan (so the dispatcher +//! never has more than that many migrations in flight at once). +//! +//! Determinism: the plan is deterministic given the same inputs, +//! including tie-breaks. Two nodes computing the plan at the same +//! instant produce byte-identical outputs. + +use std::collections::HashMap; + +use tracing::debug; + +use crate::rebalance::PlannedMove; +use crate::routing::RoutingTable; +use crate::topology::ClusterTopology; + +use super::metrics::{LoadMetrics, LoadWeights, normalized_score}; + +/// Configuration for [`compute_load_based_plan`]. +#[derive(Debug, Clone)] +pub struct RebalancerPlanConfig { + /// If `(max - min) > (threshold_pct / 100) * mean`, we plan moves. + /// Default: 20%. + pub imbalance_threshold_pct: u8, + /// Maximum moves from any single Raft group per plan. Default 1. + pub max_moves_per_group: usize, + /// Maximum moves in the entire plan. Default 10. + pub max_moves_total: usize, + /// Weights applied to the load dimensions when scoring. 
+ pub weights: LoadWeights, +} + +impl Default for RebalancerPlanConfig { + fn default() -> Self { + Self { + imbalance_threshold_pct: 20, + max_moves_per_group: 1, + max_moves_total: 10, + weights: LoadWeights::default(), + } + } +} + +/// Compute a load-driven rebalance plan. Returns an empty vector if +/// the cluster is already within the imbalance threshold or if there +/// are fewer than two nodes to compare. +pub fn compute_load_based_plan( + metrics: &[LoadMetrics], + routing: &RoutingTable, + topology: &ClusterTopology, + cfg: &RebalancerPlanConfig, +) -> Vec { + if metrics.len() < 2 { + return Vec::new(); + } + + // Score every node, then sort ascending so the hot list and cold + // list are natural slices. `f64` isn't Ord, so use total_cmp for + // NaN-free deterministic ordering. + let mut scored: Vec<(u64, f64)> = metrics + .iter() + .map(|m| (m.node_id, normalized_score(m, &cfg.weights))) + .collect(); + scored.sort_by(|a, b| a.1.total_cmp(&b.1).then_with(|| a.0.cmp(&b.0))); + + let min = scored.first().map(|(_, s)| *s).unwrap_or(0.0); + let max = scored.last().map(|(_, s)| *s).unwrap_or(0.0); + let mean: f64 = scored.iter().map(|(_, s)| *s).sum::() / scored.len() as f64; + + // Imbalance gate. A zero-mean cluster (everything idle) is + // considered already balanced — nothing to move. + if mean <= 0.0 { + return Vec::new(); + } + let threshold = (cfg.imbalance_threshold_pct as f64 / 100.0) * mean; + if (max - min) <= threshold { + debug!( + max, + min, mean, threshold, "rebalancer: cluster within imbalance threshold" + ); + return Vec::new(); + } + + // Only Active nodes are valid migration targets. Cold candidates + // must be Active and must not already be the source for a move. + let active_set: std::collections::HashSet = + topology.active_nodes().iter().map(|n| n.node_id).collect(); + + // Hot = strictly above mean; cold = strictly below mean. 
Using + // the mean as the split point (rather than index-based halving) + // correctly handles asymmetric distributions where a single + // outlier pulls one node above an otherwise balanced cluster — + // the below-mean nodes stay in the cold set even if they tie + // with each other. + let hot_nodes: Vec = scored + .iter() + .rev() // hottest first + .filter(|(_, s)| *s > mean) + .map(|(id, _)| *id) + .collect(); + let cold_nodes: Vec = scored + .iter() + .filter(|(_, s)| *s < mean) + .filter(|(id, _)| active_set.contains(id)) + .map(|(id, _)| *id) + .collect(); + + if cold_nodes.is_empty() { + return Vec::new(); + } + + // Walk routing in stable order — group id ascending, then vshard + // id ascending — and pick moves until we hit the caps. + let mut group_ids: Vec = routing.group_members().keys().copied().collect(); + group_ids.sort_unstable(); + + let mut moves: Vec = Vec::new(); + let mut per_group_count: HashMap = HashMap::new(); + let mut cold_cursor = 0usize; + + 'outer: for hot in &hot_nodes { + if !active_set.contains(hot) { + continue; + } + for &gid in &group_ids { + if moves.len() >= cfg.max_moves_total { + break 'outer; + } + let info = match routing.group_info(gid) { + Some(i) => i, + None => continue, + }; + if info.leader != *hot { + continue; + } + if *per_group_count.get(&gid).unwrap_or(&0) >= cfg.max_moves_per_group { + continue; + } + // Pick the group's lowest vshard id deterministically. 
+ let mut vshards = routing.vshards_for_group(gid); + vshards.sort_unstable(); + let Some(&vshard_id) = vshards.first() else { + continue; + }; + let target = cold_nodes[cold_cursor % cold_nodes.len()]; + if target == *hot { + continue; + } + moves.push(PlannedMove { + vshard_id, + source_node: *hot, + target_node: target, + source_group: gid, + }); + *per_group_count.entry(gid).or_default() += 1; + cold_cursor += 1; + } + } + + moves +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::{NodeInfo, NodeState}; + use std::net::SocketAddr; + + fn topo(nodes: &[u64]) -> ClusterTopology { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let a: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + t + } + + fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + cpu_utilization: 0.0, + } + } + + #[test] + fn empty_metrics_returns_empty_plan() { + let t = topo(&[1, 2]); + let r = RoutingTable::uniform(2, &[1, 2], 1); + let plan = compute_load_based_plan(&[], &r, &t, &RebalancerPlanConfig::default()); + assert!(plan.is_empty()); + } + + #[test] + fn single_node_returns_empty_plan() { + let t = topo(&[1]); + let r = RoutingTable::uniform(1, &[1], 1); + let plan = compute_load_based_plan( + &[lm(1, 100, 100, 100.0, 100.0)], + &r, + &t, + &RebalancerPlanConfig::default(), + ); + assert!(plan.is_empty()); + } + + #[test] + fn balanced_cluster_no_moves() { + let t = topo(&[1, 2, 3]); + let r = RoutingTable::uniform(3, &[1, 2, 3], 1); + let metrics = vec![ + lm(1, 10, 100, 50.0, 50.0), + lm(2, 10, 100, 50.0, 50.0), + lm(3, 10, 100, 50.0, 50.0), + ]; + let plan = compute_load_based_plan(&metrics, &r, &t, &RebalancerPlanConfig::default()); + assert!(plan.is_empty()); + } + + #[test] + fn 
imbalance_above_threshold_triggers_moves() { + let t = topo(&[1, 2, 3]); + let r = RoutingTable::uniform(6, &[1, 2, 3], 1); + // Node 1 massively overloaded. + let metrics = vec![ + lm(1, 200, 1000, 500.0, 500.0), + lm(2, 10, 50, 25.0, 25.0), + lm(3, 10, 50, 25.0, 25.0), + ]; + let plan = compute_load_based_plan(&metrics, &r, &t, &RebalancerPlanConfig::default()); + assert!(!plan.is_empty()); + // Every move must source from node 1. + for m in &plan { + assert_eq!(m.source_node, 1); + } + } + + #[test] + fn plan_respects_max_moves_total() { + let t = topo(&[1, 2]); + // 20 groups so node 1 can lead many. + let mut r = RoutingTable::uniform(20, &[1, 2], 1); + for gid in 0..20 { + r.set_leader(gid, 1); + } + let metrics = vec![lm(1, 2000, 10_000, 5000.0, 5000.0), lm(2, 1, 1, 1.0, 1.0)]; + let cfg = RebalancerPlanConfig { + max_moves_total: 4, + max_moves_per_group: 1, + ..Default::default() + }; + let plan = compute_load_based_plan(&metrics, &r, &t, &cfg); + assert_eq!(plan.len(), 4); + } + + #[test] + fn plan_respects_max_moves_per_group() { + let t = topo(&[1, 2]); + let mut r = RoutingTable::uniform(3, &[1, 2], 1); + for gid in 0..3 { + r.set_leader(gid, 1); + } + let metrics = vec![lm(1, 2000, 10_000, 5000.0, 5000.0), lm(2, 1, 1, 1.0, 1.0)]; + let cfg = RebalancerPlanConfig { + max_moves_total: 99, + max_moves_per_group: 1, + ..Default::default() + }; + let plan = compute_load_based_plan(&metrics, &r, &t, &cfg); + // With max_moves_per_group=1 and 3 groups, at most 3 moves. 
+ assert!(plan.len() <= 3); + let mut by_group: HashMap = HashMap::new(); + for m in &plan { + *by_group.entry(m.source_group).or_default() += 1; + } + for (_, count) in by_group { + assert!(count <= 1); + } + } + + #[test] + fn plan_is_deterministic() { + let t = topo(&[1, 2, 3]); + let mut r = RoutingTable::uniform(6, &[1, 2, 3], 1); + for gid in 0..6 { + r.set_leader(gid, 1); + } + let metrics = vec![ + lm(1, 500, 5000, 200.0, 200.0), + lm(2, 5, 5, 5.0, 5.0), + lm(3, 5, 5, 5.0, 5.0), + ]; + let cfg = RebalancerPlanConfig::default(); + let p1 = compute_load_based_plan(&metrics, &r, &t, &cfg); + let p2 = compute_load_based_plan(&metrics, &r, &t, &cfg); + let p1_tuples: Vec<_> = p1 + .iter() + .map(|m| (m.vshard_id, m.source_node, m.target_node, m.source_group)) + .collect(); + let p2_tuples: Vec<_> = p2 + .iter() + .map(|m| (m.vshard_id, m.source_node, m.target_node, m.source_group)) + .collect(); + assert_eq!(p1_tuples, p2_tuples); + } + + #[test] + fn idle_cluster_never_triggers() { + let t = topo(&[1, 2, 3]); + let r = RoutingTable::uniform(3, &[1, 2, 3], 1); + let metrics = vec![ + lm(1, 0, 0, 0.0, 0.0), + lm(2, 0, 0, 0.0, 0.0), + lm(3, 0, 0, 0.0, 0.0), + ]; + let plan = compute_load_based_plan(&metrics, &r, &t, &RebalancerPlanConfig::default()); + assert!(plan.is_empty()); + } + + #[test] + fn cold_node_must_be_active() { + // Node 3 is not Active (it's Draining) → cannot receive. 
+ let mut t = topo(&[1, 2, 3]); + t.set_state(3, NodeState::Draining); + let mut r = RoutingTable::uniform(2, &[1, 2, 3], 1); + r.set_leader(0, 1); + r.set_leader(1, 1); + let metrics = vec![ + lm(1, 500, 5000, 200.0, 200.0), + lm(2, 5, 5, 5.0, 5.0), + lm(3, 0, 0, 0.0, 0.0), + ]; + let plan = compute_load_based_plan(&metrics, &r, &t, &RebalancerPlanConfig::default()); + for m in &plan { + assert_ne!(m.target_node, 3, "Draining node must not receive moves"); + } + } +} diff --git a/nodedb-cluster/src/routing.rs b/nodedb-cluster/src/routing.rs index fdaaab83..1587bd37 100644 --- a/nodedb-cluster/src/routing.rs +++ b/nodedb-cluster/src/routing.rs @@ -155,6 +155,28 @@ impl RoutingTable { } } + /// Remove a node from a group's voter and learner lists. If the + /// removed node was the current leader hint, the hint is cleared + /// so the next query drives a fresh discovery. Returns `true` if + /// the group existed and anything was actually removed. + /// + /// The caller is responsible for safety: dropping below the + /// configured replication factor must be gated by + /// `decommission::safety::check_can_decommission`. + pub fn remove_group_member(&mut self, group_id: u64, node_id: u64) -> bool { + let Some(info) = self.group_members.get_mut(&group_id) else { + return false; + }; + let before_members = info.members.len(); + let before_learners = info.learners.len(); + info.members.retain(|&id| id != node_id); + info.learners.retain(|&id| id != node_id); + if info.leader == node_id { + info.leader = 0; + } + info.members.len() != before_members || info.learners.len() != before_learners + } + /// Update the learner list for a Raft group. 
pub fn set_group_learners(&mut self, group_id: u64, learners: Vec) { if let Some(info) = self.group_members.get_mut(&group_id) { @@ -274,6 +296,31 @@ mod tests { assert_eq!(rt.leader_for_vshard(0).unwrap(), 99); } + #[test] + fn remove_group_member_strips_voter_and_clears_leader() { + let mut rt = RoutingTable::uniform(2, &[1, 2, 3], 3); + rt.set_leader(0, 2); + assert!(rt.remove_group_member(0, 2)); + let info = rt.group_info(0).unwrap(); + assert!(!info.members.contains(&2)); + assert_eq!(info.leader, 0, "leader hint should be cleared"); + } + + #[test] + fn remove_group_member_strips_learner_only() { + let mut rt = RoutingTable::uniform(2, &[1, 2, 3], 3); + rt.add_group_learner(0, 9); + assert!(rt.remove_group_member(0, 9)); + let info = rt.group_info(0).unwrap(); + assert!(!info.learners.contains(&9)); + } + + #[test] + fn remove_group_member_unknown_group_returns_false() { + let mut rt = RoutingTable::uniform(1, &[1, 2], 2); + assert!(!rt.remove_group_member(99, 1)); + } + #[test] fn vshard_not_mapped() { let rt = RoutingTable::uniform(2, &[1, 2], 2); diff --git a/nodedb-cluster/src/routing_liveness.rs b/nodedb-cluster/src/routing_liveness.rs new file mode 100644 index 00000000..3e98a3c6 --- /dev/null +++ b/nodedb-cluster/src/routing_liveness.rs @@ -0,0 +1,182 @@ +//! Liveness-driven routing invalidation. +//! +//! [`RoutingLivenessHook`] is a [`MembershipSubscriber`] that clears +//! the leader hint for every Raft group whose leaseholder has just +//! been marked `Suspect`, `Dead`, or `Left` by the SWIM failure +//! detector. After the hook fires, the next query that consults the +//! routing table observes `leader == 0` (the "no leader known" +//! sentinel) and falls through to a fresh leader discovery via the +//! existing `NotLeader`-triggered election path. Clients see at most +//! one retry: the stale hint, the failed dispatch, and a refreshed +//! leader lookup. +//! +//! The hook is storage-agnostic: it holds `Arc>` +//! 
and a resolver closure that maps the string-keyed SWIM `NodeId` +//! to the numeric `u64` id used throughout the rest of the cluster +//! crate. Wiring layers (start_cluster, tests) supply the resolver +//! appropriate to their topology source. +//! +//! The hook is intentionally sync and cheap — a single `RwLock::write`, +//! a linear scan over group_members, and `set_leader(gid, 0)` for +//! each affected group. No I/O, no spawning. That keeps it safe to +//! call directly from the detector run loop. + +use std::sync::{Arc, RwLock}; + +use nodedb_types::NodeId; +use tracing::debug; + +use crate::routing::RoutingTable; +use crate::swim::MemberState; +use crate::swim::subscriber::MembershipSubscriber; + +/// Resolver mapping SWIM `NodeId` → numeric `u64` routing-table id. +/// +/// Returns `None` for members SWIM knows about but the routing table +/// does not (placeholder `seed:` entries before the first real +/// probe, transient learners, etc.). Those are silently ignored. +pub type NodeIdResolver = Arc Option + Send + Sync>; + +/// Clears the leader hint for every group led by a node that SWIM +/// has marked Suspect/Dead/Left. +pub struct RoutingLivenessHook { + routing: Arc>, + resolver: NodeIdResolver, +} + +impl RoutingLivenessHook { + pub fn new(routing: Arc>, resolver: NodeIdResolver) -> Self { + Self { routing, resolver } + } +} + +impl MembershipSubscriber for RoutingLivenessHook { + fn on_state_change(&self, node_id: &NodeId, _old: Option, new: MemberState) { + // Alive transitions are a no-op: the next query will refresh + // the leader hint naturally on NotLeader. We only invalidate + // when a leader has observably stopped being reachable. + if !matches!( + new, + MemberState::Suspect | MemberState::Dead | MemberState::Left + ) { + return; + } + + let Some(numeric_id) = (self.resolver)(node_id) else { + // SWIM knows about a node the routing table doesn't — a + // seed placeholder, a learner mid-join, or a node that + // was never registered. 
Nothing to invalidate. + return; + }; + + let mut rt = self.routing.write().unwrap_or_else(|p| p.into_inner()); + let affected: Vec = rt + .group_members() + .iter() + .filter(|(_, info)| info.leader == numeric_id) + .map(|(gid, _)| *gid) + .collect(); + for gid in &affected { + rt.set_leader(*gid, 0); + } + if !affected.is_empty() { + debug!( + ?node_id, + ?new, + numeric_id, + groups_invalidated = affected.len(), + "routing liveness hook cleared leader hints" + ); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn rt_with_leaders(pairs: &[(u64, u64)], rf: usize) -> Arc> { + // Build a routing table with `pairs.len()` groups where group + // `gid` has leader `leader`. Uses the uniform constructor to + // pick a membership, then overrides the leader. + let nodes: Vec = pairs.iter().map(|(_, l)| *l).collect(); + let mut rt = RoutingTable::uniform(pairs.len() as u64, &nodes, rf); + for (gid, leader) in pairs { + rt.set_leader(*gid, *leader); + } + Arc::new(RwLock::new(rt)) + } + + fn resolver_for(map: &'static [(&'static str, u64)]) -> NodeIdResolver { + Arc::new(move |nid: &NodeId| { + map.iter() + .find(|(s, _)| *s == nid.as_str()) + .map(|(_, n)| *n) + }) + } + + #[test] + fn dead_transition_clears_leader_for_owned_groups() { + let rt = rt_with_leaders(&[(0, 1), (1, 2), (2, 1), (3, 3)], 1); + let hook = + RoutingLivenessHook::new(rt.clone(), resolver_for(&[("a", 1), ("b", 2), ("c", 3)])); + + hook.on_state_change( + &NodeId::new("a"), + Some(MemberState::Alive), + MemberState::Dead, + ); + + let guard = rt.read().unwrap(); + assert_eq!(guard.group_info(0).unwrap().leader, 0); + assert_eq!(guard.group_info(1).unwrap().leader, 2); + assert_eq!(guard.group_info(2).unwrap().leader, 0); + assert_eq!(guard.group_info(3).unwrap().leader, 3); + } + + #[test] + fn suspect_transition_also_invalidates() { + let rt = rt_with_leaders(&[(0, 7)], 1); + let hook = RoutingLivenessHook::new(rt.clone(), resolver_for(&[("x", 7)])); + hook.on_state_change( + 
&NodeId::new("x"), + Some(MemberState::Alive), + MemberState::Suspect, + ); + assert_eq!(rt.read().unwrap().group_info(0).unwrap().leader, 0); + } + + #[test] + fn alive_transition_is_noop() { + let rt = rt_with_leaders(&[(0, 5)], 1); + let hook = RoutingLivenessHook::new(rt.clone(), resolver_for(&[("q", 5)])); + hook.on_state_change(&NodeId::new("q"), None, MemberState::Alive); + assert_eq!(rt.read().unwrap().group_info(0).unwrap().leader, 5); + } + + #[test] + fn unresolved_node_id_is_ignored() { + let rt = rt_with_leaders(&[(0, 1)], 1); + let hook = RoutingLivenessHook::new(rt.clone(), resolver_for(&[("a", 1)])); + // NodeId "seed:127.0.0.1:9000" is not in the resolver map. + hook.on_state_change( + &NodeId::new("seed:127.0.0.1:9000"), + Some(MemberState::Alive), + MemberState::Dead, + ); + // Leader untouched because the resolver returned None. + assert_eq!(rt.read().unwrap().group_info(0).unwrap().leader, 1); + } + + #[test] + fn left_is_also_invalidating() { + let rt = rt_with_leaders(&[(0, 2)], 1); + let hook = RoutingLivenessHook::new(rt.clone(), resolver_for(&[("b", 2)])); + hook.on_state_change( + &NodeId::new("b"), + Some(MemberState::Alive), + MemberState::Left, + ); + assert_eq!(rt.read().unwrap().group_info(0).unwrap().leader, 0); + } +} diff --git a/nodedb-cluster/src/swim/bootstrap.rs b/nodedb-cluster/src/swim/bootstrap.rs index 943190d1..739e6ab1 100644 --- a/nodedb-cluster/src/swim/bootstrap.rs +++ b/nodedb-cluster/src/swim/bootstrap.rs @@ -29,6 +29,7 @@ use super::incarnation::Incarnation; use super::member::MemberState; use super::member::record::MemberUpdate; use super::membership::MembershipList; +use super::subscriber::MembershipSubscriber; /// Owns a running SWIM detector and its shutdown plumbing. 
/// @@ -88,6 +89,20 @@ pub async fn spawn( local_addr: SocketAddr, seeds: Vec, transport: Arc, +) -> Result { + spawn_with_subscribers(cfg, local_id, local_addr, seeds, transport, Vec::new()).await +} + +/// Same as [`spawn`] but installs the given [`MembershipSubscriber`]s +/// on the detector before its run loop starts, so every state +/// transition is observed from the very first probe round. +pub async fn spawn_with_subscribers( + cfg: SwimConfig, + local_id: NodeId, + local_addr: SocketAddr, + seeds: Vec, + transport: Arc, + subscribers: Vec>, ) -> Result { cfg.validate()?; @@ -112,11 +127,12 @@ pub async fn spawn( } let initial_inc = cfg.initial_incarnation; - let detector = Arc::new(FailureDetector::new( + let detector = Arc::new(FailureDetector::with_subscribers( cfg, Arc::clone(&membership), transport, ProbeScheduler::new(), + subscribers, )); // Prime the dissemination queue with our own Alive record so the diff --git a/nodedb-cluster/src/swim/detector/mod.rs b/nodedb-cluster/src/swim/detector/mod.rs index 829d285a..5dccfc7b 100644 --- a/nodedb-cluster/src/swim/detector/mod.rs +++ b/nodedb-cluster/src/swim/detector/mod.rs @@ -4,8 +4,8 @@ //! probe scheduler, the suspicion timer, and the main `tokio::select!` //! loop. All actual networking is pushed behind the [`Transport`] trait //! so unit tests can run fully in-process against [`InMemoryTransport`] -//! and the real UDP transport in E-ε can slot in without touching the -//! detector logic. +//! while production uses [`UdpTransport`] — both slot into the same +//! detector without touching its logic. 
pub mod probe_round; pub mod runner; diff --git a/nodedb-cluster/src/swim/detector/probe_round.rs b/nodedb-cluster/src/swim/detector/probe_round.rs index 882a6a78..02fd0b99 100644 --- a/nodedb-cluster/src/swim/detector/probe_round.rs +++ b/nodedb-cluster/src/swim/detector/probe_round.rs @@ -413,13 +413,16 @@ mod tests { ); } - #[tokio::test(start_paused = true)] + #[tokio::test] async fn indirect_ack_saves_target() { + // No `start_paused` — paused time auto-advances timeouts + // before polling channel-woken tasks, making the indirect + // path race the timeout. With real time, the 40ms probe + // timeout is ample for the in-memory fabric (sub-µs delivery). let fab = TransportFabric::new(); - let local = Arc::new(fab.bind(addr(7000)).await) as Arc; - // Target bound but silent on the direct channel. + let local = Arc::new(fab.bind(addr(7000)).await); let _silent = fab.bind(addr(7001)).await; - let helper = fab.bind(addr(7002)).await; + let helper = Arc::new(fab.bind(addr(7002)).await); let list = membership_with_peers( "local", 7000, @@ -432,38 +435,74 @@ mod tests { let mut sched = ProbeScheduler::with_seed(1); let inflight = Arc::new(InflightProbes::new()); - // Helper task: forwards any PingReq it sees into an Ack via the - // inflight registry. Paused-runtime auto-advance drives the - // direct-ping timeout on the main task. - let inflight_helper = Arc::clone(&inflight); + // Helper task: respond to Ping (direct probe) with Ack and to + // PingReq (indirect probe) with a forwarded Ack — mirrors the + // production runner recv-loop + handle_ping_req path. The + // scheduler may pick n2 as direct target or as indirect helper + // depending on the shuffle seed, so both must be handled. 
+ let helper_t: Arc = helper.clone(); let responder = tokio::spawn(async move { loop { - let (_from, msg) = match helper.recv().await { + let (from, msg) = match helper_t.recv().await { Ok(v) => v, Err(_) => return, }; - if let SwimMessage::PingReq(req) = msg { - inflight_helper - .resolve( - req.probe_id, - SwimMessage::Ack(Ack { - probe_id: req.probe_id, - from: req.target.clone(), - incarnation: Incarnation::new(9), - piggyback: vec![], - }), - ) - .await; - return; + match msg { + SwimMessage::Ping(ping) => { + let _ = helper_t + .send( + from, + SwimMessage::Ack(Ack { + probe_id: ping.probe_id, + from: NodeId::new("n2"), + incarnation: Incarnation::new(9), + piggyback: vec![], + }), + ) + .await; + return; + } + SwimMessage::PingReq(req) => { + let _ = helper_t + .send( + from, + SwimMessage::Ack(Ack { + probe_id: req.probe_id, + from: req.target.clone(), + incarnation: Incarnation::new(9), + piggyback: vec![], + }), + ) + .await; + return; + } + _ => {} } } }); + // Recv-loop on the local endpoint: resolves inflight probes + // when Acks arrive — mirrors the production runner recv-loop. + let recv_t: Arc = local.clone(); + let recv_inflight = Arc::clone(&inflight); + let recv_loop = tokio::spawn(async move { + loop { + let (_from, msg) = match recv_t.recv().await { + Ok(v) => v, + Err(_) => return, + }; + if let SwimMessage::Ack(ref ack) = msg { + recv_inflight.resolve(ack.probe_id, msg).await; + } + } + }); + + let local_dyn: Arc = local.clone(); let dissemination = Arc::new(DisseminationQueue::new()); let outcome = ProbeRound { scheduler: &mut sched, membership: &list, - transport: &local, + transport: &local_dyn, inflight: &inflight, dissemination: &dissemination, probe_timeout: cfg().probe_timeout, @@ -476,9 +515,8 @@ mod tests { .execute() .await .expect("run"); - let _ = responder.await; - // Either direct (unlikely — n1 is silent) or indirect ack via n2. - // Whichever path fires, the outcome must be Acked. 
+ responder.abort(); + recv_loop.abort(); assert!(matches!(outcome, ProbeOutcome::Acked { .. })); } diff --git a/nodedb-cluster/src/swim/detector/runner.rs b/nodedb-cluster/src/swim/detector/runner.rs index 23997583..ef16bb2b 100644 --- a/nodedb-cluster/src/swim/detector/runner.rs +++ b/nodedb-cluster/src/swim/detector/runner.rs @@ -19,6 +19,7 @@ use crate::swim::incarnation::Incarnation; use crate::swim::member::MemberState; use crate::swim::member::record::MemberUpdate; use crate::swim::membership::{MembershipList, MergeOutcome}; +use crate::swim::subscriber::MembershipSubscriber; use crate::swim::wire::{Ack, Ping, PingReq, ProbeId, SwimMessage}; use super::probe_round::{InflightProbes, ProbeOutcome, ProbeRound}; @@ -41,6 +42,7 @@ pub struct FailureDetector { dissemination: Arc, probe_counter: AtomicU64, local_incarnation: Mutex, + subscribers: Vec>, } impl FailureDetector { @@ -51,6 +53,18 @@ impl FailureDetector { membership: Arc, transport: Arc, scheduler: ProbeScheduler, + ) -> Self { + Self::with_subscribers(cfg, membership, transport, scheduler, Vec::new()) + } + + /// Construct with a list of [`MembershipSubscriber`]s that will be + /// notified on every member state transition. + pub fn with_subscribers( + cfg: SwimConfig, + membership: Arc, + transport: Arc, + scheduler: ProbeScheduler, + subscribers: Vec>, ) -> Self { let initial_inc = cfg.initial_incarnation; Self { @@ -63,7 +77,30 @@ impl FailureDetector { dissemination: Arc::new(DisseminationQueue::new()), probe_counter: AtomicU64::new(0), local_incarnation: Mutex::new(initial_inc), + subscribers, + } + } + + /// Apply an update via [`apply_and_disseminate`] while notifying + /// every subscriber of any resulting state transition. Returns the + /// raw [`MergeOutcome`] so callers can still react to + /// `SelfRefute` etc. 
+ fn apply_and_notify(&self, update: &MemberUpdate) -> MergeOutcome { + let old_state = self.membership.get(&update.node_id).map(|m| m.state); + let outcome = apply_and_disseminate(&self.membership, &self.dissemination, update); + if self.subscribers.is_empty() { + return outcome; + } + let new_state = match self.membership.get(&update.node_id) { + Some(m) => m.state, + None => return outcome, + }; + if old_state != Some(new_state) { + for sub in &self.subscribers { + sub.on_state_change(&update.node_id, old_state, new_state); + } } + outcome } /// Shared reference to the dissemination queue. Tests use it to @@ -78,7 +115,7 @@ impl FailureDetector { /// local incarnation so subsequent probes advertise the new value. async fn ingest_piggyback(&self, piggyback: &[MemberUpdate]) { for update in piggyback { - let outcome = apply_and_disseminate(&self.membership, &self.dissemination, update); + let outcome = self.apply_and_notify(update); if let MergeOutcome::SelfRefute { new_incarnation } = outcome { let mut guard = self.local_incarnation.lock().await; if new_incarnation > *guard { @@ -140,7 +177,7 @@ impl FailureDetector { state: MemberState::Dead, incarnation: member.incarnation, }; - apply_and_disseminate(&self.membership, &self.dissemination, &dead_update); + self.apply_and_notify(&dead_update); } } @@ -174,7 +211,7 @@ impl FailureDetector { state: MemberState::Suspect, incarnation: member.incarnation, }; - apply_and_disseminate(&self.membership, &self.dissemination, &suspect_update); + self.apply_and_notify(&suspect_update); let cluster_size = self.membership.len(); self.suspicion.lock().await.arm( target, @@ -282,9 +319,9 @@ impl FailureDetector { } /// Refute a self-suspect rumour by bumping local incarnation and - /// rebroadcasting `Alive`. E-γ exposes the handle so tests can - /// assert the behaviour; the dissemination queue in E-δ will call - /// this automatically from the piggyback ingestor. + /// rebroadcasting `Alive`. 
Exposed for tests that assert the + /// refutation machinery directly; the piggyback ingestor calls + /// the same underlying path automatically in production. #[cfg(test)] pub async fn bump_local_incarnation(&self, past: Incarnation) -> Incarnation { let mut guard = self.local_incarnation.lock().await; diff --git a/nodedb-cluster/src/swim/member/record.rs b/nodedb-cluster/src/swim/member/record.rs index 22bde368..8aa98653 100644 --- a/nodedb-cluster/src/swim/member/record.rs +++ b/nodedb-cluster/src/swim/member/record.rs @@ -55,7 +55,7 @@ impl Member { } /// Serializable subset of a `Member` — everything except the monotonic -/// instant. E-β will use this as the wire payload for membership deltas. +/// instant. Used as the wire payload for membership deltas. #[derive( Debug, Clone, diff --git a/nodedb-cluster/src/swim/membership/list.rs b/nodedb-cluster/src/swim/membership/list.rs index be2d975a..e2049625 100644 --- a/nodedb-cluster/src/swim/membership/list.rs +++ b/nodedb-cluster/src/swim/membership/list.rs @@ -117,7 +117,7 @@ impl MembershipList { } /// Apply a rumour to the table. Returns the merge outcome so the caller - /// can drive the dissemination queue (E-δ). On `SelfRefute`, the local + /// can drive the dissemination queue. On `SelfRefute`, the local /// record is updated in place to carry the bumped incarnation before /// returning, so the caller only needs to gossip the new record. pub fn apply(&self, update: &MemberUpdate) -> MergeOutcome { diff --git a/nodedb-cluster/src/swim/mod.rs b/nodedb-cluster/src/swim/mod.rs index 2500e5f7..0fa706e6 100644 --- a/nodedb-cluster/src/swim/mod.rs +++ b/nodedb-cluster/src/swim/mod.rs @@ -6,19 +6,16 @@ //! incarnation refutation, dedicated acks) used by modern systems such as //! Hashicorp memberlist and Cassandra's gossiper. //! -//! ## Layer map (Phase E) +//! ## Layer map //! -//! | Sub-batch | Contents | -//! |-----------|------------------------------------------------------------| -//! 
| **E-α** | Core types — `config`, `error`, `incarnation`, `member`, `membership` (this file's children) | -//! | E-β | Wire messages (`Ping`/`PingReq`/`Ack`/`Nack`) + zerompk codec | -//! | E-γ | Failure detector loop over an injected transport trait | -//! | E-δ | Piggyback dissemination queue + convergence tests | -//! | E-ε | Real UDP transport, bootstrap seeding, cluster integration | -//! -//! E-α is deliberately side-effect-free: no tasks, no I/O, no wire formats. -//! It exposes the pure data model — member states, incarnation numbers, and -//! the state-merge rule — that every later sub-batch builds on. +//! - `config`, `error`, `incarnation`, `member`, `membership` — pure +//! data model: states, incarnation numbers, and the merge rule. +//! - `wire` — `Ping` / `PingReq` / `Ack` / `Nack` datagrams + codec. +//! - `detector` — failure detector loop over a pluggable transport +//! trait, scheduler, suspicion timer, probe round machinery. +//! - `dissemination` — piggyback queue with `lambda * log(n)` fanout. +//! - `bootstrap` — one-stop `spawn` entry point. +//! - `subscriber` — hook trait fired on every membership transition. pub mod bootstrap; pub mod config; @@ -28,6 +25,7 @@ pub mod error; pub mod incarnation; pub mod member; pub mod membership; +pub mod subscriber; pub mod wire; pub use bootstrap::{SwimHandle, spawn}; @@ -40,4 +38,5 @@ pub use error::SwimError; pub use incarnation::Incarnation; pub use member::{Member, MemberState}; pub use membership::{MembershipList, MembershipSnapshot, merge_update}; +pub use subscriber::MembershipSubscriber; pub use wire::{Ack, Nack, NackReason, Ping, PingReq, ProbeId, SwimMessage}; diff --git a/nodedb-cluster/src/swim/subscriber.rs b/nodedb-cluster/src/swim/subscriber.rs new file mode 100644 index 00000000..e7a20746 --- /dev/null +++ b/nodedb-cluster/src/swim/subscriber.rs @@ -0,0 +1,30 @@ +//! `MembershipSubscriber` — hook fired whenever SWIM observes a +//! member state transition. +//! +//! 
The failure detector invokes every registered subscriber *after* +//! applying an update to the [`MembershipList`](super::membership::MembershipList) +//! and dissemination queue, so subscribers see the post-merge view. +//! +//! Subscribers are synchronous and must not block — they typically do +//! cheap in-memory bookkeeping (e.g. clearing a routing leader hint). +//! Heavier work belongs on a dedicated task the subscriber spawns +//! itself. +//! +//! ## Lifecycle +//! +//! - `old = None` means "first time we've seen this node" (insert). +//! - `old = Some(state)` means the member existed and transitioned to +//! a strictly different `new` state. The detector never calls the +//! hook for no-op reapplies. +//! - `Left` is terminal — after it fires once the member is gone. + +use nodedb_types::NodeId; + +use super::member::MemberState; + +/// Hook trait for observers that react to SWIM membership changes. +pub trait MembershipSubscriber: Send + Sync { + /// Called after the membership list has accepted a state change + /// for `node_id`. `old` is `None` on first-time insert. + fn on_state_change(&self, node_id: &NodeId, old: Option, new: MemberState); +} diff --git a/nodedb-cluster/src/swim/wire/message.rs b/nodedb-cluster/src/swim/wire/message.rs index da884b96..56d16636 100644 --- a/nodedb-cluster/src/swim/wire/message.rs +++ b/nodedb-cluster/src/swim/wire/message.rs @@ -31,7 +31,7 @@ pub enum SwimMessage { impl SwimMessage { /// Mutable borrow of the piggyback slot, independent of variant. - /// Used by the dissemination queue (E-δ) to stamp outgoing deltas + /// Used by the dissemination queue to stamp outgoing deltas /// without caring which message type it is stamping onto. pub fn piggyback_mut(&mut self) -> &mut Vec { match self { @@ -53,7 +53,7 @@ impl SwimMessage { } /// Drop piggyback entries beyond `max`. 
Used before encoding to keep - /// a datagram below the UDP MTU — the dissemination queue (E-δ) will + /// a datagram below the UDP MTU — the dissemination queue will /// decide which updates are highest-priority; this helper just /// enforces the upper bound. pub fn truncate_piggyback(&mut self, max: usize) { diff --git a/nodedb-cluster/src/swim/wire/probe.rs b/nodedb-cluster/src/swim/wire/probe.rs index 3a115019..d17b0373 100644 --- a/nodedb-cluster/src/swim/wire/probe.rs +++ b/nodedb-cluster/src/swim/wire/probe.rs @@ -1,9 +1,8 @@ //! SWIM probe message structs. //! -//! These are the four datagram types the failure detector exchanges over -//! the network once E-ε wires in a transport. They are pure data types -//! with `serde` derives — no I/O, no validation beyond what the type -//! system enforces. +//! These are the four datagram types the failure detector exchanges +//! over the network. They are pure data types with `serde` derives — +//! no I/O, no validation beyond what the type system enforces. //! //! ## Message flow (reference) //! @@ -21,9 +20,7 @@ //! ``` //! //! Every message carries a bounded `piggyback: Vec` slot -//! used for gossip-style dissemination of membership deltas (E-δ). The -//! wire format reserves the slot now so later sub-batches don't need a -//! compatibility break. +//! used for gossip-style dissemination of membership deltas. use nodedb_types::NodeId; use serde::{Deserialize, Serialize}; diff --git a/nodedb-cluster/src/transport/client.rs b/nodedb-cluster/src/transport/client.rs index 71a7d5fd..3fbf3e15 100644 --- a/nodedb-cluster/src/transport/client.rs +++ b/nodedb-cluster/src/transport/client.rs @@ -245,15 +245,23 @@ impl NexarTransport { .ok_or(ClusterError::NodeUnreachable { node_id: target })? }; - // Connect. - let conn = self + // Connect — bounded by rpc_timeout so a hung QUIC handshake + // (peer not yet serving) doesn't block for the full 30s idle timeout. 
+ let connecting = self .listener .endpoint() .connect_with(self.client_config.clone(), addr, SNI_HOSTNAME) .map_err(|e| ClusterError::Transport { detail: format!("connect to node {target} at {addr}: {e}"), - })? + })?; + let conn = tokio::time::timeout(self.rpc_timeout, connecting) .await + .map_err(|_| ClusterError::Transport { + detail: format!( + "handshake timeout ({}ms) with node {target} at {addr}", + self.rpc_timeout.as_millis() + ), + })? .map_err(|e| ClusterError::Transport { detail: format!("handshake with node {target} at {addr}: {e}"), })?; diff --git a/nodedb-cluster/tests/common/mod.rs b/nodedb-cluster/tests/common/mod.rs index b2c1d137..341682e6 100644 --- a/nodedb-cluster/tests/common/mod.rs +++ b/nodedb-cluster/tests/common/mod.rs @@ -187,6 +187,8 @@ impl TestNode { max_backoff_secs: 2, }, swim_udp_addr: None, + election_timeout_min: std::time::Duration::from_millis(150), + election_timeout_max: std::time::Duration::from_millis(300), }; let lifecycle = ClusterLifecycleTracker::new(); @@ -292,9 +294,9 @@ impl TestNode { } /// Number of committed `CatalogDdl` entries observed by this - /// node's cache applier. After batch 1e the cluster crate - /// treats catalog DDL payloads as opaque — this counter is - /// what tests assert on for replication correctness. + /// node's cache applier. The cluster crate treats catalog DDL + /// payloads as opaque — this counter is what tests assert on + /// for replication correctness. pub fn catalog_entries_applied(&self) -> u64 { self.metadata_cache .read() diff --git a/nodedb-cluster/tests/decommission_flow.rs b/nodedb-cluster/tests/decommission_flow.rs new file mode 100644 index 00000000..ef317a9d --- /dev/null +++ b/nodedb-cluster/tests/decommission_flow.rs @@ -0,0 +1,153 @@ +//! End-to-end decommission flow. +//! +//! Wires every piece of the decommission subsystem together without +//! standing up a real metadata Raft group: +//! +//! - `CacheApplier::with_live_state` holds shared topology + routing. +//! 
- A direct in-memory `MetadataProposer` encodes each proposed +//! entry, feeds it straight into the applier with a synthetic +//! monotonically-increasing index, and returns the index — i.e. a +//! "propose and wait for commit" that is instantaneous. +//! - `DecommissionCoordinator` walks a `plan_full_decommission` +//! output through that proposer. +//! - `DecommissionObserver` watches the local topology for the +//! target's state transition and fires its shutdown watch. +//! +//! The real metadata Raft path is already exercised by +//! `metadata_replication.rs`; this test focuses on the decommission +//! state machine end to end: plan → propose → apply → live state +//! → observer signal. + +use std::net::SocketAddr; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex, RwLock}; +use std::time::Duration; + +use async_trait::async_trait; + +use nodedb_cluster::decommission::{ + DecommissionCoordinator, DecommissionObserver, MetadataProposer, plan_full_decommission, +}; +use nodedb_cluster::error::Result; +use nodedb_cluster::metadata_group::{CacheApplier, MetadataApplier, MetadataCache, encode_entry}; +use nodedb_cluster::routing::RoutingTable; +use nodedb_cluster::topology::{ClusterTopology, NodeInfo, NodeState}; +use nodedb_cluster::{DecommissionRunResult, MetadataEntry}; + +/// In-memory proposer that encodes every entry and immediately feeds +/// it through an attached `CacheApplier`, returning a synthetic +/// monotonically-increasing index. This is the "one-node metadata +/// group" equivalent the test uses to drive the decommission +/// state machine end to end in a few hundred microseconds. 
+struct DirectProposer { + applier: Arc, + next_index: AtomicU64, + proposed: Mutex>, +} + +impl DirectProposer { + fn new(applier: Arc) -> Arc { + Arc::new(Self { + applier, + next_index: AtomicU64::new(1), + proposed: Mutex::new(Vec::new()), + }) + } +} + +#[async_trait] +impl MetadataProposer for DirectProposer { + async fn propose_and_wait(&self, entry: MetadataEntry) -> Result { + let idx = self.next_index.fetch_add(1, Ordering::SeqCst); + let bytes = encode_entry(&entry).expect("encode metadata entry"); + self.applier.apply(&[(idx, bytes)]); + self.proposed.lock().unwrap().push(entry); + Ok(idx) + } +} + +#[tokio::test] +async fn end_to_end_decommission_drains_node_and_signals_shutdown() { + // --- 3 active nodes, 4 groups, RF=3. Decommission node 3 + // while RF=2 is the surviving quorum target. + let mut topo = ClusterTopology::new(); + for (i, id) in [1u64, 2, 3].iter().enumerate() { + let a: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + topo.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + let topology = Arc::new(RwLock::new(topo)); + let mut rt = RoutingTable::uniform(4, &[1, 2, 3], 3); + // Make node 3 the leader of at least one group so the plan + // emits a LeadershipTransfer entry and the applier must handle + // it live. + rt.set_leader(0, 3); + rt.set_leader(1, 1); + rt.set_leader(2, 3); + rt.set_leader(3, 2); + let routing = Arc::new(RwLock::new(rt)); + + // --- Applier with live topology + routing cascading. + let cache = Arc::new(RwLock::new(MetadataCache::new())); + let applier = Arc::new( + CacheApplier::new(cache.clone()).with_live_state(topology.clone(), routing.clone()), + ); + let proposer = DirectProposer::new(applier.clone()); + + // --- Observer running on node 3 (the target). + let (observer, mut shutdown_rx) = + DecommissionObserver::new(topology.clone(), 3, Duration::from_millis(10)); + + // --- Build the plan from a snapshot of the live state. 
+ let plan = { + let t = topology.read().unwrap(); + let r = routing.read().unwrap(); + plan_full_decommission(3, &t, &r, 2).expect("plan") + }; + let plan_len = plan.entries.len(); + + // --- Drive the coordinator. + let coordinator = DecommissionCoordinator::new(plan, proposer.clone()); + let result: DecommissionRunResult = coordinator.run().await.expect("coordinator run"); + assert_eq!(result.node_id, 3); + assert_eq!(result.entries_committed, plan_len); + + // --- Assert live state now reflects the decommission outcome. + // + // Topology: node 3 is gone (final `Leave` entry removed it). + { + let t = topology.read().unwrap(); + assert!( + t.get_node(3).is_none(), + "node 3 should be removed from topology after Leave" + ); + // Node 1 and 2 still present and unchanged. + assert_eq!(t.get_node(1).unwrap().state, NodeState::Active); + assert_eq!(t.get_node(2).unwrap().state, NodeState::Active); + } + + // Routing: node 3 is no longer in any group's member set, and + // the groups it used to lead have had their leader hints + // updated via LeadershipTransfer. + { + let r = routing.read().unwrap(); + for (gid, info) in r.group_members() { + assert!( + !info.members.contains(&3), + "group {gid} still contains node 3 after decommission" + ); + assert!( + !info.learners.contains(&3), + "group {gid} still has node 3 as learner after decommission" + ); + } + // Group 0 was led by 3 → LeadershipTransfer emitted a new + // non-3 leader; group 2 likewise. + assert_ne!(r.group_info(0).unwrap().leader, 3); + assert_ne!(r.group_info(2).unwrap().leader, 3); + } + + // --- Observer must now fire its shutdown signal on the very + // next check — the topology change already landed. 
+ assert!(observer.check_once()); + assert!(*shutdown_rx.borrow_and_update()); +} diff --git a/nodedb-cluster/tests/elastic_scaling.rs b/nodedb-cluster/tests/elastic_scaling.rs new file mode 100644 index 00000000..4574c721 --- /dev/null +++ b/nodedb-cluster/tests/elastic_scaling.rs @@ -0,0 +1,169 @@ +//! Elastic add/remove — proves the end-to-end path from membership +//! change to rebalancer dispatch. +//! +//! - **Add-node**: 3 balanced nodes, 4th node joins with zero load → +//! kick fires → sweep dispatches moves to the new node. +//! - **Remove-node**: covered by `decommission_flow.rs` — the +//! decommission plan strips the node from all groups, and the +//! rebalancer loop naturally re-evaluates on its next tick. + +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex, RwLock}; +use std::time::Duration; + +use async_trait::async_trait; + +use nodedb_cluster::error::Result; +use nodedb_cluster::rebalance::PlannedMove; +use nodedb_cluster::rebalancer::{ + AlwaysReadyGate, ElectionGate, LoadMetrics, LoadMetricsProvider, MigrationDispatcher, + RebalancerKickHook, RebalancerLoop, RebalancerLoopConfig, +}; +use nodedb_cluster::routing::RoutingTable; +use nodedb_cluster::swim::MemberState; +use nodedb_cluster::swim::subscriber::MembershipSubscriber; +use nodedb_cluster::topology::{ClusterTopology, NodeInfo, NodeState}; +use nodedb_types::NodeId; + +struct DynamicProvider { + metrics: Mutex>, +} + +impl DynamicProvider { + fn new(initial: Vec) -> Arc { + Arc::new(Self { + metrics: Mutex::new(initial), + }) + } + fn push(&self, m: LoadMetrics) { + self.metrics.lock().unwrap().push(m); + } +} + +#[async_trait] +impl LoadMetricsProvider for DynamicProvider { + async fn snapshot(&self) -> Result> { + Ok(self.metrics.lock().unwrap().clone()) + } +} + +struct RecordingDispatcher { + calls: Mutex>, + fired: AtomicBool, +} + +impl RecordingDispatcher { + fn new() -> Arc { + Arc::new(Self { + calls: Mutex::new(Vec::new()), + fired: 
AtomicBool::new(false), + }) + } +} + +#[async_trait] +impl MigrationDispatcher for RecordingDispatcher { + async fn dispatch(&self, mv: PlannedMove) -> Result<()> { + self.calls.lock().unwrap().push(mv); + self.fired.store(true, Ordering::SeqCst); + Ok(()) + } +} + +fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + cpu_utilization: 0.0, + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn add_node_triggers_rebalance_via_kick() { + // --- Initial state: 3 balanced nodes, 6 groups. + let mut topo = ClusterTopology::new(); + for (i, id) in [1u64, 2, 3].iter().enumerate() { + let a: std::net::SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + topo.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + let topology = Arc::new(RwLock::new(topo)); + let mut rt = RoutingTable::uniform(6, &[1, 2, 3], 1); + // Node 1 leads all 6 groups → hot. + for gid in 0..6 { + rt.set_leader(gid, 1); + } + let routing = Arc::new(RwLock::new(rt)); + + // Metrics: node 1 hot, 2 and 3 moderate. + let provider = DynamicProvider::new(vec![ + lm(1, 200, 2000, 200.0, 200.0), + lm(2, 50, 500, 50.0, 50.0), + lm(3, 50, 500, 50.0, 50.0), + ]); + + let dispatcher = RecordingDispatcher::new(); + let gate: Arc = Arc::new(AlwaysReadyGate); + + // Use a long interval so the normal tick doesn't fire before the + // kick does — the kick is the signal we're testing. + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig { + interval: Duration::from_secs(300), + ..Default::default() + }, + provider.clone() as Arc, + dispatcher.clone() as Arc, + gate, + routing.clone(), + topology.clone(), + )); + + // Wire the kick hook. 
+ let kick_hook = RebalancerKickHook::new(rloop.kick_handle()); + + let (shutdown_tx, shutdown_rx) = tokio::sync::watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&rloop); + async move { d.run(shutdown_rx).await } + }); + + // --- Simulate node 4 joining. + { + let mut t = topology.write().unwrap(); + let a: std::net::SocketAddr = "127.0.0.1:9003".parse().unwrap(); + t.add_node(NodeInfo::new(4, a, NodeState::Active)); + } + // Add node 4's zero-load metrics so the planner sees it as cold. + provider.push(lm(4, 0, 0, 0.0, 0.0)); + + // Fire the SWIM membership hook — this should kick the loop. + kick_hook.on_state_change(&NodeId::new("node-4"), None, MemberState::Alive); + + // Wait for the dispatcher to fire. + let deadline = std::time::Instant::now() + Duration::from_secs(3); + while std::time::Instant::now() < deadline { + if dispatcher.fired.load(Ordering::SeqCst) { + break; + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + assert!( + dispatcher.fired.load(Ordering::SeqCst), + "kick did not trigger a rebalancer dispatch" + ); + + // At least one move should target node 4 (the cold newcomer). + let calls = dispatcher.calls.lock().unwrap().clone(); + assert!(!calls.is_empty()); + let to_4 = calls.iter().filter(|m| m.target_node == 4).count(); + assert!( + to_4 > 0, + "expected at least one move targeting node 4, got {to_4}" + ); + + let _ = shutdown_tx.send(true); + let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; +} diff --git a/nodedb-cluster/tests/metadata_replication.rs b/nodedb-cluster/tests/metadata_replication.rs index 6ed9c58f..dd0133ee 100644 --- a/nodedb-cluster/tests/metadata_replication.rs +++ b/nodedb-cluster/tests/metadata_replication.rs @@ -1,8 +1,8 @@ //! Integration test: replicated metadata group commits + cache apply. //! -//! After batch 1e the `nodedb-cluster` crate no longer understands -//! per-DDL-object descriptor shapes — `CatalogDdl { payload }` is -//! opaque here. 
This test verifies the cluster-side plumbing +//! The `nodedb-cluster` crate does not understand per-DDL-object +//! descriptor shapes — `CatalogDdl { payload }` is opaque here. +//! This test verifies the cluster-side plumbing //! (raft commit + metadata applier dispatch + cache watermark) //! using synthetic opaque payloads. End-to-end cross-node DDL //! visibility (applier decoding + redb writeback + pgwire visibility) diff --git a/nodedb-cluster/tests/reachability_loop.rs b/nodedb-cluster/tests/reachability_loop.rs new file mode 100644 index 00000000..9e0ab007 --- /dev/null +++ b/nodedb-cluster/tests/reachability_loop.rs @@ -0,0 +1,142 @@ +//! Reachability loop closes the circuit-breaker blind spot. +//! +//! Scenario: peer 42 starts out unreachable so its breaker opens. +//! After a few seconds the peer "recovers" (the mock prober flips +//! from Err to Ok). The reachability driver must observe the next +//! sweep as a success and drive the breaker back to `Closed` without +//! any user traffic. + +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::{Duration, Instant}; + +use async_trait::async_trait; + +use nodedb_cluster::circuit_breaker::{CircuitBreaker, CircuitBreakerConfig, CircuitState}; +use nodedb_cluster::error::{ClusterError, Result}; +use nodedb_cluster::reachability::{ + ReachabilityDriver, ReachabilityDriverConfig, ReachabilityProber, +}; +use tokio::sync::watch; + +/// Mock prober whose success/failure can be flipped at runtime by +/// the test. Every probe call increments a hit counter so the test +/// can prove the sweep actually ran. 
+struct Flappy { + healthy: AtomicBool, +} + +impl Flappy { + fn new() -> Arc { + Arc::new(Self { + healthy: AtomicBool::new(false), + }) + } + fn heal(&self) { + self.healthy.store(true, Ordering::SeqCst); + } +} + +#[async_trait] +impl ReachabilityProber for Flappy { + async fn probe(&self, peer: u64) -> Result<()> { + if self.healthy.load(Ordering::SeqCst) { + Ok(()) + } else { + Err(ClusterError::Transport { + detail: format!("mock: peer {peer} unreachable"), + }) + } + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn reachability_loop_recovers_open_breaker_without_user_traffic() { + // --- Shared breaker, opened immediately for peer 42. --- + let breaker = Arc::new(CircuitBreaker::new(CircuitBreakerConfig { + failure_threshold: 1, + // Short cooldown so HalfOpen is eligible quickly — the + // driver still needs to drive the actual transition. + cooldown: Duration::from_millis(100), + })); + breaker.record_failure(42); + assert_eq!(breaker.state(42), CircuitState::Open); + + // --- Flappy prober starts "unhealthy". --- + let prober = Flappy::new(); + + // The driver's sweep_once calls probe() but does NOT itself + // drive record_success/record_failure — production relies on + // NexarTransport::send_rpc for that, and the mock has no such + // wrapper. So we install a relay closure that records the + // outcome against the breaker on the driver's behalf. This is + // the minimal glue needed to exercise the real loop end-to-end. + struct RelayProber { + inner: Arc, + breaker: Arc, + } + #[async_trait] + impl ReachabilityProber for RelayProber { + async fn probe(&self, peer: u64) -> Result<()> { + // Mirror send_rpc: check → probe → record outcome. 
+ if self.breaker.check(peer).is_err() { + return Err(ClusterError::CircuitOpen { + node_id: peer, + failures: self.breaker.failure_count(peer), + }); + } + match self.inner.probe(peer).await { + Ok(()) => { + self.breaker.record_success(peer); + Ok(()) + } + Err(e) => { + self.breaker.record_failure(peer); + Err(e) + } + } + } + } + let relay: Arc = Arc::new(RelayProber { + inner: prober.clone(), + breaker: Arc::clone(&breaker), + }); + + let driver = Arc::new(ReachabilityDriver::new( + Arc::clone(&breaker), + relay, + ReachabilityDriverConfig { + interval: Duration::from_millis(150), + }, + )); + let (shutdown_tx, shutdown_rx) = watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&driver); + async move { d.run(shutdown_rx).await } + }); + + // --- First few sweeps: probe keeps failing, breaker stays Open. --- + tokio::time::sleep(Duration::from_millis(500)).await; + assert_eq!( + breaker.state(42), + CircuitState::Open, + "breaker should stay open while peer is unhealthy" + ); + + // --- Heal the peer. Next sweep should drive Open → HalfOpen → Closed. --- + prober.heal(); + + let deadline = Instant::now() + Duration::from_secs(3); + loop { + if breaker.state(42) == CircuitState::Closed { + break; + } + if Instant::now() >= deadline { + panic!("breaker never recovered; state = {:?}", breaker.state(42)); + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + + let _ = shutdown_tx.send(true); + let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; +} diff --git a/nodedb-cluster/tests/rebalancer_loop.rs b/nodedb-cluster/tests/rebalancer_loop.rs new file mode 100644 index 00000000..b32cf2b6 --- /dev/null +++ b/nodedb-cluster/tests/rebalancer_loop.rs @@ -0,0 +1,177 @@ +//! End-to-end rebalancer driver loop. +//! +//! Wires every piece of the rebalancer together without standing up +//! the real `MigrationExecutor`: +//! +//! - A shared `Arc>` + `Arc>`. +//! 
- A `StaticProvider` returning a canned set of `LoadMetrics` so +//! node 1 is massively hotter than nodes 2 and 3. +//! - A `DirectDispatcher` that simulates instantaneous migration +//! completion by reassigning the vshard's group leader in the +//! live routing table and recording the call for assertions. +//! - An `AlwaysReadyGate` — no election gating in this synthetic +//! scenario. +//! +//! The test spawns the loop, advances through one sweep, asserts +//! the dispatcher observed moves exclusively from node 1 as source, +//! and asserts the routing table was actually mutated — proving the +//! full plan → dispatch → apply chain. + +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex, RwLock}; +use std::time::Duration; + +use async_trait::async_trait; +use tokio::sync::watch; + +use nodedb_cluster::error::Result; +use nodedb_cluster::rebalance::PlannedMove; +use nodedb_cluster::rebalancer::{ + AlwaysReadyGate, ElectionGate, LoadMetrics, LoadMetricsProvider, MigrationDispatcher, + RebalancerLoop, RebalancerLoopConfig, +}; +use nodedb_cluster::routing::RoutingTable; +use nodedb_cluster::topology::{ClusterTopology, NodeInfo, NodeState}; + +struct StaticProvider(Vec); + +#[async_trait] +impl LoadMetricsProvider for StaticProvider { + async fn snapshot(&self) -> Result> { + Ok(self.0.clone()) + } +} + +struct DirectDispatcher { + routing: Arc>, + calls: Mutex>, + fired: AtomicBool, +} + +impl DirectDispatcher { + fn new(routing: Arc>) -> Arc { + Arc::new(Self { + routing, + calls: Mutex::new(Vec::new()), + fired: AtomicBool::new(false), + }) + } + fn calls(&self) -> Vec { + self.calls.lock().unwrap().clone() + } + fn fired(&self) -> bool { + self.fired.load(Ordering::SeqCst) + } +} + +#[async_trait] +impl MigrationDispatcher for DirectDispatcher { + async fn dispatch(&self, mv: PlannedMove) -> Result<()> { + // Simulate a completed migration by flipping the group + // leader to the target node. 
+ { + let mut rt = self.routing.write().unwrap_or_else(|p| p.into_inner()); + rt.set_leader(mv.source_group, mv.target_node); + } + self.calls.lock().unwrap().push(mv); + self.fired.store(true, Ordering::SeqCst); + Ok(()) + } +} + +fn topo(nodes: &[u64]) -> Arc> { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let a: std::net::SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + Arc::new(RwLock::new(t)) +} + +fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + cpu_utilization: 0.0, + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn rebalancer_loop_dispatches_and_mutates_routing() { + // --- 3 active nodes, 6 groups, node 1 leads all of them (hot). + let topology = topo(&[1, 2, 3]); + let mut r = RoutingTable::uniform(6, &[1, 2, 3], 1); + for gid in 0..6 { + r.set_leader(gid, 1); + } + let routing = Arc::new(RwLock::new(r)); + + // --- Hot node 1, cold 2 and 3. + let metrics: Arc = Arc::new(StaticProvider(vec![ + lm(1, 500, 5000, 200.0, 200.0), + lm(2, 5, 5, 5.0, 5.0), + lm(3, 5, 5, 5.0, 5.0), + ])); + + let dispatcher = DirectDispatcher::new(routing.clone()); + let gate: Arc = Arc::new(AlwaysReadyGate); + + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig { + interval: Duration::from_millis(50), + ..Default::default() + }, + metrics, + dispatcher.clone() as Arc, + gate, + routing.clone(), + topology.clone(), + )); + + let (shutdown_tx, shutdown_rx) = watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&rloop); + async move { d.run(shutdown_rx).await } + }); + + // Wall-clock wait — the loop uses real time, so just give it a + // couple of intervals to sweep + spawn + dispatch. 
+ let deadline = std::time::Instant::now() + Duration::from_secs(3); + while std::time::Instant::now() < deadline { + if dispatcher.fired() { + break; + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + assert!( + dispatcher.fired(), + "rebalancer loop never dispatched a move" + ); + + // Every move must have node 1 as source. + let calls = dispatcher.calls(); + assert!(!calls.is_empty()); + for c in &calls { + assert_eq!(c.source_node, 1, "source must be the hot node"); + assert_ne!(c.target_node, 1, "target must differ from source"); + } + + // Routing mutation: at least one group previously led by 1 now + // has a non-1 leader. + { + let rt = routing.read().unwrap(); + let still_on_1 = (0..6) + .filter(|gid| rt.group_info(*gid).unwrap().leader == 1) + .count(); + assert!( + still_on_1 < 6, + "at least one group should have moved off node 1" + ); + } + + let _ = shutdown_tx.send(true); + let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; +} diff --git a/nodedb-cluster/tests/swim_routing_invalidation.rs b/nodedb-cluster/tests/swim_routing_invalidation.rs new file mode 100644 index 00000000..6dd67422 --- /dev/null +++ b/nodedb-cluster/tests/swim_routing_invalidation.rs @@ -0,0 +1,159 @@ +//! Liveness drives routing invalidation. +//! +//! Three UDP-backed SWIM nodes form a full mesh. A shared +//! `RoutingTable` declares node B as the leader of group 0. A +//! `RoutingLivenessHook` subscribed to node A's detector is wired to +//! that routing table. When B is shut down, A's detector must observe +//! the Suspect→Dead transition and the hook must clear the leader +//! hint for group 0 within a few suspicion timeouts. 
+ +use std::sync::{Arc, RwLock}; +use std::time::{Duration, Instant}; + +use nodedb_cluster::routing::RoutingTable; +use nodedb_cluster::routing_liveness::{NodeIdResolver, RoutingLivenessHook}; +use nodedb_cluster::swim::Transport; +use nodedb_cluster::swim::bootstrap::spawn_with_subscribers; +use nodedb_cluster::{ + Incarnation, MembershipSubscriber, SwimConfig, SwimHandle, UdpTransport, spawn_swim, +}; +use nodedb_types::NodeId; + +fn fast_cfg() -> SwimConfig { + SwimConfig { + probe_interval: Duration::from_millis(50), + probe_timeout: Duration::from_millis(20), + indirect_probes: 2, + suspicion_mult: 3, + min_suspicion: Duration::from_millis(150), + initial_incarnation: Incarnation::ZERO, + max_piggyback: 6, + fanout_lambda: 3, + } +} + +fn resolver_static() -> NodeIdResolver { + Arc::new(|nid: &NodeId| match nid.as_str() { + "a" => Some(1), + "b" => Some(2), + "c" => Some(3), + _ => None, + }) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn swim_dead_leader_clears_routing_hint() { + // --- Build three real UDP transports on ephemeral ports. --- + let t_a = Arc::new( + UdpTransport::bind("127.0.0.1:0".parse().unwrap()) + .await + .unwrap(), + ); + let t_b = Arc::new( + UdpTransport::bind("127.0.0.1:0".parse().unwrap()) + .await + .unwrap(), + ); + let t_c = Arc::new( + UdpTransport::bind("127.0.0.1:0".parse().unwrap()) + .await + .unwrap(), + ); + let addr_a = t_a.local_addr(); + let addr_b = t_b.local_addr(); + let addr_c = t_c.local_addr(); + + // --- Shared routing table: 4 groups, leader = node b (id=2) for groups 0 and 2. --- + let rt = Arc::new(RwLock::new(RoutingTable::uniform(4, &[1, 2, 3], 3))); + { + let mut guard = rt.write().unwrap(); + guard.set_leader(0, 2); + guard.set_leader(1, 1); + guard.set_leader(2, 2); + guard.set_leader(3, 3); + } + + // --- Hook node A to the routing table. 
--- + let hook: Arc = + Arc::new(RoutingLivenessHook::new(rt.clone(), resolver_static())); + + let h_a: SwimHandle = spawn_with_subscribers( + fast_cfg(), + NodeId::new("a"), + addr_a, + vec![addr_b, addr_c], + t_a.clone() as Arc, + vec![hook], + ) + .await + .expect("spawn a"); + let h_b: SwimHandle = spawn_swim( + fast_cfg(), + NodeId::new("b"), + addr_b, + vec![addr_a, addr_c], + t_b.clone() as Arc, + ) + .await + .expect("spawn b"); + let h_c: SwimHandle = spawn_swim( + fast_cfg(), + NodeId::new("c"), + addr_c, + vec![addr_a, addr_b], + t_c.clone() as Arc, + ) + .await + .expect("spawn c"); + + // --- Wait for A to learn about B (real id, not placeholder). --- + let deadline = Instant::now() + Duration::from_secs(5); + loop { + let seen = h_a.membership().get(&NodeId::new("b")).is_some(); + if seen || Instant::now() >= deadline { + assert!(seen, "A never learned B's real NodeId"); + break; + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + + // --- Sanity: group 0 still led by node 2. --- + { + let guard = rt.read().unwrap(); + assert_eq!(guard.group_info(0).unwrap().leader, 2); + } + + // --- Shut B down and wait for A to invalidate the leader hint. --- + h_b.shutdown().await; + + let deadline = Instant::now() + Duration::from_secs(5); + loop { + let cleared = { + let guard = rt.read().unwrap(); + guard.group_info(0).unwrap().leader == 0 && guard.group_info(2).unwrap().leader == 0 + }; + if cleared { + break; + } + if Instant::now() >= deadline { + panic!("routing hook never cleared leader hints for groups led by B"); + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + + // Groups led by A must be untouched — A is still alive and probing. + // We do NOT assert on groups led by C: under real UDP races the + // detector may transiently flag C as Suspect while B is being + // demoted, which is the correct behaviour of the hook. 
+ { + let guard = rt.read().unwrap(); + assert_eq!( + guard.group_info(1).unwrap().leader, + 1, + "group led by local node A must not be invalidated" + ); + } + + h_a.shutdown().await; + h_c.shutdown().await; +} diff --git a/nodedb-raft/src/node/core.rs b/nodedb-raft/src/node/core.rs index 33ef4d11..0df2408a 100644 --- a/nodedb-raft/src/node/core.rs +++ b/nodedb-raft/src/node/core.rs @@ -5,6 +5,7 @@ //! replication) live in [`super::internal`]. RPC handlers live in //! [`super::rpc`]. +use std::collections::HashSet; use std::time::Instant; use crate::error::{RaftError, Result}; @@ -61,7 +62,7 @@ pub struct RaftNode { /// When the next heartbeat should be sent (leader only). pub(super) heartbeat_deadline: Instant, /// Votes received in current election. - pub(super) votes_received: Vec, + pub(super) votes_received: HashSet, /// Pending ready output. pub(super) ready: Ready, /// Known leader ID (0 = unknown). @@ -89,7 +90,7 @@ impl RaftNode { leader_state: None, election_deadline: now + config.election_timeout_max, heartbeat_deadline: now, - votes_received: Vec::new(), + votes_received: HashSet::new(), ready: Ready::default(), leader_id: 0, config, diff --git a/nodedb-raft/src/node/rpc.rs b/nodedb-raft/src/node/rpc.rs index 31a2af41..d2c5e4d4 100644 --- a/nodedb-raft/src/node/rpc.rs +++ b/nodedb-raft/src/node/rpc.rs @@ -166,7 +166,7 @@ impl RaftNode { } /// Handle RequestVote response (candidate only). 
- pub fn handle_request_vote_response(&mut self, _peer: u64, resp: &RequestVoteResponse) { + pub fn handle_request_vote_response(&mut self, peer: u64, resp: &RequestVoteResponse) { if resp.term > self.hard_state.current_term { self.become_follower(resp.term); return; @@ -177,7 +177,7 @@ impl RaftNode { } if resp.vote_granted { - self.votes_received.push(resp.term); + self.votes_received.insert(peer); let vote_count = self.votes_received.len() + 1; // +1 for self-vote if vote_count >= self.config.quorum() { diff --git a/nodedb-sql/src/ddl_ast/mod.rs b/nodedb-sql/src/ddl_ast/mod.rs new file mode 100644 index 00000000..7e115e38 --- /dev/null +++ b/nodedb-sql/src/ddl_ast/mod.rs @@ -0,0 +1,16 @@ +//! Typed AST for NodeDB-specific DDL statements. +//! +//! Every DDL command the system supports is represented as a variant +//! of [`NodedbStatement`]. The DDL router matches on this enum +//! instead of string prefixes, so the compiler catches missing +//! handlers when a new DDL is added. +//! +//! The parser ([`parse`]) converts raw SQL into a `NodedbStatement` +//! using whitespace-split token matching — the same technique the +//! old string-prefix router used, but producing a typed output. + +pub mod parse; +pub mod statement; + +pub use parse::parse; +pub use statement::NodedbStatement; diff --git a/nodedb-sql/src/ddl_ast/parse.rs b/nodedb-sql/src/ddl_ast/parse.rs new file mode 100644 index 00000000..78fb6635 --- /dev/null +++ b/nodedb-sql/src/ddl_ast/parse.rs @@ -0,0 +1,605 @@ +//! Parse raw SQL into a [`NodedbStatement`]. + +use super::statement::NodedbStatement; + +/// Try to parse a DDL statement from raw SQL. Returns `None` for +/// non-DDL queries (SELECT, INSERT, etc.) that should flow through +/// the normal planner. 
+pub fn parse(sql: &str) -> Option { + let trimmed = sql.trim(); + if trimmed.is_empty() { + return None; + } + let upper = trimmed.to_uppercase(); + let parts: Vec<&str> = trimmed.split_whitespace().collect(); + if parts.is_empty() { + return None; + } + + // ── Collection lifecycle ───────────────────────────────────── + if upper.starts_with("CREATE COLLECTION ") || upper.starts_with("CREATE TABLE ") { + let if_not_exists = upper.contains("IF NOT EXISTS"); + let name = extract_name_after_keyword(&parts, "COLLECTION") + .or_else(|| extract_name_after_keyword(&parts, "TABLE"))?; + return Some(NodedbStatement::CreateCollection { + name, + if_not_exists, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP COLLECTION ") || upper.starts_with("DROP TABLE ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "COLLECTION") + .or_else(|| extract_name_after_if_exists(&parts, "TABLE"))?; + return Some(NodedbStatement::DropCollection { name, if_exists }); + } + if upper.starts_with("ALTER COLLECTION ") || upper.starts_with("ALTER TABLE ") { + let name = extract_name_after_keyword(&parts, "COLLECTION") + .or_else(|| extract_name_after_keyword(&parts, "TABLE"))?; + return Some(NodedbStatement::AlterCollection { + name, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DESCRIBE ") && !upper.starts_with("DESCRIBE SEQUENCE") { + let name = parts.get(1)?.to_string(); + return Some(NodedbStatement::DescribeCollection { name }); + } + if upper == "\\D" || upper == "SHOW COLLECTIONS" || upper.starts_with("SHOW COLLECTIONS") { + return Some(NodedbStatement::ShowCollections); + } + + // ── Index ──────────────────────────────────────────────────── + if upper.starts_with("CREATE UNIQUE INDEX ") || upper.starts_with("CREATE UNIQUE IND") { + return Some(NodedbStatement::CreateIndex { + unique: true, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("CREATE INDEX ") { + return 
Some(NodedbStatement::CreateIndex { + unique: false, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP INDEX ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "INDEX")?; + return Some(NodedbStatement::DropIndex { + name, + collection: None, + if_exists, + }); + } + if upper.starts_with("SHOW INDEX") { + let collection = parts.get(2).map(|s| s.to_string()); + return Some(NodedbStatement::ShowIndexes { collection }); + } + if upper.starts_with("REINDEX ") { + let collection = parts.get(1)?.to_string(); + return Some(NodedbStatement::Reindex { collection }); + } + + // ── Trigger ────────────────────────────────────────────────── + if upper.starts_with("CREATE ") && upper.contains("TRIGGER ") { + let or_replace = upper.contains("OR REPLACE"); + let deferred = upper.contains("DEFERRED"); + let sync = upper.contains("SYNC"); + return Some(NodedbStatement::CreateTrigger { + or_replace, + deferred, + sync, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP TRIGGER ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "TRIGGER")?; + let collection = extract_after_keyword(&parts, "ON").unwrap_or_default(); + return Some(NodedbStatement::DropTrigger { + name, + collection, + if_exists, + }); + } + if upper.starts_with("ALTER TRIGGER ") { + return Some(NodedbStatement::AlterTrigger { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW TRIGGERS") { + let collection = if upper.starts_with("SHOW TRIGGERS ON ") { + parts.get(3).map(|s| s.to_string()) + } else { + None + }; + return Some(NodedbStatement::ShowTriggers { collection }); + } + + // ── Schedule ───────────────────────────────────────────────── + if upper.starts_with("CREATE SCHEDULE ") { + return Some(NodedbStatement::CreateSchedule { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP SCHEDULE ") { + let if_exists = upper.contains("IF 
EXISTS"); + let name = extract_name_after_if_exists(&parts, "SCHEDULE")?; + return Some(NodedbStatement::DropSchedule { name, if_exists }); + } + if upper.starts_with("ALTER SCHEDULE ") { + return Some(NodedbStatement::AlterSchedule { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW SCHEDULE HISTORY ") { + let name = parts.get(3)?.to_string(); + return Some(NodedbStatement::ShowScheduleHistory { name }); + } + if upper == "SHOW SCHEDULES" || upper.starts_with("SHOW SCHEDULES") { + return Some(NodedbStatement::ShowSchedules); + } + + // ── Sequence ───────────────────────────────────────────────── + if upper.starts_with("CREATE SEQUENCE ") { + let if_not_exists = upper.contains("IF NOT EXISTS"); + let name = extract_name_after_if_exists(&parts, "SEQUENCE")?; + return Some(NodedbStatement::CreateSequence { + name, + if_not_exists, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP SEQUENCE ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "SEQUENCE")?; + return Some(NodedbStatement::DropSequence { name, if_exists }); + } + if upper.starts_with("ALTER SEQUENCE ") { + return Some(NodedbStatement::AlterSequence { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DESCRIBE SEQUENCE ") { + let name = parts.get(2)?.to_string(); + return Some(NodedbStatement::DescribeSequence { name }); + } + if upper == "SHOW SEQUENCES" || upper.starts_with("SHOW SEQUENCES") { + return Some(NodedbStatement::ShowSequences); + } + + // ── Alert ──────────────────────────────────────────────────── + if upper.starts_with("CREATE ALERT ") { + return Some(NodedbStatement::CreateAlert { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP ALERT ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "ALERT")?; + return Some(NodedbStatement::DropAlert { name, if_exists }); + } + if upper.starts_with("ALTER ALERT ") { + return 
Some(NodedbStatement::AlterAlert { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW ALERT STATUS ") { + let name = parts.get(3)?.to_string(); + return Some(NodedbStatement::ShowAlertStatus { name }); + } + if upper.starts_with("SHOW ALERT") && !upper.starts_with("SHOW ALERT STATUS") { + return Some(NodedbStatement::ShowAlerts); + } + + // ── Retention policy ───────────────────────────────────────── + if upper.starts_with("CREATE RETENTION POLICY ") { + return Some(NodedbStatement::CreateRetentionPolicy { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP RETENTION POLICY ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "POLICY")?; + return Some(NodedbStatement::DropRetentionPolicy { name, if_exists }); + } + if upper.starts_with("ALTER RETENTION POLICY ") { + return Some(NodedbStatement::AlterRetentionPolicy { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW RETENTION POLIC") { + return Some(NodedbStatement::ShowRetentionPolicies); + } + + // ── Cluster admin ──────────────────────────────────────────── + if upper.starts_with("SHOW CLUSTER") { + return Some(NodedbStatement::ShowCluster); + } + if upper.starts_with("SHOW MIGRATIONS") { + return Some(NodedbStatement::ShowMigrations); + } + if upper.starts_with("SHOW RANGES") { + return Some(NodedbStatement::ShowRanges); + } + if upper.starts_with("SHOW ROUTING") { + return Some(NodedbStatement::ShowRouting); + } + if upper.starts_with("SHOW SCHEMA VERSION") { + return Some(NodedbStatement::ShowSchemaVersion); + } + if upper.starts_with("SHOW PEER HEALTH") { + return Some(NodedbStatement::ShowPeerHealth); + } + if upper.starts_with("REBALANCE") { + return Some(NodedbStatement::Rebalance); + } + if upper.starts_with("SHOW RAFT GROUP ") { + let id = parts.get(3)?.to_string(); + return Some(NodedbStatement::ShowRaftGroup { group_id: id }); + } + if upper.starts_with("SHOW RAFT GROUPS") || 
upper.starts_with("SHOW RAFT") { + return Some(NodedbStatement::ShowRaftGroups); + } + if upper.starts_with("ALTER RAFT GROUP ") { + return Some(NodedbStatement::AlterRaftGroup { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("REMOVE NODE ") { + let id = parts.get(2)?.to_string(); + return Some(NodedbStatement::RemoveNode { node_id: id }); + } + if upper.starts_with("SHOW NODE ") { + let id = parts.get(2)?.to_string(); + return Some(NodedbStatement::ShowNode { node_id: id }); + } + if upper.starts_with("SHOW NODES") { + return Some(NodedbStatement::ShowNodes); + } + + // ── Maintenance ────────────────────────────────────────────── + if upper.starts_with("ANALYZE") { + let collection = parts.get(1).map(|s| s.to_string()); + return Some(NodedbStatement::Analyze { collection }); + } + if upper.starts_with("COMPACT ") { + let collection = parts.get(1)?.to_string(); + return Some(NodedbStatement::Compact { collection }); + } + if upper.starts_with("SHOW COMPACTION ST") { + return Some(NodedbStatement::ShowCompactionStatus); + } + if upper.starts_with("SHOW STORAGE") { + let collection = parts.get(2).map(|s| s.to_string()); + return Some(NodedbStatement::ShowStorage { collection }); + } + + // ── Backup / restore ───────────────────────────────────────── + if upper.starts_with("BACKUP TENANT ") { + return Some(NodedbStatement::BackupTenant { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("RESTORE TENANT ") { + let dry_run = upper.ends_with(" DRY RUN") || upper.ends_with(" DRYRUN"); + return Some(NodedbStatement::RestoreTenant { + dry_run, + raw_sql: trimmed.to_string(), + }); + } + + // ── User / auth ────────────────────────────────────────────── + if upper.starts_with("CREATE USER ") { + return Some(NodedbStatement::CreateUser { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP USER ") { + let username = parts.get(2)?.to_string(); + return Some(NodedbStatement::DropUser { username }); + } + if 
upper.starts_with("ALTER USER ") { + return Some(NodedbStatement::AlterUser { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW USERS") { + return Some(NodedbStatement::ShowUsers); + } + if upper.starts_with("GRANT ROLE ") { + return Some(NodedbStatement::GrantRole { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("REVOKE ROLE ") { + return Some(NodedbStatement::RevokeRole { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("GRANT ") { + return Some(NodedbStatement::GrantPermission { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("REVOKE ") { + return Some(NodedbStatement::RevokePermission { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW PERMISSIONS") { + let collection = parts.get(2).map(|s| s.to_string()); + return Some(NodedbStatement::ShowPermissions { collection }); + } + if upper.starts_with("SHOW GRANTS") { + let username = parts.get(2).map(|s| s.to_string()); + return Some(NodedbStatement::ShowGrants { username }); + } + if upper.starts_with("SHOW TENANTS") { + return Some(NodedbStatement::ShowTenants); + } + if upper.starts_with("SHOW AUDIT") { + return Some(NodedbStatement::ShowAuditLog); + } + if upper.starts_with("SHOW CONSTRAINTS ") { + let collection = parts.get(2)?.to_string(); + return Some(NodedbStatement::ShowConstraints { collection }); + } + if upper.starts_with("SHOW TYPEGUARD") { + let collection = parts.get(2)?.to_string(); + return Some(NodedbStatement::ShowTypeGuards { collection }); + } + + // ── Change stream ──────────────────────────────────────────── + if upper.starts_with("CREATE CHANGE STREAM ") { + return Some(NodedbStatement::CreateChangeStream { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP CHANGE STREAM ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "STREAM")?; + return Some(NodedbStatement::DropChangeStream { name, if_exists }); + } + + // ── RLS 
────────────────────────────────────────────────────── + if upper.starts_with("CREATE RLS POLICY ") { + return Some(NodedbStatement::CreateRlsPolicy { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP RLS POLICY ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "POLICY")?; + let collection = extract_after_keyword(&parts, "ON").unwrap_or_default(); + return Some(NodedbStatement::DropRlsPolicy { + name, + collection, + if_exists, + }); + } + if upper.starts_with("SHOW RLS POLI") { + let collection = parts.get(3).map(|s| s.to_string()); + return Some(NodedbStatement::ShowRlsPolicies { collection }); + } + + // ── Materialized view ──────────────────────────────────────── + if upper.starts_with("CREATE MATERIALIZED VIEW ") { + return Some(NodedbStatement::CreateMaterializedView { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP MATERIALIZED VIEW ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "VIEW")?; + return Some(NodedbStatement::DropMaterializedView { name, if_exists }); + } + + // ── Continuous aggregate ───────────────────────────────────── + if upper.starts_with("CREATE CONTINUOUS AGGREGATE ") { + return Some(NodedbStatement::CreateContinuousAggregate { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP CONTINUOUS AGGREGATE ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "AGGREGATE")?; + return Some(NodedbStatement::DropContinuousAggregate { name, if_exists }); + } + + None +} + +/// Extract the object name that follows a keyword (e.g. "COLLECTION" +/// in "CREATE COLLECTION users ..."). Handles IF NOT EXISTS by +/// skipping those tokens. 
+fn extract_name_after_keyword(parts: &[&str], keyword: &str) -> Option { + let kw_upper = keyword.to_uppercase(); + let pos = parts.iter().position(|p| p.to_uppercase() == kw_upper)?; + let mut idx = pos + 1; + // Skip IF NOT EXISTS tokens. + if parts.get(idx).map(|s| s.to_uppercase()) == Some("IF".to_string()) { + idx += 1; // NOT + if parts.get(idx).map(|s| s.to_uppercase()) == Some("NOT".to_string()) { + idx += 1; // EXISTS + } + if parts.get(idx).map(|s| s.to_uppercase()) == Some("EXISTS".to_string()) { + idx += 1; + } + } + parts.get(idx).map(|s| s.to_string()) +} + +/// Extract the object name for DROP-style commands where IF EXISTS +/// may appear between the keyword and the name. +fn extract_name_after_if_exists(parts: &[&str], keyword: &str) -> Option { + extract_name_after_keyword(parts, keyword) +} + +/// Extract the token after a keyword like "ON" or "TO". +fn extract_after_keyword(parts: &[&str], keyword: &str) -> Option { + let kw_upper = keyword.to_uppercase(); + let pos = parts.iter().position(|p| p.to_uppercase() == kw_upper)?; + parts.get(pos + 1).map(|s| s.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_create_collection() { + let stmt = parse("CREATE COLLECTION users (id INT, name TEXT)").unwrap(); + match stmt { + NodedbStatement::CreateCollection { + name, + if_not_exists, + .. + } => { + assert_eq!(name, "users"); + assert!(!if_not_exists); + } + other => panic!("expected CreateCollection, got {other:?}"), + } + } + + #[test] + fn parse_create_collection_if_not_exists() { + let stmt = parse("CREATE COLLECTION IF NOT EXISTS users").unwrap(); + match stmt { + NodedbStatement::CreateCollection { + name, + if_not_exists, + .. 
+ } => { + assert_eq!(name, "users"); + assert!(if_not_exists); + } + other => panic!("expected CreateCollection, got {other:?}"), + } + } + + #[test] + fn parse_drop_collection() { + let stmt = parse("DROP COLLECTION users").unwrap(); + assert_eq!( + stmt, + NodedbStatement::DropCollection { + name: "users".into(), + if_exists: false, + } + ); + } + + #[test] + fn parse_drop_collection_if_exists() { + let stmt = parse("DROP COLLECTION IF EXISTS users").unwrap(); + assert_eq!( + stmt, + NodedbStatement::DropCollection { + name: "users".into(), + if_exists: true, + } + ); + } + + #[test] + fn parse_show_nodes() { + assert_eq!(parse("SHOW NODES"), Some(NodedbStatement::ShowNodes)); + } + + #[test] + fn parse_show_cluster() { + assert_eq!(parse("SHOW CLUSTER"), Some(NodedbStatement::ShowCluster)); + } + + #[test] + fn parse_create_trigger() { + let stmt = parse("CREATE OR REPLACE SYNC TRIGGER on_insert ...").unwrap(); + match stmt { + NodedbStatement::CreateTrigger { + or_replace, + sync, + deferred, + .. + } => { + assert!(or_replace); + assert!(sync); + assert!(!deferred); + } + other => panic!("expected CreateTrigger, got {other:?}"), + } + } + + #[test] + fn parse_drop_index_if_exists() { + let stmt = parse("DROP INDEX IF EXISTS idx_name").unwrap(); + match stmt { + NodedbStatement::DropIndex { + name, if_exists, .. 
+ } => { + assert_eq!(name, "idx_name"); + assert!(if_exists); + } + other => panic!("expected DropIndex, got {other:?}"), + } + } + + #[test] + fn parse_analyze() { + assert_eq!( + parse("ANALYZE users"), + Some(NodedbStatement::Analyze { + collection: Some("users".into()), + }) + ); + assert_eq!( + parse("ANALYZE"), + Some(NodedbStatement::Analyze { collection: None }) + ); + } + + #[test] + fn non_ddl_returns_none() { + assert!(parse("SELECT * FROM users").is_none()); + assert!(parse("INSERT INTO users VALUES (1)").is_none()); + } + + #[test] + fn parse_grant_role() { + let stmt = parse("GRANT ROLE admin TO alice").unwrap(); + match stmt { + NodedbStatement::GrantRole { raw_sql } => { + assert!(raw_sql.contains("admin")); + } + other => panic!("expected GrantRole, got {other:?}"), + } + } + + #[test] + fn parse_create_sequence_if_not_exists() { + let stmt = parse("CREATE SEQUENCE IF NOT EXISTS my_seq START 1").unwrap(); + match stmt { + NodedbStatement::CreateSequence { + name, + if_not_exists, + .. + } => { + assert_eq!(name, "my_seq"); + assert!(if_not_exists); + } + other => panic!("expected CreateSequence, got {other:?}"), + } + } + + #[test] + fn parse_restore_dry_run() { + let stmt = parse("RESTORE TENANT 1 FROM '/tmp/backup' DRY RUN").unwrap(); + match stmt { + NodedbStatement::RestoreTenant { dry_run, .. } => { + assert!(dry_run); + } + other => panic!("expected RestoreTenant, got {other:?}"), + } + } +} diff --git a/nodedb-sql/src/ddl_ast/statement.rs b/nodedb-sql/src/ddl_ast/statement.rs new file mode 100644 index 00000000..30ee3db7 --- /dev/null +++ b/nodedb-sql/src/ddl_ast/statement.rs @@ -0,0 +1,278 @@ +//! The [`NodedbStatement`] enum — one variant per DDL command. + +/// Typed representation of every NodeDB DDL statement. +/// +/// Handlers receive a fully-parsed variant instead of raw `&[&str]` +/// parts, eliminating array-index panics and enabling exhaustive +/// match coverage for new DDL commands. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum NodedbStatement { + // ── Collection lifecycle ───────────────────────────────────── + CreateCollection { + name: String, + if_not_exists: bool, + raw_sql: String, + }, + DropCollection { + name: String, + if_exists: bool, + }, + AlterCollection { + name: String, + raw_sql: String, + }, + DescribeCollection { + name: String, + }, + ShowCollections, + + // ── Index ──────────────────────────────────────────────────── + CreateIndex { + unique: bool, + raw_sql: String, + }, + DropIndex { + name: String, + collection: Option, + if_exists: bool, + }, + ShowIndexes { + collection: Option, + }, + Reindex { + collection: String, + }, + + // ── Trigger ────────────────────────────────────────────────── + CreateTrigger { + or_replace: bool, + deferred: bool, + sync: bool, + raw_sql: String, + }, + DropTrigger { + name: String, + collection: String, + if_exists: bool, + }, + AlterTrigger { + raw_sql: String, + }, + ShowTriggers { + collection: Option, + }, + + // ── Schedule ───────────────────────────────────────────────── + CreateSchedule { + raw_sql: String, + }, + DropSchedule { + name: String, + if_exists: bool, + }, + AlterSchedule { + raw_sql: String, + }, + ShowSchedules, + ShowScheduleHistory { + name: String, + }, + + // ── Sequence ───────────────────────────────────────────────── + CreateSequence { + name: String, + if_not_exists: bool, + raw_sql: String, + }, + DropSequence { + name: String, + if_exists: bool, + }, + AlterSequence { + raw_sql: String, + }, + DescribeSequence { + name: String, + }, + ShowSequences, + + // ── Alert ──────────────────────────────────────────────────── + CreateAlert { + raw_sql: String, + }, + DropAlert { + name: String, + if_exists: bool, + }, + AlterAlert { + raw_sql: String, + }, + ShowAlerts, + ShowAlertStatus { + name: String, + }, + + // ── Retention policy ───────────────────────────────────────── + CreateRetentionPolicy { + raw_sql: String, + }, + DropRetentionPolicy { + name: 
String, + if_exists: bool, + }, + AlterRetentionPolicy { + raw_sql: String, + }, + ShowRetentionPolicies, + + // ── Change stream ──────────────────────────────────────────── + CreateChangeStream { + raw_sql: String, + }, + DropChangeStream { + name: String, + if_exists: bool, + }, + AlterChangeStream { + raw_sql: String, + }, + ShowChangeStreams, + + // ── Consumer group ─────────────────────────────────────────── + CreateConsumerGroup { + raw_sql: String, + }, + DropConsumerGroup { + name: String, + stream: String, + if_exists: bool, + }, + ShowConsumerGroups { + stream: Option, + }, + + // ── RLS policy ─────────────────────────────────────────────── + CreateRlsPolicy { + raw_sql: String, + }, + DropRlsPolicy { + name: String, + collection: String, + if_exists: bool, + }, + ShowRlsPolicies { + collection: Option, + }, + + // ── Materialized view ──────────────────────────────────────── + CreateMaterializedView { + raw_sql: String, + }, + DropMaterializedView { + name: String, + if_exists: bool, + }, + ShowMaterializedViews, + + // ── Continuous aggregate ───────────────────────────────────── + CreateContinuousAggregate { + raw_sql: String, + }, + DropContinuousAggregate { + name: String, + if_exists: bool, + }, + ShowContinuousAggregates, + + // ── Backup / restore ───────────────────────────────────────── + BackupTenant { + raw_sql: String, + }, + RestoreTenant { + dry_run: bool, + raw_sql: String, + }, + + // ── Cluster admin ──────────────────────────────────────────── + ShowNodes, + ShowNode { + node_id: String, + }, + RemoveNode { + node_id: String, + }, + ShowCluster, + ShowMigrations, + ShowRanges, + ShowRouting, + ShowSchemaVersion, + ShowPeerHealth, + Rebalance, + ShowRaftGroups, + ShowRaftGroup { + group_id: String, + }, + AlterRaftGroup { + raw_sql: String, + }, + + // ── Maintenance ────────────────────────────────────────────── + Analyze { + collection: Option, + }, + Compact { + collection: String, + }, + ShowStorage { + collection: Option, + }, + 
ShowCompactionStatus, + + // ── User / auth / grant ────────────────────────────────────── + CreateUser { + raw_sql: String, + }, + DropUser { + username: String, + }, + AlterUser { + raw_sql: String, + }, + ShowUsers, + GrantRole { + raw_sql: String, + }, + RevokeRole { + raw_sql: String, + }, + GrantPermission { + raw_sql: String, + }, + RevokePermission { + raw_sql: String, + }, + ShowPermissions { + collection: Option, + }, + ShowGrants { + username: Option, + }, + + // ── Miscellaneous ──────────────────────────────────────────── + ShowTenants, + ShowAuditLog, + ShowConstraints { + collection: String, + }, + ShowTypeGuards { + collection: String, + }, + + /// Catch-all for DDL-like commands not yet promoted to their + /// own variant. Preserves the raw SQL for the legacy dispatch + /// path so new variants can be added incrementally without + /// breaking existing handlers. + Other { + raw_sql: String, + }, +} diff --git a/nodedb-sql/src/lib.rs b/nodedb-sql/src/lib.rs index 269d8ea4..4b614fe2 100644 --- a/nodedb-sql/src/lib.rs +++ b/nodedb-sql/src/lib.rs @@ -9,6 +9,7 @@ //! ``` pub mod catalog; +pub mod ddl_ast; pub mod engine_rules; pub mod error; pub mod functions; diff --git a/nodedb-types/src/config/tuning/network.rs b/nodedb-types/src/config/tuning/network.rs index 888fa982..4fd9d956 100644 --- a/nodedb-types/src/config/tuning/network.rs +++ b/nodedb-types/src/config/tuning/network.rs @@ -223,10 +223,10 @@ fn default_raft_tick_interval_ms() -> u64 { 10 } fn default_election_timeout_min_secs() -> u64 { - 60 + 2 } fn default_election_timeout_max_secs() -> u64 { - 120 + 5 } fn default_rpc_timeout_secs() -> u64 { 5 diff --git a/nodedb/src/control/cluster/handle.rs b/nodedb/src/control/cluster/handle.rs index 3cede98e..bc8845e1 100644 --- a/nodedb/src/control/cluster/handle.rs +++ b/nodedb/src/control/cluster/handle.rs @@ -33,4 +33,8 @@ pub struct ClusterHandle { /// stays `Clone` while still guaranteeing single-transfer /// semantics at runtime. 
pub multi_raft: Mutex>, + /// Cluster catalog (redb-backed topology + routing persistence). + /// Shared with the `HealthMonitor` for persisting topology changes + /// on failure detection and recovery. + pub catalog: Arc, } diff --git a/nodedb/src/control/cluster/init.rs b/nodedb/src/control/cluster/init.rs index ef06fe4b..3315b7d8 100644 --- a/nodedb/src/control/cluster/init.rs +++ b/nodedb/src/control/cluster/init.rs @@ -38,7 +38,7 @@ pub async fn init_cluster( "cluster QUIC transport bound" ); - init_cluster_with_transport(config, transport, data_dir).await + init_cluster_with_transport(config, transport, data_dir, transport_tuning).await } /// Initialize the cluster using a pre-bound QUIC transport. @@ -56,13 +56,15 @@ pub async fn init_cluster_with_transport( config: &ClusterSettings, transport: Arc, data_dir: &std::path::Path, + transport_tuning: &ClusterTransportTuning, ) -> crate::Result { // 2. Open cluster catalog. let catalog_path = data_dir.join("cluster.redb"); - let catalog = + let catalog = Arc::new( nodedb_cluster::ClusterCatalog::open(&catalog_path).map_err(|e| crate::Error::Config { detail: format!("cluster catalog: {e}"), - })?; + })?, + ); // 3. Bootstrap, join, or restart. 
let cluster_config = nodedb_cluster::ClusterConfig { @@ -75,6 +77,12 @@ pub async fn init_cluster_with_transport( force_bootstrap: config.force_bootstrap, join_retry: join_retry_policy_from_env(), swim_udp_addr: None, + election_timeout_min: std::time::Duration::from_secs( + transport_tuning.election_timeout_min_secs, + ), + election_timeout_max: std::time::Duration::from_secs( + transport_tuning.election_timeout_max_secs, + ), }; let lifecycle = nodedb_cluster::ClusterLifecycleTracker::new(); @@ -105,6 +113,7 @@ pub async fn init_cluster_with_transport( applied_index_watcher, node_id: config.node_id, multi_raft: Mutex::new(Some(state.multi_raft)), + catalog, }) } diff --git a/nodedb/src/control/cluster/start_raft.rs b/nodedb/src/control/cluster/start_raft.rs index 1c14c57c..bc968bc4 100644 --- a/nodedb/src/control/cluster/start_raft.rs +++ b/nodedb/src/control/cluster/start_raft.rs @@ -112,7 +112,7 @@ pub fn start_raft( // Start the RPC server (accepts inbound QUIC connections). let transport_serve = handle.transport.clone(); let rl_handler = raft_loop.clone(); - let sr_serve = shutdown_rx; + let sr_serve = shutdown_rx.clone(); tokio::spawn(async move { if let Err(e) = transport_serve.serve(rl_handler, sr_serve).await { tracing::error!(error = %e, "raft RPC server failed"); @@ -138,6 +138,27 @@ pub fn start_raft( ); } + // Start the health monitor (periodic pings, failure detection, + // topology re-broadcast). Without this, topology updates are + // only propagated via the fire-and-forget broadcast during the + // join flow — if that single broadcast is lost (peer QUIC server + // not yet accepting), the peer never converges. 
+ let health_config = nodedb_cluster::HealthConfig { + ping_interval: Duration::from_secs(transport_tuning.health_ping_interval_secs), + failure_threshold: transport_tuning.health_failure_threshold, + }; + let health_monitor = Arc::new(nodedb_cluster::HealthMonitor::new( + handle.node_id, + handle.transport.clone(), + handle.topology.clone(), + handle.catalog.clone(), + health_config, + )); + let sr_health = shutdown_rx; + tokio::spawn(async move { + health_monitor.run(sr_health).await; + }); + info!(node_id = handle.node_id, "raft loop and RPC server started"); Ok(ready_rx) diff --git a/nodedb/src/control/metadata_proposer.rs b/nodedb/src/control/metadata_proposer.rs index 8a8314d5..6077a92f 100644 --- a/nodedb/src/control/metadata_proposer.rs +++ b/nodedb/src/control/metadata_proposer.rs @@ -176,6 +176,15 @@ pub fn propose_catalog_entry_with_timeout( } let payload = catalog_entry::encode(entry)?; + + // DDL transaction buffer: if a transactional DDL session is + // active on this thread (BEGIN ... COMMIT), buffer the payload + // instead of proposing immediately. The buffered entries will + // be proposed as a single MetadataEntry::Batch at COMMIT time. + if crate::control::server::pgwire::session::ddl_buffer::try_buffer(payload.clone()) { + return Ok(0); + } + let metadata_entry = MetadataEntry::CatalogDdl { payload }; let raw = encode_entry(&metadata_entry).map_err(|e| Error::Config { detail: format!("metadata entry encode: {e}"), diff --git a/nodedb/src/control/server/http/routes/health.rs b/nodedb/src/control/server/http/routes/health.rs index a97e02af..6a717d57 100644 --- a/nodedb/src/control/server/http/routes/health.rs +++ b/nodedb/src/control/server/http/routes/health.rs @@ -1,4 +1,12 @@ //! Health check endpoints. +//! +//! | Endpoint | Method | Purpose | k8s probe | +//! |-------------------|--------|-----------------------------|---------------| +//! | `/health/live` | GET | Process alive (always 200) | liveness | +//! 
| `/healthz` | GET | Ready to serve traffic | readiness | +//! | `/health` | GET | Liveness with cluster info | — | +//! | `/health/ready` | GET | WAL recovered | readiness alt | +//! | `/health/drain` | POST | Trigger graceful drain | preStop hook | use axum::extract::State; use axum::http::StatusCode; @@ -7,13 +15,36 @@ use serde_json::json; use super::super::auth::AppState; -/// GET /healthz — k8s-style readiness/liveness probe. +/// GET /health/live — unconditional liveness probe. /// -/// Returns `200 OK` when the node has reached `GatewayEnable` and is -/// serving traffic. Returns `503 Service Unavailable` during startup or if -/// startup has failed. This endpoint bypasses the startup gate middleware -/// and is always reachable, making it suitable as a k8s readiness probe. +/// Always returns 200. If this endpoint fails to respond, the +/// process is dead and should be restarted. No internal state is +/// checked — the mere ability to respond proves the event loop and +/// HTTP listener are alive. +pub async fn live() -> impl IntoResponse { + (StatusCode::OK, axum::Json(json!({ "status": "alive" }))) +} + +/// GET /healthz — k8s-style readiness probe. +/// +/// Returns `200 OK` when the node has reached `GatewayEnable`, is +/// serving traffic, and is NOT draining/decommissioned. Returns +/// `503 Service Unavailable` during startup, after startup failure, +/// or when the node is being decommissioned. pub async fn healthz(State(state): State) -> impl IntoResponse { + // Check decommission state via the cluster observer (if present). 
+ if let Some(obs) = state.shared.cluster_observer.get() { + let snap = obs.snapshot(); + let label = snap.lifecycle_label(); + if label == "draining" || label == "decommissioned" || label == "failed" { + let body = json!({ + "status": "draining", + "lifecycle": label, + "node_id": state.shared.node_id, + }); + return (StatusCode::SERVICE_UNAVAILABLE, axum::Json(body)); + } + } let health = crate::control::startup::health::observe(&state.shared.startup); let (status, body) = crate::control::startup::health::to_http_response(&health); (status, axum::Json(body)) @@ -60,3 +91,31 @@ pub async fn ready(State(state): State) -> impl IntoResponse { }); (status, axum::Json(body)) } + +/// POST /health/drain — trigger graceful connection drain. +/// +/// Signals the canonical `ShutdownWatch` so every background loop +/// begins its cooperative exit. Subsequent `/healthz` calls return +/// 503, which causes the k8s readiness probe to fail and the +/// service mesh to stop routing new connections to this node. +/// +/// Designed for use as a k8s `preStop` hook: +/// +/// ```yaml +/// lifecycle: +/// preStop: +/// httpGet: +/// path: /health/drain +/// port: http +/// ``` +pub async fn drain(State(state): State) -> impl IntoResponse { + tracing::info!(node_id = state.shared.node_id, "drain requested via HTTP"); + state.shared.shutdown.signal(); + ( + StatusCode::OK, + axum::Json(json!({ + "status": "draining", + "node_id": state.shared.node_id, + })), + ) +} diff --git a/nodedb/src/control/server/http/server.rs b/nodedb/src/control/server/http/server.rs index 1a7e8d28..33d7a11c 100644 --- a/nodedb/src/control/server/http/server.rs +++ b/nodedb/src/control/server/http/server.rs @@ -3,7 +3,9 @@ //! Endpoints: //! - GET /healthz — k8s readiness/liveness (always reachable; 503 until GatewayEnable) //! - GET /health — liveness +//! - GET /health/live — unconditional liveness probe //! - GET /health/ready — readiness (WAL recovered) +//! 
- POST /health/drain — trigger graceful drain //! - GET /metrics — Prometheus-format metrics (requires monitor role) //! - POST /query — execute DDL via HTTP (requires auth) @@ -29,7 +31,9 @@ fn build_router(state: AppState) -> Router { // /healthz is always reachable — returns 503 during startup, 200 after. .route("/healthz", get(routes::health::healthz)) .route("/health", get(routes::health::health)) + .route("/health/live", get(routes::health::live)) .route("/health/ready", get(routes::health::ready)) + .route("/health/drain", post(routes::health::drain)) .route("/metrics", get(routes::metrics::metrics)) .route("/query", post(routes::query::query)) .route("/status", get(routes::status::status)) @@ -98,8 +102,8 @@ fn build_router(state: AppState) -> Router { /// Axum middleware that gates non-health routes on [`StartupPhase::GatewayEnable`]. /// -/// `/healthz`, `/health`, and `/health/ready` are always let through so k8s -/// readiness probes can observe startup progress. All other routes receive a +/// All `/health*` paths (liveness, readiness, drain) are always let through so +/// k8s probes can observe startup progress. All other routes receive a /// `503 Service Unavailable` until the node reaches `GatewayEnable`. 
async fn startup_gate_middleware( State(app_state): State, diff --git a/nodedb/src/control/server/native/dispatch/sql.rs b/nodedb/src/control/server/native/dispatch/sql.rs index 570b3c21..7831d3cb 100644 --- a/nodedb/src/control/server/native/dispatch/sql.rs +++ b/nodedb/src/control/server/native/dispatch/sql.rs @@ -282,6 +282,9 @@ fn is_session_show(upper: &str) -> bool { && !upper.starts_with("SHOW PEER") && !upper.starts_with("SHOW NODES") && !upper.starts_with("SHOW NODE ") + && !upper.starts_with("SHOW RANGES") + && !upper.starts_with("SHOW ROUTING") + && !upper.starts_with("SHOW SCHEMA VERSION") && !upper.starts_with("SHOW COLLECTIONS") && !upper.starts_with("SHOW AUDIT") && !upper.starts_with("SHOW PERMISSIONS") diff --git a/nodedb/src/control/server/pgwire/ddl/auth_user_ddl.rs b/nodedb/src/control/server/pgwire/ddl/auth_user_ddl.rs index 0bcc576a..7228ada9 100644 --- a/nodedb/src/control/server/pgwire/ddl/auth_user_ddl.rs +++ b/nodedb/src/control/server/pgwire/ddl/auth_user_ddl.rs @@ -32,7 +32,7 @@ pub fn handle_auth_user( )); } - let upper0 = parts[0].to_uppercase(); + let upper0 = parts.first().map(|s| s.to_uppercase()).unwrap_or_default(); match upper0.as_str() { "DEACTIVATE" => deactivate_auth_user(state, identity, parts), "ALTER" => alter_auth_user_status(state, identity, parts), diff --git a/nodedb/src/control/server/pgwire/ddl/cluster/mod.rs b/nodedb/src/control/server/pgwire/ddl/cluster/mod.rs index 58509162..fbd34c9f 100644 --- a/nodedb/src/control/server/pgwire/ddl/cluster/mod.rs +++ b/nodedb/src/control/server/pgwire/ddl/cluster/mod.rs @@ -1,11 +1,17 @@ pub mod health; pub mod migration; pub mod raft; +pub mod ranges; pub mod rebalance_cmd; +pub mod routing_hint; +pub mod schema_version; pub mod topology; pub use health::show_peer_health; pub use migration::show_migrations; pub use raft::{alter_raft_group, show_raft_group, show_raft_groups}; +pub use ranges::show_ranges; pub use rebalance_cmd::rebalance; +pub use routing_hint::show_routing; +pub 
use schema_version::show_schema_version; pub use topology::{remove_node, show_cluster, show_node, show_nodes}; diff --git a/nodedb/src/control/server/pgwire/ddl/cluster/ranges.rs b/nodedb/src/control/server/pgwire/ddl/cluster/ranges.rs new file mode 100644 index 00000000..8c82e2a1 --- /dev/null +++ b/nodedb/src/control/server/pgwire/ddl/cluster/ranges.rs @@ -0,0 +1,75 @@ +//! `SHOW RANGES` — vshard distribution across the cluster. + +use std::sync::Arc; + +use futures::stream; +use pgwire::api::results::{DataRowEncoder, QueryResponse, Response}; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +use super::super::super::types::{int8_field, sqlstate_error, text_field}; + +/// SHOW RANGES — list vshards with leaseholder and replica info. +/// +/// Columns: vshard_id, group_id, leaseholder, replicas. +/// Superuser only. +pub fn show_ranges( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + if !identity.is_superuser { + return Err(sqlstate_error( + "42501", + "permission denied: only superuser can view ranges", + )); + } + + let routing = match &state.cluster_routing { + Some(r) => r, + None => { + return Err(sqlstate_error( + "55000", + "cluster mode not enabled (single-node instance)", + )); + } + }; + + let schema = Arc::new(vec![ + int8_field("vshard_id"), + int8_field("group_id"), + int8_field("leaseholder"), + text_field("replicas"), + ]); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + let rt = routing.read().unwrap_or_else(|p| p.into_inner()); + for vshard_id in 0..nodedb_cluster::routing::VSHARD_COUNT { + let group_id = rt.group_for_vshard(vshard_id).unwrap_or(0); + let (leader, replicas_str) = match rt.group_info(group_id) { + Some(info) => { + let replicas: String = info + .members + .iter() + .map(|m| m.to_string()) + .collect::>() + .join(", "); + (info.leader as i64, replicas) + } + None 
=> (0i64, String::new()), + }; + encoder.encode_field(&(vshard_id as i64))?; + encoder.encode_field(&(group_id as i64))?; + encoder.encode_field(&leader)?; + encoder.encode_field(&replicas_str)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} diff --git a/nodedb/src/control/server/pgwire/ddl/cluster/routing_hint.rs b/nodedb/src/control/server/pgwire/ddl/cluster/routing_hint.rs new file mode 100644 index 00000000..4bdb57a5 --- /dev/null +++ b/nodedb/src/control/server/pgwire/ddl/cluster/routing_hint.rs @@ -0,0 +1,72 @@ +//! `SHOW ROUTING` — expose the vshard → leaseholder → node address +//! mapping so smart clients can cache it and route writes directly +//! to the leaseholder, skipping the gateway hop. +//! +//! Result columns: `vshard_id`, `group_id`, `leaseholder_node_id`, +//! `leaseholder_addr`. + +use std::sync::Arc; + +use futures::stream; +use pgwire::api::results::{DataRowEncoder, QueryResponse, Response}; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +use super::super::super::types::{int8_field, sqlstate_error, text_field}; + +/// SHOW ROUTING — full vshard → leaseholder → address table. +/// +/// Any authenticated user may call this (smart-client libs need it). 
+pub fn show_routing( + state: &SharedState, + _identity: &AuthenticatedIdentity, +) -> PgWireResult> { + let routing = match &state.cluster_routing { + Some(r) => r, + None => { + return Err(sqlstate_error( + "55000", + "cluster mode not enabled (single-node instance)", + )); + } + }; + + let schema = Arc::new(vec![ + int8_field("vshard_id"), + int8_field("group_id"), + int8_field("leaseholder_node_id"), + text_field("leaseholder_addr"), + ]); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + let rt = routing.read().unwrap_or_else(|p| p.into_inner()); + let topo_guard = state + .cluster_topology + .as_ref() + .map(|t| t.read().unwrap_or_else(|p| p.into_inner())); + + for vshard_id in 0..nodedb_cluster::routing::VSHARD_COUNT { + let group_id = rt.group_for_vshard(vshard_id).unwrap_or(0); + let leader = rt.group_info(group_id).map(|info| info.leader).unwrap_or(0); + let addr = topo_guard + .as_ref() + .and_then(|topo| topo.get_node(leader)) + .map(|n| n.addr.clone()) + .unwrap_or_default(); + + encoder.encode_field(&(vshard_id as i64))?; + encoder.encode_field(&(group_id as i64))?; + encoder.encode_field(&(leader as i64))?; + encoder.encode_field(&addr)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} diff --git a/nodedb/src/control/server/pgwire/ddl/cluster/schema_version.rs b/nodedb/src/control/server/pgwire/ddl/cluster/schema_version.rs new file mode 100644 index 00000000..9c0e9d94 --- /dev/null +++ b/nodedb/src/control/server/pgwire/ddl/cluster/schema_version.rs @@ -0,0 +1,57 @@ +//! `SHOW SCHEMA VERSION` — current descriptor version visible on +//! this node. 
+ +use std::sync::Arc; + +use futures::stream; +use pgwire::api::results::{DataRowEncoder, QueryResponse, Response}; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +use super::super::super::types::{sqlstate_error, text_field}; + +/// SHOW SCHEMA VERSION — report the current descriptor version +/// counter and per-collection metadata if available. +pub fn show_schema_version( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + if !identity.is_superuser { + return Err(sqlstate_error( + "42501", + "permission denied: only superuser can view schema version", + )); + } + + let schema = Arc::new(vec![text_field("property"), text_field("value")]); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + let version = state.schema_version.current(); + encoder.encode_field(&"schema_version")?; + encoder.encode_field(&version.to_string())?; + rows.push(Ok(encoder.take_row())); + + let applied_index = { + let cache = state + .metadata_cache + .read() + .unwrap_or_else(|p| p.into_inner()); + cache.applied_index + }; + encoder.encode_field(&"metadata_applied_index")?; + encoder.encode_field(&applied_index.to_string())?; + rows.push(Ok(encoder.take_row())); + + encoder.encode_field(&"node_id")?; + encoder.encode_field(&state.node_id.to_string())?; + rows.push(Ok(encoder.take_row())); + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} diff --git a/nodedb/src/control/server/pgwire/ddl/grant/role.rs b/nodedb/src/control/server/pgwire/ddl/grant/role.rs index cc902b36..2eef069a 100644 --- a/nodedb/src/control/server/pgwire/ddl/grant/role.rs +++ b/nodedb/src/control/server/pgwire/ddl/grant/role.rs @@ -57,6 +57,13 @@ pub fn grant_role( ) -> PgWireResult> { require_admin(identity, "grant roles")?; + if parts.len() < 5 { + return Err(sqlstate_error( + "42601", + "syntax: GRANT ROLE TO ", + 
)); + } + let role = parse_role(parts[2]); @@ -94,6 +101,13 @@ pub fn revoke_role( ) -> PgWireResult<Vec<Response>> { require_admin(identity, "revoke roles")?; + if parts.len() < 5 { + return Err(sqlstate_error( + "42601", + "syntax: REVOKE ROLE <role> FROM <username>", + )); + } + let role = parse_role(parts[2]); if !parts[3].eq_ignore_ascii_case("FROM") { diff --git a/nodedb/src/control/server/pgwire/ddl/router/admin.rs index c85cdc9a..49ab2031 100644 --- a/nodedb/src/control/server/pgwire/ddl/router/admin.rs +++ b/nodedb/src/control/server/pgwire/ddl/router/admin.rs @@ -441,6 +441,15 @@ pub(super) async fn dispatch( if upper.starts_with("REMOVE NODE ") { return Some(super::super::cluster::remove_node(state, identity, parts)); } + if upper.starts_with("SHOW RANGES") { + return Some(super::super::cluster::show_ranges(state, identity)); + } + if upper.starts_with("SHOW ROUTING") { + return Some(super::super::cluster::show_routing(state, identity)); + } + if upper.starts_with("SHOW SCHEMA VERSION") { + return Some(super::super::cluster::show_schema_version(state, identity)); + } // Introspection. if upper.starts_with("SHOW USERS") { diff --git a/nodedb/src/control/server/pgwire/ddl/router/ast.rs new file mode 100644 index 00000000..8793d904 --- /dev/null +++ b/nodedb/src/control/server/pgwire/ddl/router/ast.rs @@ -0,0 +1,231 @@ +//! AST-based DDL dispatch — typed fast path. +//! +//! Runs before the legacy string-prefix routers. Handles +//! `IF [NOT] EXISTS` at the dispatch level so individual handlers +//! don't need to check. Falls through to legacy dispatch for +//! `Other` variants and for statements where the typed path +//! delegates to the existing handler (via `raw_sql`).
+ +use pgwire::api::results::{Response, Tag}; +use pgwire::error::PgWireResult; + +use nodedb_sql::ddl_ast::NodedbStatement; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +/// Try to dispatch a parsed `NodedbStatement`. Returns `Some` if +/// fully handled, `None` if the statement should fall through to +/// the legacy dispatch. +pub(super) fn try_dispatch( + state: &SharedState, + identity: &AuthenticatedIdentity, + stmt: &NodedbStatement, +) -> Option>> { + match stmt { + // ── IF NOT EXISTS: swallow duplicate-creation errors ────── + NodedbStatement::CreateCollection { + name, + if_not_exists: true, + .. + } => { + if collection_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("CREATE COLLECTION"))])); + } + None // fall through to legacy CREATE handler + } + + NodedbStatement::CreateSequence { + name, + if_not_exists: true, + .. + } => { + if sequence_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("CREATE SEQUENCE"))])); + } + None + } + + // ── IF EXISTS: swallow not-found errors on DROP ────────── + NodedbStatement::DropCollection { + name, + if_exists: true, + } => { + if !collection_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP COLLECTION"))])); + } + None + } + + NodedbStatement::DropIndex { + if_exists: true, .. + } => None, // legacy handler has its own check + + NodedbStatement::DropTrigger { + name, + if_exists: true, + .. 
+ } => { + if !trigger_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP TRIGGER"))])); + } + None + } + + NodedbStatement::DropSchedule { + name, + if_exists: true, + } => { + if !schedule_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP SCHEDULE"))])); + } + None + } + + NodedbStatement::DropSequence { + name, + if_exists: true, + } => { + if !sequence_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP SEQUENCE"))])); + } + None + } + + NodedbStatement::DropAlert { + name, + if_exists: true, + } => { + if !alert_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP ALERT"))])); + } + None + } + + NodedbStatement::DropRetentionPolicy { + name, + if_exists: true, + } => { + if !retention_policy_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new( + "DROP RETENTION POLICY", + ))])); + } + None + } + + NodedbStatement::DropChangeStream { + name, + if_exists: true, + } => { + if !change_stream_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new( + "DROP CHANGE STREAM", + ))])); + } + None + } + + NodedbStatement::DropMaterializedView { + name, + if_exists: true, + } => { + if !materialized_view_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new( + "DROP MATERIALIZED VIEW", + ))])); + } + None + } + + NodedbStatement::DropContinuousAggregate { + name, + if_exists: true, + } => { + if !continuous_aggregate_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new( + "DROP CONTINUOUS AGGREGATE", + ))])); + } + None + } + + NodedbStatement::DropRlsPolicy { + if_exists: true, .. + } => { + // RLS policy existence check would need collection context; + // fall through to legacy handler which already handles this. + None + } + + NodedbStatement::DropConsumerGroup { + if_exists: true, .. 
+ } => None, // legacy handler + + // All other variants fall through to legacy dispatch. + _ => None, + } +} + +fn collection_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let Some(catalog) = state.credentials.catalog() else { + return false; + }; + let tid = identity.tenant_id.as_u32(); + matches!(catalog.get_collection(tid, name), Ok(Some(_))) +} + +fn trigger_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let Some(catalog) = state.credentials.catalog() else { + return false; + }; + let tid = identity.tenant_id.as_u32(); + matches!(catalog.get_trigger(tid, name), Ok(Some(_))) +} + +fn schedule_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let tid = identity.tenant_id.as_u32(); + state.schedule_registry.get(tid, name).is_some() +} + +fn sequence_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let tid = identity.tenant_id.as_u32(); + state.sequence_registry.exists(tid, name) +} + +fn alert_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let tid = identity.tenant_id.as_u32(); + state.alert_registry.get(tid, name).is_some() +} + +fn retention_policy_exists( + state: &SharedState, + identity: &AuthenticatedIdentity, + name: &str, +) -> bool { + let tid = identity.tenant_id.as_u32(); + state.retention_policy_registry.get(tid, name).is_some() +} + +fn change_stream_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let tid = identity.tenant_id.as_u32(); + state.stream_registry.get(tid, name).is_some() +} + +fn materialized_view_exists( + state: &SharedState, + identity: &AuthenticatedIdentity, + name: &str, +) -> bool { + let tid = identity.tenant_id.as_u32(); + state.mv_registry.get_def(tid, name).is_some() +} + +fn continuous_aggregate_exists( + state: &SharedState, + identity: &AuthenticatedIdentity, + name: &str, +) -> bool { + let tid = 
identity.tenant_id.as_u32(); + state.mv_registry.get_def(tid, name).is_some() +} diff --git a/nodedb/src/control/server/pgwire/ddl/router/mod.rs b/nodedb/src/control/server/pgwire/ddl/router/mod.rs index 73de64f0..905bb53f 100644 --- a/nodedb/src/control/server/pgwire/ddl/router/mod.rs +++ b/nodedb/src/control/server/pgwire/ddl/router/mod.rs @@ -1,4 +1,5 @@ mod admin; +mod ast; mod auth; mod collaborative; mod dsl; @@ -26,6 +27,18 @@ pub async fn dispatch( identity: &AuthenticatedIdentity, sql: &str, ) -> Option>> { + // AST-typed fast path: parse once, handle IF [NOT] EXISTS at the + // dispatch level, then fall through to legacy handlers for the + // actual execution. This is the incremental migration path — + // once every legacy handler has been ported to accept a typed + // NodedbStatement, the string-prefix routers below can be + // removed entirely. + if let Some(stmt) = nodedb_sql::ddl_ast::parse(sql) + && let Some(r) = ast::try_dispatch(state, identity, &stmt) + { + return Some(r); + } + let upper = sql.to_uppercase(); let parts: Vec<&str> = sql.split_whitespace().collect(); diff --git a/nodedb/src/control/server/pgwire/handler/retry.rs b/nodedb/src/control/server/pgwire/handler/retry.rs index 3e793ad9..bf536bf2 100644 --- a/nodedb/src/control/server/pgwire/handler/retry.rs +++ b/nodedb/src/control/server/pgwire/handler/retry.rs @@ -18,8 +18,8 @@ //! //! ## Retry budget //! -//! Three attempts total with 50ms, 100ms, 200ms backoff between -//! them — roughly 350ms of tolerance for a drain to complete. +//! Five attempts total with 50/100/200/400 ms backoff between +//! them — roughly 750ms of tolerance for a drain to complete. //! The `DEFAULT_DRAIN_TIMEOUT` from `metadata_proposer` is 35s, //! so in practice either drain completes within our retry budget //! (the proposer is actively draining and is probably close to @@ -31,13 +31,17 @@ use std::time::Duration; use crate::error::Error; /// Maximum number of attempts (including the initial call). 
-const MAX_ATTEMPTS: usize = 3; +const MAX_ATTEMPTS: usize = 5; /// Backoff durations BETWEEN attempts. `BACKOFFS[i]` is the sleep /// duration before attempt `i + 1`. Length must be /// `MAX_ATTEMPTS - 1`. -const BACKOFFS: [Duration; MAX_ATTEMPTS - 1] = - [Duration::from_millis(50), Duration::from_millis(100)]; +const BACKOFFS: [Duration; MAX_ATTEMPTS - 1] = [ + Duration::from_millis(50), + Duration::from_millis(100), + Duration::from_millis(200), + Duration::from_millis(400), +]; /// Run `op` up to `MAX_ATTEMPTS` times. Retries only on /// `Error::RetryableSchemaChanged`. Any other error (including diff --git a/nodedb/src/control/server/pgwire/handler/session_cmds.rs b/nodedb/src/control/server/pgwire/handler/session_cmds.rs index e3848f2e..ac7baa64 100644 --- a/nodedb/src/control/server/pgwire/handler/session_cmds.rs +++ b/nodedb/src/control/server/pgwire/handler/session_cmds.rs @@ -67,6 +67,19 @@ impl NodeDbPgHandler { } } + if key == super::super::session::read_consistency::PARAM_KEY + && super::super::session::read_consistency::parse_value(&value).is_none() + { + return Err(PgWireError::UserError(Box::new(ErrorInfo::new( + "ERROR".to_owned(), + "22023".to_owned(), + format!( + "invalid value for {}: '{value}'. 
Valid: strong, bounded_staleness:<secs>, eventual", + super::super::session::read_consistency::PARAM_KEY + ), + )))); + } + if key == "nodedb.tenant_id" && value.parse::<u32>().is_err() { + return Err(PgWireError::UserError(Box::new(ErrorInfo::new( + "ERROR".to_owned(), diff --git a/nodedb/src/control/server/pgwire/handler/sql_exec.rs index ef469e04..69936c8e 100644 --- a/nodedb/src/control/server/pgwire/handler/sql_exec.rs +++ b/nodedb/src/control/server/pgwire/handler/sql_exec.rs @@ -217,6 +217,9 @@ impl NodeDbPgHandler { && !upper.starts_with("SHOW PEER") && !upper.starts_with("SHOW NODES") && !upper.starts_with("SHOW NODE ") + && !upper.starts_with("SHOW RANGES") + && !upper.starts_with("SHOW ROUTING") + && !upper.starts_with("SHOW SCHEMA VERSION") && !upper.starts_with("SHOW COLLECTIONS") && !upper.starts_with("SHOW AUDIT") && !upper.starts_with("SHOW PERMISSIONS") @@ -283,6 +286,13 @@ impl NodeDbPgHandler { ); } + // pg_catalog virtual tables — intercept before the normal planner.
+ if let Some(result) = + super::super::pg_catalog::try_pg_catalog(&self.state, identity, &upper) + { + return result; + } + if let Some(result) = super::super::ddl::dispatch(&self.state, identity, sql_trimmed).await { return result; diff --git a/nodedb/src/control/server/pgwire/handler/transaction_cmds.rs index 63ea8026..0da593bb 100644 --- a/nodedb/src/control/server/pgwire/handler/transaction_cmds.rs +++ b/nodedb/src/control/server/pgwire/handler/transaction_cmds.rs @@ -18,6 +18,7 @@ impl NodeDbPgHandler { let next = self.state.wal.next_lsn(); crate::types::Lsn::new(next.as_u64().saturating_sub(1)) }; + crate::control::server::pgwire::session::ddl_buffer::activate(); self.sessions.begin(addr, snapshot_lsn).map_err(|msg| { PgWireError::UserError(Box::new(ErrorInfo::new( "ERROR".to_owned(), @@ -171,6 +172,35 @@ impl NodeDbPgHandler { } } + // Flush any buffered DDL entries as a single atomic batch. + if let Some(payloads) = crate::control::server::pgwire::session::ddl_buffer::take() + && !payloads.is_empty() + { + use nodedb_cluster::{MetadataEntry, encode_entry}; + let sub_entries: Vec<MetadataEntry> = payloads + .into_iter() + .map(|p| MetadataEntry::CatalogDdl { payload: p }) + .collect(); + let batch = MetadataEntry::Batch { + entries: sub_entries, + }; + if let Some(handle) = self.state.metadata_raft.get() { + let raw = encode_entry(&batch).map_err(|e| { + PgWireError::UserError(Box::new(ErrorInfo::new( + "ERROR".to_owned(), + "XX000".to_owned(), + format!("DDL batch encode: {e}"), + ))) + })?; + handle.propose(raw).map_err(|e| { + PgWireError::UserError(Box::new(ErrorInfo::new( + "ERROR".to_owned(), + "XX000".to_owned(), + format!("DDL batch propose: {e}"), + ))) + })?; + } + } // Close non-WITH-HOLD cursors on transaction end.
self.sessions.close_non_hold_cursors(addr); Ok(vec![Response::Execution(Tag::new("COMMIT"))]) @@ -182,6 +212,7 @@ impl NodeDbPgHandler { identity: &AuthenticatedIdentity, addr: &std::net::SocketAddr, ) -> PgWireResult> { + crate::control::server::pgwire::session::ddl_buffer::discard(); let reservations = self.sessions.rollback(addr).unwrap_or_default(); for handle in &reservations { let key = &handle.sequence_key; diff --git a/nodedb/src/control/server/pgwire/mod.rs b/nodedb/src/control/server/pgwire/mod.rs index 21cb8ac2..c2c90ed1 100644 --- a/nodedb/src/control/server/pgwire/mod.rs +++ b/nodedb/src/control/server/pgwire/mod.rs @@ -2,5 +2,6 @@ pub mod ddl; pub mod factory; pub mod handler; pub mod listener; +pub mod pg_catalog; pub mod session; pub mod types; diff --git a/nodedb/src/control/server/pgwire/pg_catalog/dispatch.rs b/nodedb/src/control/server/pgwire/pg_catalog/dispatch.rs new file mode 100644 index 00000000..86d481a1 --- /dev/null +++ b/nodedb/src/control/server/pgwire/pg_catalog/dispatch.rs @@ -0,0 +1,95 @@ +//! pg_catalog query interception and dispatch. + +use pgwire::api::results::Response; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +use super::tables; + +/// Try to handle a SQL query as a pg_catalog virtual-table lookup. +/// +/// Returns `Some(Ok(response))` if the query targets a known +/// pg_catalog table, `None` if the query should fall through to the +/// normal planner. The `upper` argument is the uppercased SQL. 
+pub fn try_pg_catalog(
+    state: &SharedState,
+    identity: &AuthenticatedIdentity,
+    upper: &str,
+) -> Option<PgWireResult<Vec<Response>>> {
+    let table = extract_pg_catalog_table(upper)?;
+    let result = match table {
+        "pg_database" => tables::pg_database(),
+        "pg_namespace" => tables::pg_namespace(),
+        "pg_type" => tables::pg_type(),
+        "pg_class" => tables::pg_class(state, identity),
+        "pg_attribute" => tables::pg_attribute(state, identity),
+        "pg_index" => tables::pg_index(),
+        "pg_authid" => tables::pg_authid(state, identity),
+        _ => return None,
+    };
+    Some(result)
+}
+
+/// Extract the first `pg_catalog.<table>` or bare `pg_<table>
` +/// reference from a FROM clause. Returns the lowercase table name +/// if found. +fn extract_pg_catalog_table(upper: &str) -> Option<&'static str> { + let known = [ + "pg_database", + "pg_namespace", + "pg_type", + "pg_class", + "pg_attribute", + "pg_index", + "pg_authid", + ]; + for table in &known { + let qualified = format!("PG_CATALOG.{}", table.to_uppercase()); + let bare = table.to_uppercase(); + if upper.contains(&qualified) || upper.contains(&bare) { + return Some(table); + } + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn extracts_qualified_table() { + let sql = "SELECT * FROM pg_catalog.pg_class WHERE relkind = 'r'"; + assert_eq!( + extract_pg_catalog_table(&sql.to_uppercase()), + Some("pg_class") + ); + } + + #[test] + fn extracts_bare_table() { + let sql = "SELECT oid, typname FROM pg_type"; + assert_eq!( + extract_pg_catalog_table(&sql.to_uppercase()), + Some("pg_type") + ); + } + + #[test] + fn no_match_for_regular_query() { + let sql = "SELECT * FROM users WHERE id = 1"; + assert_eq!(extract_pg_catalog_table(&sql.to_uppercase()), None); + } + + #[test] + fn handles_join_with_pg_catalog() { + let sql = + "SELECT c.oid FROM pg_class c JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid"; + assert_eq!( + extract_pg_catalog_table(&sql.to_uppercase()), + Some("pg_namespace") + ); + } +} diff --git a/nodedb/src/control/server/pgwire/pg_catalog/mod.rs b/nodedb/src/control/server/pgwire/pg_catalog/mod.rs new file mode 100644 index 00000000..01e74e69 --- /dev/null +++ b/nodedb/src/control/server/pgwire/pg_catalog/mod.rs @@ -0,0 +1,19 @@ +//! Minimal `pg_catalog` virtual-table emulation. +//! +//! Generic Postgres clients (DBeaver, pgAdmin, SQLAlchemy, psql's +//! `\dt`) issue `SELECT` queries against `pg_catalog.*` tables to +//! discover schemas, types, and tables. Without a response they +//! either error out or show an empty catalog. This module intercepts +//! 
those queries and returns rows synthesised from NodeDB's own +//! `SystemCatalog` and credential store. +//! +//! The interception is pattern-based: we extract the first +//! `pg_catalog.
<table>` (or bare `pg_<table>
`) reference from the +//! `FROM` clause and delegate to the matching virtual table handler. +//! The result always returns ALL rows with a fixed column schema — +//! clients that send `WHERE` clauses filter client-side. + +pub mod dispatch; +pub mod tables; + +pub use dispatch::try_pg_catalog; diff --git a/nodedb/src/control/server/pgwire/pg_catalog/tables.rs b/nodedb/src/control/server/pgwire/pg_catalog/tables.rs new file mode 100644 index 00000000..6e46e63e --- /dev/null +++ b/nodedb/src/control/server/pgwire/pg_catalog/tables.rs @@ -0,0 +1,270 @@ +//! Virtual table row generators for each pg_catalog table. + +use std::sync::Arc; + +use futures::stream; +use pgwire::api::results::{DataRowEncoder, QueryResponse, Response}; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::server::pgwire::types::{bool_field, int4_field, int8_field, text_field}; +use crate::control::state::SharedState; + +/// `pg_database` — one row: the current database. +pub fn pg_database() -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("datname"), + text_field("datdba"), + text_field("encoding"), + ]); + let mut encoder = DataRowEncoder::new(schema.clone()); + encoder.encode_field(&1i64)?; + encoder.encode_field(&"nodedb")?; + encoder.encode_field(&"nodedb")?; + encoder.encode_field(&"UTF8")?; + let rows = vec![Ok(encoder.take_row())]; + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_namespace` — schemas: `public` + `pg_catalog`. 
+pub fn pg_namespace() -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("nspname"), + int8_field("nspowner"), + ]); + let mut encoder = DataRowEncoder::new(schema.clone()); + let mut rows = Vec::new(); + + encoder.encode_field(&11i64)?; + encoder.encode_field(&"pg_catalog")?; + encoder.encode_field(&10i64)?; + rows.push(Ok(encoder.take_row())); + + encoder.encode_field(&2200i64)?; + encoder.encode_field(&"public")?; + encoder.encode_field(&10i64)?; + rows.push(Ok(encoder.take_row())); + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_type` — common Postgres type OIDs that client drivers need. +pub fn pg_type() -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("typname"), + int8_field("typnamespace"), + int4_field("typlen"), + text_field("typtype"), + ]); + + let types: &[(i64, &str, i32, &str)] = &[ + (16, "bool", 1, "b"), + (20, "int8", 8, "b"), + (21, "int2", 2, "b"), + (23, "int4", 4, "b"), + (25, "text", -1, "b"), + (114, "json", -1, "b"), + (700, "float4", 4, "b"), + (701, "float8", 8, "b"), + (1043, "varchar", -1, "b"), + (1082, "date", 4, "b"), + (1114, "timestamp", 8, "b"), + (1184, "timestamptz", 8, "b"), + (2950, "uuid", 16, "b"), + (3802, "jsonb", -1, "b"), + ]; + + let mut rows = Vec::with_capacity(types.len()); + let mut encoder = DataRowEncoder::new(schema.clone()); + + for &(oid, name, len, typtype) in types { + encoder.encode_field(&oid)?; + encoder.encode_field(&name)?; + encoder.encode_field(&11i64)?; + encoder.encode_field(&len)?; + encoder.encode_field(&typtype)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_class` — one row per active collection (mapped as relation). 
+pub fn pg_class( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("relname"), + int8_field("relnamespace"), + text_field("relkind"), + int8_field("relowner"), + ]); + + let collections = load_collections(state, identity); + + let mut rows = Vec::with_capacity(collections.len()); + let mut encoder = DataRowEncoder::new(schema.clone()); + + for (i, coll) in collections.iter().enumerate() { + let oid = 16384i64 + i as i64; + encoder.encode_field(&oid)?; + encoder.encode_field(&coll.name.as_str())?; + encoder.encode_field(&2200i64)?; + encoder.encode_field(&"r")?; + encoder.encode_field(&10i64)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_attribute` — one row per field in strict-schema collections. +pub fn pg_attribute( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("attrelid"), + text_field("attname"), + int8_field("atttypid"), + int4_field("attnum"), + int4_field("attlen"), + bool_field("attnotnull"), + ]); + + let collections = load_collections(state, identity); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + for (i, coll) in collections.iter().enumerate() { + let rel_oid = 16384i64 + i as i64; + for (col_num, (field_name, field_type)) in coll.fields.iter().enumerate() { + let type_oid = field_type_to_oid(field_type); + encoder.encode_field(&rel_oid)?; + encoder.encode_field(&field_name.as_str())?; + encoder.encode_field(&type_oid)?; + encoder.encode_field(&((col_num + 1) as i32))?; + encoder.encode_field(&(-1i32))?; + encoder.encode_field(&false)?; + rows.push(Ok(encoder.take_row())); + } + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_index` — secondary indexes. 
+/// +/// Returns an empty result set with the correct schema. Structured +/// index metadata is not yet surfaced through `StoredCollection`; +/// once it is, this function will take `(state, identity)` and +/// populate rows from the catalog. +pub fn pg_index() -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("indexrelid"), + int8_field("indrelid"), + bool_field("indisunique"), + bool_field("indisprimary"), + ]); + + let rows: Vec> = Vec::new(); + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_authid` — users / roles. +pub fn pg_authid( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("rolname"), + bool_field("rolsuper"), + bool_field("rolcanlogin"), + ]); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + let users = state.credentials.list_users(); + for (i, user) in users.iter().enumerate() { + let oid = 10i64 + i as i64; + let is_super = identity.is_superuser && user == &identity.username; + encoder.encode_field(&oid)?; + encoder.encode_field(&user.as_str())?; + encoder.encode_field(&is_super)?; + encoder.encode_field(&true)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +fn load_collections( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> Vec { + let Some(catalog) = state.credentials.catalog() else { + return Vec::new(); + }; + if identity.is_superuser { + catalog + .load_all_collections() + .unwrap_or_default() + .into_iter() + .filter(|c| c.is_active) + .collect() + } else { + catalog + .load_collections_for_tenant(identity.tenant_id.as_u32()) + .unwrap_or_default() + } +} + +fn field_type_to_oid(field_type: &str) -> i64 { + match field_type.to_lowercase().as_str() { + "bool" | "boolean" => 16, + "int" | "integer" | "int4" => 23, + "bigint" | "int8" => 20, 
+        "smallint" | "int2" => 21,
+        "float" | "float4" | "real" => 700,
+        "double" | "float8" => 701,
+        "text" | "string" => 25,
+        "varchar" => 1043,
+        "json" => 114,
+        "jsonb" => 3802,
+        "uuid" => 2950,
+        "date" => 1082,
+        "timestamp" => 1114,
+        "timestamptz" => 1184,
+        _ => 25,
+    }
+}
diff --git a/nodedb/src/control/server/pgwire/session/ddl_buffer.rs b/nodedb/src/control/server/pgwire/session/ddl_buffer.rs
new file mode 100644
index 00000000..59d8ae26
--- /dev/null
+++ b/nodedb/src/control/server/pgwire/session/ddl_buffer.rs
@@ -0,0 +1,119 @@
+//! Per-session DDL transaction buffer.
+//!
+//! When a pgwire session is inside a `BEGIN` block and executes DDL
+//! statements (CREATE, DROP, ALTER), the `propose_catalog_entry`
+//! path checks this buffer. If the buffer is active (non-None), the
+//! entry is pushed into it instead of being proposed immediately.
+//!
+//! On `COMMIT`, the buffer is flushed as a single
+//! `MetadataEntry::Batch`, so either all DDL in the transaction
+//! commits atomically or none does.
+//!
+//! On `ROLLBACK`, the buffer is cleared without proposing.
+
+use std::cell::RefCell;
+
+/// Encoded DDL payloads buffered during a transaction. Each entry
+/// is a serialized `CatalogEntry` ready for
+/// `MetadataEntry::CatalogDdl { payload }`.
+pub type DdlBuffer = Vec<Vec<u8>>;
+
+thread_local! {
+    /// Thread-local flag: when `Some`, `propose_catalog_entry` pushes
+    /// into this buffer instead of proposing through raft. Set by
+    /// `activate` before DDL dispatch, cleared by `take`.
+    ///
+    /// Thread-local is safe here because pgwire DDL handlers run
+    /// synchronously via `block_in_place` — the buffer is set and
+    /// read on the same OS thread within a single handler call.
+    static ACTIVE_BUFFER: RefCell<Option<DdlBuffer>> = const { RefCell::new(None) };
+}
+
+/// Activate the DDL buffer for the current thread. Any subsequent
+/// call to `try_buffer` will push into this buffer instead of
+/// returning `None`.
+pub fn activate() {
+    ACTIVE_BUFFER.with(|b| {
+        let mut guard = b.borrow_mut();
+        if guard.is_none() {
+            *guard = Some(Vec::new());
+        }
+    });
+}
+
+/// Try to buffer a DDL payload. Returns `true` if the buffer is
+/// active and the payload was pushed. Returns `false` if no buffer
+/// is active (caller should propose normally).
+pub fn try_buffer(payload: Vec<u8>) -> bool {
+    ACTIVE_BUFFER.with(|b| {
+        let mut guard = b.borrow_mut();
+        if let Some(buf) = guard.as_mut() {
+            buf.push(payload);
+            true
+        } else {
+            false
+        }
+    })
+}
+
+/// Take the accumulated buffer contents and deactivate. Returns
+/// `None` if the buffer was never activated.
+pub fn take() -> Option<DdlBuffer> {
+    ACTIVE_BUFFER.with(|b| b.borrow_mut().take())
+}
+
+/// Deactivate and discard the buffer without returning its contents.
+pub fn discard() {
+    ACTIVE_BUFFER.with(|b| {
+        let _ = b.borrow_mut().take();
+    });
+}
+
+/// Returns `true` if a DDL buffer is currently active on this thread.
+pub fn is_active() -> bool {
+    ACTIVE_BUFFER.with(|b| b.borrow().is_some())
+}
+
+/// Number of DDL statements buffered in the current thread's
+/// active transaction. Returns 0 if no buffer is active.
+pub fn buffer_len() -> usize { + ACTIVE_BUFFER.with(|b| b.borrow().as_ref().map(|v| v.len()).unwrap_or(0)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn inactive_buffer_does_not_capture() { + discard(); // ensure clean state + assert!(!try_buffer(vec![1, 2, 3])); + assert!(!is_active()); + } + + #[test] + fn active_buffer_captures() { + activate(); + assert!(is_active()); + assert!(try_buffer(vec![1])); + assert!(try_buffer(vec![2])); + let buf = take().unwrap(); + assert_eq!(buf.len(), 2); + assert!(!is_active()); + } + + #[test] + fn discard_clears_buffer() { + activate(); + try_buffer(vec![1]); + discard(); + assert!(!is_active()); + assert!(take().is_none()); + } + + #[test] + fn take_on_inactive_returns_none() { + discard(); + assert!(take().is_none()); + } +} diff --git a/nodedb/src/control/server/pgwire/session/mod.rs b/nodedb/src/control/server/pgwire/session/mod.rs index 7ea726b5..8fad3e7a 100644 --- a/nodedb/src/control/server/pgwire/session/mod.rs +++ b/nodedb/src/control/server/pgwire/session/mod.rs @@ -1,7 +1,9 @@ mod cursor; pub mod cursor_spill; +pub mod ddl_buffer; mod live; mod params; +pub mod read_consistency; mod state; mod store; pub mod temp_tables; diff --git a/nodedb/src/control/server/pgwire/session/read_consistency.rs b/nodedb/src/control/server/pgwire/session/read_consistency.rs new file mode 100644 index 00000000..526034aa --- /dev/null +++ b/nodedb/src/control/server/pgwire/session/read_consistency.rs @@ -0,0 +1,155 @@ +//! Session-level `ReadConsistency` — wire `SET` / `SHOW` for the +//! `default_read_consistency` session parameter. +//! +//! Accepted values: +//! +//! - `'strong'` +//! - `'bounded_staleness:'` or `'bounded_staleness:s'` +//! - `'eventual'` +//! +//! The value is stored as a plain string in the session parameter +//! map. This module provides the typed parse + accessor. 
+
+use std::net::SocketAddr;
+use std::time::Duration;
+
+use crate::types::ReadConsistency;
+
+use super::store::SessionStore;
+
+/// Session parameter key.
+pub const PARAM_KEY: &str = "default_read_consistency";
+
+/// Parse a user-supplied string into a `ReadConsistency`. Returns
+/// `None` on unrecognised input so the caller can return a helpful
+/// error message.
+pub fn parse_value(value: &str) -> Option<ReadConsistency> {
+    let lower = value.trim().to_lowercase();
+    match lower.as_str() {
+        "strong" => Some(ReadConsistency::Strong),
+        "eventual" => Some(ReadConsistency::Eventual),
+        _ => {
+            let stripped = lower.strip_prefix("bounded_staleness:")?;
+            let secs_str = stripped.trim_end_matches('s').trim();
+            let secs: f64 = secs_str.parse().ok()?;
+            if secs <= 0.0 {
+                return None;
+            }
+            Some(ReadConsistency::BoundedStaleness(Duration::from_secs_f64(
+                secs,
+            )))
+        }
+    }
+}
+
+/// Format a `ReadConsistency` back into the canonical string form
+/// so `SHOW default_read_consistency` returns something parseable.
+pub fn format_value(rc: &ReadConsistency) -> String {
+    match rc {
+        ReadConsistency::Strong => "strong".to_string(),
+        ReadConsistency::Eventual => "eventual".to_string(),
+        ReadConsistency::BoundedStaleness(d) => {
+            format!("bounded_staleness:{}s", d.as_secs_f64())
+        }
+    }
+}
+
+impl SessionStore {
+    /// Resolve the effective `ReadConsistency` for a session. Falls
+    /// back to `Strong` if the parameter is unset or unparseable.
+ pub fn read_consistency(&self, addr: &SocketAddr) -> ReadConsistency { + self.get_parameter(addr, PARAM_KEY) + .and_then(|v| parse_value(&v)) + .unwrap_or_default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_strong() { + assert_eq!(parse_value("strong"), Some(ReadConsistency::Strong)); + assert_eq!(parse_value("STRONG"), Some(ReadConsistency::Strong)); + } + + #[test] + fn parse_eventual() { + assert_eq!(parse_value("eventual"), Some(ReadConsistency::Eventual)); + } + + #[test] + fn parse_bounded_staleness_seconds() { + let rc = parse_value("bounded_staleness:5").unwrap(); + assert_eq!( + rc, + ReadConsistency::BoundedStaleness(Duration::from_secs(5)) + ); + } + + #[test] + fn parse_bounded_staleness_with_s_suffix() { + let rc = parse_value("bounded_staleness:5s").unwrap(); + assert_eq!( + rc, + ReadConsistency::BoundedStaleness(Duration::from_secs(5)) + ); + } + + #[test] + fn parse_bounded_staleness_fractional() { + let rc = parse_value("bounded_staleness:0.5s").unwrap(); + assert_eq!( + rc, + ReadConsistency::BoundedStaleness(Duration::from_millis(500)) + ); + } + + #[test] + fn parse_rejects_zero_staleness() { + assert!(parse_value("bounded_staleness:0").is_none()); + } + + #[test] + fn parse_rejects_garbage() { + assert!(parse_value("foobar").is_none()); + assert!(parse_value("").is_none()); + } + + #[test] + fn format_roundtrip_strong() { + let s = format_value(&ReadConsistency::Strong); + assert_eq!(parse_value(&s), Some(ReadConsistency::Strong)); + } + + #[test] + fn format_roundtrip_bounded() { + let rc = ReadConsistency::BoundedStaleness(Duration::from_secs(10)); + let s = format_value(&rc); + assert_eq!(parse_value(&s), Some(rc)); + } + + #[test] + fn format_roundtrip_eventual() { + let s = format_value(&ReadConsistency::Eventual); + assert_eq!(parse_value(&s), Some(ReadConsistency::Eventual)); + } + + #[test] + fn session_store_defaults_to_strong() { + let store = SessionStore::new(); + let addr: SocketAddr = 
"127.0.0.1:5432".parse().unwrap(); + store.ensure_session(addr); + assert_eq!(store.read_consistency(&addr), ReadConsistency::Strong); + } + + #[test] + fn session_store_reads_set_value() { + let store = SessionStore::new(); + let addr: SocketAddr = "127.0.0.1:5432".parse().unwrap(); + store.ensure_session(addr); + store.set_parameter(&addr, PARAM_KEY.to_string(), "eventual".to_string()); + assert_eq!(store.read_consistency(&addr), ReadConsistency::Eventual); + } +} diff --git a/nodedb/tests/cluster_execute_request.rs b/nodedb/tests/cluster_execute_request.rs index bc02383c..453d2b15 100644 --- a/nodedb/tests/cluster_execute_request.rs +++ b/nodedb/tests/cluster_execute_request.rs @@ -162,8 +162,19 @@ async fn execute_request_cross_node_dispatch() { .await .expect("create collection"); - // Give the metadata applier on all nodes a moment to replicate. - tokio::time::sleep(Duration::from_millis(400)).await; + // Wait for the collection to be visible on every node. + common::cluster_harness::wait_for( + "cross_node_kv visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() >= 1) + }, + ) + .await; // Node 2 sends the request; node 1 (bootstrap leader) receives it. let sender_transport = cluster.nodes[1] diff --git a/nodedb/tests/common/cluster_harness/cluster.rs b/nodedb/tests/common/cluster_harness/cluster.rs index 5c8f7b25..ae769141 100644 --- a/nodedb/tests/common/cluster_harness/cluster.rs +++ b/nodedb/tests/common/cluster_harness/cluster.rs @@ -17,7 +17,18 @@ impl TestCluster { /// via node 1's pre-bound address. Waits until every node sees /// topology_size == 3 (10s deadline). 
pub async fn spawn_three() -> Result> { - Self::spawn_three_with_tuning(ClusterTransportTuning::default()).await + Self::spawn_three_with_tuning(ClusterTransportTuning { + // Fast health pings so the HealthMonitor re-broadcasts + // topology within ~1s if the initial join broadcast was missed. + health_ping_interval_secs: 1, + // Fast election timeouts so the metadata Raft group elects a + // leader well within the 10s convergence deadline, even under + // heavy parallel test load. + election_timeout_min_secs: 1, + election_timeout_max_secs: 2, + ..ClusterTransportTuning::default() + }) + .await } /// Spawn a 3-node cluster with a custom `ClusterTransportTuning`. @@ -29,12 +40,34 @@ impl TestCluster { ) -> Result> { let node1 = TestClusterNode::spawn_with_tuning(1, vec![], tuning.clone()).await?; - // Give node 1's transport + raft loop a moment to start - // accepting before peers dial in. - tokio::time::sleep(Duration::from_millis(200)).await; + // Wait until node 1 has bootstrapped (topology shows itself) + // before peers try to join. The old fixed 200ms sleep was too + // short under heavy host load (e.g. 500+ parallel unit tests + // sharing the same CPU pool), causing peers to dial before + // node 1's transport was ready — failing topology convergence. + let deadline = std::time::Instant::now() + Duration::from_secs(10); + while node1.topology_size() < 1 { + if std::time::Instant::now() >= deadline { + return Err("node 1 failed to bootstrap within 10s".into()); + } + tokio::time::sleep(Duration::from_millis(20)).await; + } let seeds = vec![node1.listen_addr]; let node2 = TestClusterNode::spawn_with_tuning(2, seeds.clone(), tuning.clone()).await?; + + // Wait for node 2's join to be reflected before spawning node 3. + // Under load, spawning both peers simultaneously can overwhelm the + // bootstrap leader's join handler, causing neither join to complete + // within the topology convergence deadline. 
+ let deadline = std::time::Instant::now() + Duration::from_secs(10); + while node1.topology_size() < 2 { + if std::time::Instant::now() >= deadline { + return Err("node 2 failed to join within 10s".into()); + } + tokio::time::sleep(Duration::from_millis(20)).await; + } + let node3 = TestClusterNode::spawn_with_tuning(3, seeds, tuning).await?; let cluster = Self { @@ -44,7 +77,7 @@ impl TestCluster { wait_for( "all 3 nodes report topology_size == 3", Duration::from_secs(10), - Duration::from_millis(100), + Duration::from_millis(50), || cluster.nodes.iter().all(|n| n.topology_size() == 3), ) .await; diff --git a/nodedb/tests/common/cluster_harness/node.rs b/nodedb/tests/common/cluster_harness/node.rs index db4c84af..e08461fd 100644 --- a/nodedb/tests/common/cluster_harness/node.rs +++ b/nodedb/tests/common/cluster_harness/node.rs @@ -129,6 +129,7 @@ impl TestClusterNode { &cluster_settings, transport.clone(), &data_dir_path, + &tuning, ) .await?; diff --git a/nodedb/tests/descriptor_lease_planner_integration.rs b/nodedb/tests/descriptor_lease_planner_integration.rs index a87f15a8..550434c7 100644 --- a/nodedb/tests/descriptor_lease_planner_integration.rs +++ b/nodedb/tests/descriptor_lease_planner_integration.rs @@ -16,7 +16,7 @@ use common::cluster_harness::{TestCluster, wait_for}; use nodedb_cluster::{DescriptorId, DescriptorKind}; const TENANT: u32 = 1; -const WAIT_BUDGET: Duration = Duration::from_secs(3); +const WAIT_BUDGET: Duration = Duration::from_secs(10); const POLL: Duration = Duration::from_millis(20); fn coll_id(name: &str) -> DescriptorId { diff --git a/nodedb/tests/sql_ddl_cluster.rs b/nodedb/tests/sql_ddl_cluster.rs new file mode 100644 index 00000000..640b6ae3 --- /dev/null +++ b/nodedb/tests/sql_ddl_cluster.rs @@ -0,0 +1,265 @@ +//! DDL replication correctness matrix. +//! +//! For every DDL variant that flows through the replicated metadata +//! path, this file tests: +//! +//! 1. Execute DDL on the leader → visible on every follower. +//! 2. 
Execute the inverse DDL → removal visible on every node. +//! 3. `IF NOT EXISTS` / `IF EXISTS` branches handled without error. +//! +//! Uses the 3-node `TestCluster` harness from `common/cluster_harness`. + +mod common; + +use std::time::Duration; + +use common::cluster_harness::{TestCluster, wait_for}; + +// ── Collection ─────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_drop_collection_replicates() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE COLLECTION ddl_test_coll") + .await + .expect("create"); + wait_for( + "collection visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() >= 1) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader("DROP COLLECTION ddl_test_coll") + .await + .expect("drop"); + wait_for( + "collection removed on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() == 0) + }, + ) + .await; + cluster.shutdown().await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_collection_if_not_exists() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE COLLECTION ine_coll") + .await + .expect("first create"); + wait_for( + "collection visible", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() >= 1) + }, + ) + .await; + // Second CREATE IF NOT EXISTS must succeed without error. 
+ cluster + .exec_ddl_on_any_leader("CREATE COLLECTION IF NOT EXISTS ine_coll") + .await + .expect("if not exists must not error"); + cluster.shutdown().await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_drop_collection_if_exists_missing() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + // DROP IF EXISTS on a nonexistent collection must succeed. + cluster + .exec_ddl_on_any_leader("DROP COLLECTION IF EXISTS no_such_coll") + .await + .expect("if exists on missing must not error"); + cluster.shutdown().await; +} + +// ── Sequence ───────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_drop_sequence_replicates() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE SEQUENCE ddl_test_seq START 1") + .await + .expect("create seq"); + wait_for( + "sequence visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.has_sequence(1, "ddl_test_seq")) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader("DROP SEQUENCE ddl_test_seq") + .await + .expect("drop seq"); + wait_for( + "sequence removed on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || cluster.nodes.iter().all(|n| n.sequence_count(1) == 0), + ) + .await; + cluster.shutdown().await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_sequence_if_not_exists() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE SEQUENCE ine_seq START 1") + .await + .expect("first create"); + wait_for( + "seq visible", + Duration::from_secs(10), + Duration::from_millis(50), + || cluster.nodes.iter().all(|n| n.has_sequence(1, "ine_seq")), + ) + .await; + cluster + .exec_ddl_on_any_leader("CREATE SEQUENCE IF NOT EXISTS ine_seq START 1") + .await + 
.expect("if not exists must not error"); + cluster.shutdown().await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_drop_sequence_if_exists_missing() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("DROP SEQUENCE IF EXISTS no_such_seq") + .await + .expect("if exists on missing must not error"); + cluster.shutdown().await; +} + +// ── Trigger ────────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_drop_trigger_replicates() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE COLLECTION trig_coll") + .await + .expect("create coll for trigger"); + wait_for( + "coll visible", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() >= 1) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader( + "CREATE TRIGGER ddl_test_trig AFTER INSERT ON trig_coll FOR EACH ROW BEGIN RETURN 1; END", + ) + .await + .expect("create trigger"); + wait_for( + "trigger visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.has_trigger(1, "ddl_test_trig")) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader("DROP TRIGGER ddl_test_trig ON trig_coll") + .await + .expect("drop trigger"); + wait_for( + "trigger removed on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| !n.has_trigger(1, "ddl_test_trig")) + }, + ) + .await; + cluster.shutdown().await; +} + +// ── Schedule ───────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_drop_schedule_replicates() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader( + "CREATE 
SCHEDULE ddl_test_sched CRON '0 0 * * *' AS BEGIN RETURN 1; END", + ) + .await + .expect("create schedule"); + wait_for( + "schedule visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.has_schedule(1, "ddl_test_sched")) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader("DROP SCHEDULE ddl_test_sched") + .await + .expect("drop schedule"); + wait_for( + "schedule removed on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| !n.has_schedule(1, "ddl_test_sched")) + }, + ) + .await; + cluster.shutdown().await; +}