From a620362fc9bb5a85a57eeb6c18a716705bff8d16 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Mon, 25 May 2026 13:55:25 -0300 Subject: [PATCH 01/48] Implement initial scheduler module - Add `Scheduler` struct (WIP) - Add `new_slot_ticker` --- crates/core/Cargo.toml | 2 +- crates/core/src/lib.rs | 4 ++ crates/core/src/scheduler.rs | 84 ++++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 crates/core/src/scheduler.rs diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 594b2d18..49ea305b 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -31,9 +31,9 @@ pluto-eth2util.workspace = true pluto-ssz.workspace = true ssz.workspace = true tree_hash.workspace = true +anyhow.workspace = true [dev-dependencies] -anyhow.workspace = true alloy.workspace = true clap.workspace = true rand.workspace = true diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index 5b44a216..b0ae759a 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -29,6 +29,10 @@ pub mod parsigdb; /// DutyDB — in-memory store for unsigned duty data. pub mod dutydb; +/// Resolves beacon-chain duties per epoch, ticks the slot clock, and fans +/// duties out to downstream components. +pub mod scheduler; + mod parsigex_codec; // SSZ codec operates on compile-time-constant byte sizes and offsets. // Arithmetic is bounded and casts from `usize` to `u32` are safe because all diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs new file mode 100644 index 00000000..5cb7325c --- /dev/null +++ b/crates/core/src/scheduler.rs @@ -0,0 +1,84 @@ +use pluto_eth2api::{EthBeaconNodeApiClientError, client}; +use tokio_util::sync::CancellationToken; + +use crate::types; + +/// Errors that can occur during the scheduling process. +#[derive(Debug, thiserror::Error)] +pub enum SchedulerError { + /// Beacon Node API client error. 
+ #[error("Error while fetching data from the Eth2 API: {0}")] + EthBeaconNodeApiClientError(#[from] EthBeaconNodeApiClientError), +} + +type Result = std::result::Result; + +struct Scheduler { + client: client::EthBeaconNodeApiClient, +} + +impl Scheduler { + pub fn new(client: client::EthBeaconNodeApiClient, builder_enabled: bool) -> Self { + Scheduler { client } + } +} + +/// Create a read channel that will be populated with new slots in real time. +/// It is also populated with the current slot immediately. +/// +/// The production of slots is cancelled when the provided [`CancellationToken`] +/// is cancelled. +async fn new_slot_ticker( + client: client::EthBeaconNodeApiClient, + ct: CancellationToken, +) -> Result> { + let genesis_time = client.fetch_genesis_time().await?; + let (slot_duration, slots_per_epoch) = client.fetch_slots_config().await?; + let slot_duration = chrono::Duration::from_std(slot_duration).unwrap(); + + let current_slot = move || { + let chain_age = chrono::Utc::now() - genesis_time; + let slot_ms = slot_duration.num_milliseconds(); + let slot = chain_age.num_milliseconds() / slot_ms; + let start_time = genesis_time + chrono::Duration::milliseconds(slot * slot_ms); + + types::Slot { + slot: types::SlotNumber::new(slot as u64), + time: start_time, + slots_per_epoch, + slot_duration, + } + }; + + let (tx, rx) = tokio::sync::mpsc::channel(100); + tokio::spawn(async move { + let mut slot = current_slot(); + + loop { + let wait = (slot.time - chrono::Utc::now()) + .to_std() + .unwrap_or_default(); + tokio::time::sleep(wait).await; + + // Avoid "thundering herd" problem by skipping slots if missed due + // to pause-the-world events (i.e. resources are already constrained). + if chrono::Utc::now() > slot.next_slot().time { + let actual = current_slot(); + tracing::warn!(actual_slot = %actual.slot, expect_slot = %slot.slot, "Slot(s) skipped"); + // skipCounter.inc() + slot = actual; + } + + let next_slot = slot.next_slot(); + + tokio::select! 
{ + _ = ct.cancelled() => break, + _ = tx.send(slot) => {}, + } + + slot = next_slot; + } + }); + + Ok(rx) +} From 8189f08d3a11c6fc1537789125355a0c845a69d0 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Mon, 25 May 2026 16:10:33 -0300 Subject: [PATCH 02/48] Add `wait_chain_start` - Fix duplicated code --- Cargo.lock | 1 + crates/core/Cargo.toml | 1 + crates/core/src/scheduler.rs | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 5b6a5a2a..ee75dc38 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5590,6 +5590,7 @@ dependencies = [ "alloy", "anyhow", "async-trait", + "backon", "base64", "built", "cancellation", diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 49ea305b..98955d14 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true publish.workspace = true [dependencies] +backon.workspace = true async-trait.workspace = true cancellation.workspace = true chrono.workspace = true diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 5cb7325c..b8ef3533 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -1,3 +1,6 @@ +use std::time::Duration; + +use backon::Retryable; use pluto_eth2api::{EthBeaconNodeApiClientError, client}; use tokio_util::sync::CancellationToken; @@ -82,3 +85,34 @@ async fn new_slot_ticker( Ok(rx) } + +/// Blocks until the beacon chain has started. +async fn wait_chain_start(client: pluto_eth2api::client::EthBeaconNodeApiClient) -> Result<()> { + // TODO: Duplicated from `crates/p2p/src/bootnode.rs` + /// Backoff configuration constants matching Go's expbackoff.FastConfig. 
+ const FAST_BASE_DELAY: Duration = Duration::from_millis(100); + const FAST_MAX_DELAY: Duration = Duration::from_secs(5); + const FAST_MULTIPLIER: f32 = 1.6; + + // Retry with exponential backoff + let backoff = backon::ExponentialBuilder::default() + .with_min_delay(FAST_BASE_DELAY) + .with_max_delay(FAST_MAX_DELAY) + .with_factor(FAST_MULTIPLIER) + .with_jitter(); + + let fetch = || client.fetch_genesis_time(); + let genesis_time = fetch + .retry(backoff) + .notify(|err, _| tracing::error!(err = ?err, "Failure getting genesis")) + .await?; + + let now = chrono::Utc::now(); + if now < genesis_time { + let delta = (genesis_time - now).to_std().unwrap_or_default(); + tracing::info!(genesis_time = %genesis_time, sleep = ?delta, "Sleeping until genesis time"); + tokio::time::sleep(delta).await; + } + + Ok(()) +} From 67c47fdabbdcc72b3290ca5467d2e636bfc287ec Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Mon, 25 May 2026 16:24:30 -0300 Subject: [PATCH 03/48] Add `wait_beacon_sync` --- crates/core/src/scheduler.rs | 67 ++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index b8ef3533..01469211 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -1,6 +1,6 @@ use std::time::Duration; -use backon::Retryable; +use backon::{BackoffBuilder, Retryable}; use pluto_eth2api::{EthBeaconNodeApiClientError, client}; use tokio_util::sync::CancellationToken; @@ -86,22 +86,39 @@ async fn new_slot_ticker( Ok(rx) } -/// Blocks until the beacon chain has started. -async fn wait_chain_start(client: pluto_eth2api::client::EthBeaconNodeApiClient) -> Result<()> { - // TODO: Duplicated from `crates/p2p/src/bootnode.rs` +// TODO: Duplicated from `crates/p2p/src/bootnode.rs` +fn fast_backoff() -> backon::ExponentialBuilder { /// Backoff configuration constants matching Go's expbackoff.FastConfig. 
const FAST_BASE_DELAY: Duration = Duration::from_millis(100); const FAST_MAX_DELAY: Duration = Duration::from_secs(5); const FAST_MULTIPLIER: f32 = 1.6; - // Retry with exponential backoff - let backoff = backon::ExponentialBuilder::default() + backon::ExponentialBuilder::default() .with_min_delay(FAST_BASE_DELAY) .with_max_delay(FAST_MAX_DELAY) .with_factor(FAST_MULTIPLIER) - .with_jitter(); + .without_max_times() + .with_jitter() +} +fn default_backoff() -> backon::ExponentialBuilder { + /// Backoff configuration constants matching Go's expbackoff.DefaultConfig. + const DEFAULT_BASE_DELAY: Duration = Duration::from_secs(1); + const DEFAULT_MAX_DELAY: Duration = Duration::from_secs(120); + const DEFAULT_MULTIPLIER: f32 = 1.6; + + backon::ExponentialBuilder::default() + .with_min_delay(DEFAULT_BASE_DELAY) + .with_max_delay(DEFAULT_MAX_DELAY) + .with_factor(DEFAULT_MULTIPLIER) + .without_max_times() + .with_jitter() +} + +/// Blocks until the beacon chain has started. +async fn wait_chain_start(client: pluto_eth2api::client::EthBeaconNodeApiClient) -> Result<()> { let fetch = || client.fetch_genesis_time(); + let backoff = fast_backoff(); let genesis_time = fetch .retry(backoff) .notify(|err, _| tracing::error!(err = ?err, "Failure getting genesis")) @@ -116,3 +133,39 @@ async fn wait_chain_start(client: pluto_eth2api::client::EthBeaconNodeApiClient) Ok(()) } + +/// Blocks until the beacon node is synced. 
+async fn wait_beacon_sync(client: pluto_eth2api::client::EthBeaconNodeApiClient) -> Result<()> { + let fetch = || client.get_syncing_status(pluto_eth2api::GetSyncingStatusRequest {}); + let fetch_backoff = fast_backoff(); + + let mut is_syncing_backoff = default_backoff().build(); + + loop { + let response: pluto_eth2api::GetSyncingStatusResponse = fetch + .retry(fetch_backoff) + .notify(|err, _| tracing::error!(err = ?err, "Failure getting syncing status")) + .await + .map_err(pluto_eth2api::EthBeaconNodeApiClientError::RequestError)?; + + let state = match response { + pluto_eth2api::GetSyncingStatusResponse::Ok(syncing) => Ok(syncing.data), + _ => Err(pluto_eth2api::EthBeaconNodeApiClientError::UnexpectedResponse), + }?; + + if state.is_syncing { + tracing::info!( + distance = state.sync_distance, + "Waiting for beacon node to sync" + ); + let duration = is_syncing_backoff + .next() + .expect("Infinite backoff should never return None"); + tokio::time::sleep(duration).await; + } else { + break; + } + } + + Ok(()) +} From 3d19f19da6f25853d9591344e9a473b0b80c54be Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Mon, 25 May 2026 16:49:13 -0300 Subject: [PATCH 04/48] Move `valcache` from `app` to `core` - `valcache` is required by scheduler - Fixes cyclical dependency issue --- crates/app/src/eth2wrap/mod.rs | 3 --- crates/core/src/lib.rs | 3 +++ crates/{app/src/eth2wrap => core/src}/valcache.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename crates/{app/src/eth2wrap => core/src}/valcache.rs (99%) diff --git a/crates/app/src/eth2wrap/mod.rs b/crates/app/src/eth2wrap/mod.rs index 51970966..cd3ab232 100644 --- a/crates/app/src/eth2wrap/mod.rs +++ b/crates/app/src/eth2wrap/mod.rs @@ -1,5 +1,2 @@ /// Validate Beacon node versions pub mod version; - -/// Cache of Validators retrieved from the Beacon node -pub mod valcache; diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index b0ae759a..ad875f1c 100644 --- a/crates/core/src/lib.rs +++ 
b/crates/core/src/lib.rs @@ -29,6 +29,9 @@ pub mod parsigdb; /// DutyDB — in-memory store for unsigned duty data. pub mod dutydb; +/// Cache of Validators retrieved from the Beacon node +pub mod valcache; + /// Resolves beacon-chain duties per epoch, ticks the slot clock, and fans /// duties out to downstream components. pub mod scheduler; diff --git a/crates/app/src/eth2wrap/valcache.rs b/crates/core/src/valcache.rs similarity index 99% rename from crates/app/src/eth2wrap/valcache.rs rename to crates/core/src/valcache.rs index 3ec4d6b8..4d562153 100644 --- a/crates/app/src/eth2wrap/valcache.rs +++ b/crates/core/src/valcache.rs @@ -1,4 +1,4 @@ -use pluto_core::types::PubKey; +use crate::types::PubKey; use pluto_eth2api::{ EthBeaconNodeApiClient, EthBeaconNodeApiClientError, GetStateValidatorsResponseResponse, GetStateValidatorsResponseResponseDatum, PostStateValidatorsRequest, From e60be59bb919306c6df33b1dcc0d316c73df2041 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Mon, 25 May 2026 17:12:42 -0300 Subject: [PATCH 05/48] Add `resolve_active_validators` - Use `valcache` to get the list of active validators. --- crates/core/src/scheduler.rs | 63 ++++++++++++++++++++++++++++++++++-- crates/core/src/types.rs | 5 ++- 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 01469211..d25e2c5d 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -4,7 +4,7 @@ use backon::{BackoffBuilder, Retryable}; use pluto_eth2api::{EthBeaconNodeApiClientError, client}; use tokio_util::sync::CancellationToken; -use crate::types; +use crate::{types, valcache}; /// Errors that can occur during the scheduling process. #[derive(Debug, thiserror::Error)] @@ -12,17 +12,30 @@ pub enum SchedulerError { /// Beacon Node API client error. #[error("Error while fetching data from the Eth2 API: {0}")] EthBeaconNodeApiClientError(#[from] EthBeaconNodeApiClientError), + + /// Validator cache error. 
+ #[error("Error while accessing the validator cache: {0}")] + ValidatorCacheError(#[from] valcache::ValidatorCacheError), + + /// Public key error. + #[error("Error while processing public key: {0}")] + PubKeyError(#[from] types::PubKeyError), + + /// Invalid epoch error. + #[error("Invalid epoch")] + InvalidEpoch(#[from] std::num::ParseIntError), } type Result = std::result::Result; struct Scheduler { client: client::EthBeaconNodeApiClient, + valcache: valcache::ValidatorCache, } impl Scheduler { - pub fn new(client: client::EthBeaconNodeApiClient, builder_enabled: bool) -> Self { - Scheduler { client } + pub fn new(client: client::EthBeaconNodeApiClient, valcache: valcache::ValidatorCache) -> Self { + Scheduler { client, valcache } } } @@ -86,6 +99,50 @@ async fn new_slot_ticker( Ok(rx) } +struct Validator { + pubkey: types::PubKey, + v_idx: pluto_eth2api::spec::phase0::ValidatorIndex, +} + +/// Returns the active validators (including their validator index) for the +/// epoch. +async fn resolve_active_validators( + epoch: u64, + valcache: &valcache::ValidatorCache, +) -> Result> { + let (_, complete) = valcache.get_by_head().await?; + + let mut validators = vec![]; + for (index, val) in complete.iter() { + let pubkey = types::PubKey::try_from(val.validator.pubkey.as_str())?; + + // TODO: Support `submitter` + // submitter(pubkey, v.Balance, val.status.to_string()) + + // Check for active validators for the given epoch. + // The activation epoch needs to be checked in cases where this function is + // called before the epoch starts. 
+ if !val.status.is_active() { + let activation_epoch = val + .validator + .activation_epoch + .parse::() + .map_err(SchedulerError::InvalidEpoch)?; + + if activation_epoch != epoch { + continue; + } + } + + validators.push(Validator { + pubkey, + v_idx: *index, + }); + } + + Ok(validators) +} + // TODO: Duplicated from `crates/p2p/src/bootnode.rs` fn fast_backoff() -> backon::ExponentialBuilder { /// Backoff configuration constants matching Go's expbackoff.FastConfig. diff --git a/crates/core/src/types.rs b/crates/core/src/types.rs index 0ecfd477..48386e96 100644 --- a/crates/core/src/types.rs +++ b/crates/core/src/types.rs @@ -356,11 +356,14 @@ impl From<[u8; PK_LEN]> for PubKey { } /// Public key error type -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)] pub enum PubKeyError { /// Invalid public key length. + #[error("Invalid public key length")] InvalidLength, + /// Invalid public key string. + #[error("Invalid public key string")] InvalidString, } From 36ed2ba5505a3c584106cb8929cb5c8327d55c9e Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Tue, 26 May 2026 12:52:10 -0300 Subject: [PATCH 06/48] Implement `run` (WIP) --- crates/core/src/scheduler.rs | 62 +++++++++++++++++++++++++++++--- crates/core/src/types.rs | 69 ++++-------------------------------- 2 files changed, 65 insertions(+), 66 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index d25e2c5d..1dfc0da3 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -31,11 +31,65 @@ type Result = std::result::Result; struct Scheduler { client: client::EthBeaconNodeApiClient, valcache: valcache::ValidatorCache, + + slot_subs: Vec>, } impl Scheduler { pub fn new(client: client::EthBeaconNodeApiClient, valcache: valcache::ValidatorCache) -> Self { - Scheduler { client, valcache } + Scheduler { + client, + valcache, + slot_subs: Vec::new(), + } + } + + /// Subscribes a callback function for triggered 
slots. + /// Note this should be called *before* [`Scheduler::run`]. + pub async fn subscribe_slots( + &mut self, + f: impl Fn(&types::Slot) -> Result<()> + Send + 'static, + label: impl AsRef + Send + 'static, + ) { + let (tx, mut rx) = tokio::sync::mpsc::channel(100); + self.slot_subs.push(tx); + + tokio::spawn(async move { + while let Some(slot) = rx.recv().await { + if let Err(err) = f(&slot) { + tracing::error!(err = ?err, slot = %slot.slot, label = label.as_ref(), "Emit scheduled slot event"); + } + } + }); + } + + pub async fn run(&mut self, ct: CancellationToken) -> Result<()> { + wait_chain_start(&self.client).await?; + wait_beacon_sync(&self.client).await?; + + let mut slot_ticker = new_slot_ticker(&self.client, ct.clone()).await?; + + loop { + tokio::select! { + _ = ct.cancelled() => break, + + Some(slot) = slot_ticker.recv() => { + tracing::info!(slot = %slot.slot, "Slot ticked"); + + // TODO: metrics + // instrumentSlot(slot) + + // equivalent to `emitCoreSlot` + for sub in &self.slot_subs { + let _ = sub.send(slot.clone()).await; + } + + // self.schedule_slot() + }, + } + } + + todo!() } } @@ -45,7 +99,7 @@ impl Scheduler { /// The production of slots is cancelled when the provided [`CancellationToken`] /// is cancelled. async fn new_slot_ticker( - client: client::EthBeaconNodeApiClient, + client: &client::EthBeaconNodeApiClient, ct: CancellationToken, ) -> Result> { let genesis_time = client.fetch_genesis_time().await?; @@ -173,7 +227,7 @@ fn default_backoff() -> backon::ExponentialBuilder { } /// Blocks until the beacon chain has started. 
-async fn wait_chain_start(client: pluto_eth2api::client::EthBeaconNodeApiClient) -> Result<()> { +async fn wait_chain_start(client: &pluto_eth2api::client::EthBeaconNodeApiClient) -> Result<()> { let fetch = || client.fetch_genesis_time(); let backoff = fast_backoff(); let genesis_time = fetch @@ -192,7 +246,7 @@ async fn wait_chain_start(client: pluto_eth2api::client::EthBeaconNodeApiClient) } /// Blocks until the beacon node is synced. -async fn wait_beacon_sync(client: pluto_eth2api::client::EthBeaconNodeApiClient) -> Result<()> { +async fn wait_beacon_sync(client: &pluto_eth2api::client::EthBeaconNodeApiClient) -> Result<()> { let fetch = || client.get_syncing_status(pluto_eth2api::GetSyncingStatusRequest {}); let fetch_backoff = fast_backoff(); diff --git a/crates/core/src/types.rs b/crates/core/src/types.rs index 48386e96..21294246 100644 --- a/crates/core/src/types.rs +++ b/crates/core/src/types.rs @@ -1,3 +1,5 @@ +#![allow(missing_docs)] + //! Types for the Charon core. use std::{any::Any, collections::HashMap, fmt::Display, iter}; @@ -409,60 +411,13 @@ impl AsRef<[u8]> for PubKey { // todo: add toEth2Format for the pub key // https://github.com/ObolNetwork/charon/blob/b3008103c5429b031b63518195f4c49db4e9a68d/core/types.go#L311 -/// Duty definition type -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct DutyDefinition(T); - -impl DutyDefinition -where - T: Clone + Serialize + StdDebug, -{ - /// Create a new duty definition. - pub fn new(duty_definition: T) -> Self { - Self(duty_definition) - } +pub enum DutyDefinition { + Attester(), + Proposer(), + SyncCommittee(), } -/// Duty definition set -#[derive(Debug, Default, Clone, PartialEq, Eq)] -pub struct DutyDefinitionSet(HashMap>) -where - T: Clone + Serialize + StdDebug; - -impl DutyDefinitionSet -where - T: Clone + Serialize + StdDebug, -{ - /// Create a new duty definition set. - pub fn new() -> Self { - Self(HashMap::default()) - } - - /// Get a duty definition by duty type. 
- pub fn get(&self, duty_type: &DutyType) -> Option<&DutyDefinition> { - self.0.get(duty_type) - } - - /// Insert a duty definition. - pub fn insert(&mut self, duty_type: DutyType, duty_definition: DutyDefinition) { - self.0.insert(duty_type, duty_definition); - } - - /// Remove a duty definition by duty type. - pub fn remove(&mut self, duty_type: &DutyType) -> Option> { - self.0.remove(duty_type) - } - - /// Inner duty definition set. - pub fn inner(&self) -> &HashMap> { - &self.0 - } - - /// Inner duty definition set. - pub fn inner_mut(&mut self) -> &mut HashMap> { - &mut self.0 - } -} +pub type DutyDefinitionSet = HashMap; /// Unsigned data type #[derive(Debug, Clone, PartialEq, Eq)] @@ -1005,16 +960,6 @@ mod tests { assert_eq!(pk.abbreviated(), "2a2_a2a"); } - #[test] - fn duty_definition_set() { - let mut duty_definition_set = DutyDefinitionSet::new(); - duty_definition_set.insert(DutyType::Proposer, DutyDefinition::new(DutyType::Proposer)); - assert_eq!( - duty_definition_set.get(&DutyType::Proposer), - Some(&DutyDefinition::new(DutyType::Proposer)) - ); - } - #[test] fn unsigned_data_set() { let mut unsigned_data_set = UnsignedDataSet::new(); From 218be9f83df8ae6bd1a4a71e3c2ba16f8867f537 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Tue, 26 May 2026 14:15:06 -0300 Subject: [PATCH 07/48] Use broadcast channel for `emitCoreSlot` --- crates/core/src/scheduler.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 1dfc0da3..f3a0059e 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -32,7 +32,7 @@ struct Scheduler { client: client::EthBeaconNodeApiClient, valcache: valcache::ValidatorCache, - slot_subs: Vec>, + slot_broadcast: tokio::sync::broadcast::Sender, } impl Scheduler { @@ -40,7 +40,7 @@ impl Scheduler { Scheduler { client, valcache, - slot_subs: Vec::new(), + slot_broadcast: tokio::sync::broadcast::channel(100).0, } } 
@@ -51,11 +51,10 @@ impl Scheduler { f: impl Fn(&types::Slot) -> Result<()> + Send + 'static, label: impl AsRef + Send + 'static, ) { - let (tx, mut rx) = tokio::sync::mpsc::channel(100); - self.slot_subs.push(tx); + let mut rx = self.slot_broadcast.subscribe(); tokio::spawn(async move { - while let Some(slot) = rx.recv().await { + while let Ok(slot) = rx.recv().await { if let Err(err) = f(&slot) { tracing::error!(err = ?err, slot = %slot.slot, label = label.as_ref(), "Emit scheduled slot event"); } @@ -74,16 +73,18 @@ impl Scheduler { _ = ct.cancelled() => break, Some(slot) = slot_ticker.recv() => { - tracing::info!(slot = %slot.slot, "Slot ticked"); + tracing::debug!(slot = %slot.slot, "Slot ticked"); // TODO: metrics // instrumentSlot(slot) - // equivalent to `emitCoreSlot` - for sub in &self.slot_subs { - let _ = sub.send(slot.clone()).await; + // ~ `emitCoreSlot` + if self.slot_broadcast.send(slot).is_err() { + tracing::debug!("No active subscribers for slot events, closing scheduler"); + break; } + // self.schedule_slot() }, } From a1b62b43ad8194b253e9e0731c76bd08b57ee318 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Tue, 26 May 2026 14:53:41 -0300 Subject: [PATCH 08/48] Add `Inner` - Manages internal state of the scheduler - Stored in a `Mutex` for thread safety --- crates/core/src/scheduler.rs | 90 +++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index f3a0059e..7136a09e 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -1,11 +1,19 @@ -use std::time::Duration; +use std::{ + collections::{HashMap, hash_map::Entry}, + time::Duration, +}; use backon::{BackoffBuilder, Retryable}; use pluto_eth2api::{EthBeaconNodeApiClientError, client}; +use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; use crate::{types, valcache}; +// Trim cached duties after 3 epochs. 
Note inclusion delay calculation requires +// now-32 slot duties. +const TRIM_EPOCH_OFFSET: u64 = 3; + /// Errors that can occur during the scheduling process. #[derive(Debug, thiserror::Error)] pub enum SchedulerError { @@ -33,6 +41,80 @@ struct Scheduler { valcache: valcache::ValidatorCache, slot_broadcast: tokio::sync::broadcast::Sender, + + storage: Mutex, +} + +struct Inner { + resolved_epoch: u64, + resolving_epoch: u64, + duties: HashMap, + duties_by_epoch: HashMap>, +} + +impl Inner { + fn is_resolving_epoch(&self, epoch: u64) -> bool { + if self.resolving_epoch == u64::MAX { + return false; + } + + self.resolving_epoch == epoch + } + + fn is_epoch_resolved(&self, epoch: u64) -> bool { + if self.resolved_epoch == u64::MAX { + return false; + } + + self.resolved_epoch >= epoch + } + + fn is_epoch_trimmed(&self, epoch: u64) -> bool { + if self.resolved_epoch == u64::MAX { + return false; + } + + epoch >= self.resolved_epoch + TRIM_EPOCH_OFFSET + } + + fn trim_duties(&mut self, epoch: u64) { + let duties = self.duties_by_epoch.remove(&epoch); + if let Some(duties) = duties + && duties.len() > 0 + { + for duty in duties { + self.duties.remove(&duty); + } + } + } + + /// Inserts a duty definition for a given pubkey. + /// + /// Returns true if it's set, false if it was already set. 
+ fn set_duty_definition( + &mut self, + duty: types::Duty, + epoch: u64, + pub_key: types::PubKey, + definition: types::DutyDefinition, + ) -> bool { + let def_set = self + .duties + .entry(duty.clone()) + .or_insert(HashMap::default()); + match def_set.entry(pub_key) { + Entry::Occupied(_) => return false, + Entry::Vacant(entry) => { + entry.insert(definition); + } + }; + self.duties_by_epoch + .entry(epoch) + .or_insert(Vec::new()) + .push(duty); + + true + } } impl Scheduler { @@ -41,6 +123,12 @@ impl Scheduler { client, valcache, slot_broadcast: tokio::sync::broadcast::channel(100).0, + storage: Mutex::new(Inner { + resolved_epoch: u64::MAX, + resolving_epoch: u64::MAX, + duties: HashMap::new(), + duties_by_epoch: HashMap::new(), + }), } } From efec692c92f3c512518531fd0b95b42f9a1d7b37 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Tue, 26 May 2026 17:02:45 -0300 Subject: [PATCH 09/48] Add `schedule_slot` (WIP) --- crates/core/src/scheduler.rs | 48 ++++++++++++++++++++++++++++++------ crates/core/src/types.rs | 19 ++++++++++++++ 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 7136a09e..b37ba8e9 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -166,18 +166,52 @@ impl Scheduler { // TODO: metrics // instrumentSlot(slot) - // ~ `emitCoreSlot` - if self.slot_broadcast.send(slot).is_err() { - tracing::debug!("No active subscribers for slot events, closing scheduler"); - break; - } + // NOTE: Ignore send errors since it just means there are no subscribers. 
+ let _ = self.slot_broadcast.send(slot.clone()); - - // self.schedule_slot() + self.schedule_slot(slot).await; }, } } + Ok(()) + } + + async fn schedule_slot(&mut self, slot: types::Slot) { + let resolved_epoch = self.storage.lock().await.resolved_epoch; + if resolved_epoch != slot.epoch() { + tracing::debug!(slot = %slot.slot, epoch = %slot.epoch(), "Resolving duties for slot"); + + if let Err(err) = self.resolve_duties(slot.clone()).await { + tracing::warn!(err = ?err, slot = %slot.slot, "Resolving duties error (retrying next slot)"); + } + } + + for duty_type in types::DutyType::all() { + let duty = types::Duty { + duty_type, + slot: slot.slot, + }; + + let Some(def_set) = self.storage.lock().await.duties.get(&duty) else { + // Nothing for this duty. + continue; + }; + + // TODO: + // Trigger duty async + } + + if slot.last_in_epoch() { + if let Err(err) = self.resolve_duties(slot.next_slot()).await { + tracing::warn!(err = ?err, slot = %slot.slot, "Resolving duties error (retrying next slot)"); + } + } + + todo!() + } + + async fn resolve_duties(&mut self, slot: types::Slot) -> Result<()> { todo!() } } diff --git a/crates/core/src/types.rs b/crates/core/src/types.rs index 21294246..30546d10 100644 --- a/crates/core/src/types.rs +++ b/crates/core/src/types.rs @@ -77,6 +77,25 @@ impl DutyType { pub fn never_expires(&self) -> bool { matches!(self, DutyType::Exit | DutyType::BuilderRegistration) } + + /// All valid duty types. + pub fn all() -> [DutyType; 13] { + [ + DutyType::Proposer, + DutyType::Attester, + DutyType::Signature, + DutyType::Exit, + DutyType::BuilderProposer, + DutyType::BuilderRegistration, + DutyType::Randao, + DutyType::PrepareAggregator, + DutyType::Aggregator, + DutyType::SyncMessage, + DutyType::PrepareSyncContribution, + DutyType::SyncContribution, + DutyType::InfoSync, + ] + } } /// Error type for duty type conversion. 
From 55a20303824479e087aed153bc8a7b430bbb4488 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Tue, 26 May 2026 17:53:19 -0300 Subject: [PATCH 10/48] Implement `schedule_slot` (WIP) --- crates/core/src/scheduler.rs | 113 +++++++++++++++++++++++++---------- crates/core/src/types.rs | 26 +++++++- 2 files changed, 106 insertions(+), 33 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index b37ba8e9..864d729e 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -1,12 +1,13 @@ use std::{ collections::{HashMap, hash_map::Entry}, + ops::Div, time::Duration, }; use backon::{BackoffBuilder, Retryable}; use pluto_eth2api::{EthBeaconNodeApiClientError, client}; use tokio::sync::Mutex; -use tokio_util::sync::CancellationToken; +use tokio_util::{future::FutureExt, sync::CancellationToken}; use crate::{types, valcache}; @@ -41,6 +42,7 @@ struct Scheduler { valcache: valcache::ValidatorCache, slot_broadcast: tokio::sync::broadcast::Sender, + duty_broadcast: tokio::sync::broadcast::Sender<(types::Duty, types::DutyDefinitionSet)>, storage: Mutex, } @@ -98,10 +100,7 @@ impl Inner { pub_key: types::PubKey, definition: types::DutyDefinition, ) -> bool { - let def_set = self - .duties - .entry(duty.clone()) - .or_insert(HashMap::default()); + let def_set = self.duties.entry(duty.clone()).or_default(); match def_set.entry(pub_key) { Entry::Occupied(_) => return false, Entry::Vacant(entry) => { @@ -123,6 +122,7 @@ impl Scheduler { client, valcache, slot_broadcast: tokio::sync::broadcast::channel(100).0, + duty_broadcast: tokio::sync::broadcast::channel(100).0, storage: Mutex::new(Inner { resolved_epoch: u64::MAX, resolving_epoch: u64::MAX, @@ -132,24 +132,6 @@ impl Scheduler { } } - /// Subscribes a callback function for triggered slots. - /// Note this should be called *before* [`Scheduler::run`]. 
- pub async fn subscribe_slots( - &mut self, - f: impl Fn(&types::Slot) -> Result<()> + Send + 'static, - label: impl AsRef + Send + 'static, - ) { - let mut rx = self.slot_broadcast.subscribe(); - - tokio::spawn(async move { - while let Ok(slot) = rx.recv().await { - if let Err(err) = f(&slot) { - tracing::error!(err = ?err, slot = %slot.slot, label = label.as_ref(), "Emit scheduled slot event"); - } - } - }); - } - pub async fn run(&mut self, ct: CancellationToken) -> Result<()> { wait_chain_start(&self.client).await?; wait_beacon_sync(&self.client).await?; @@ -166,10 +148,10 @@ impl Scheduler { // TODO: metrics // instrumentSlot(slot) - // NOTE: Ignore send errors since it just means there are no subscribers. + // NOTE: Ignore send errors, it means that there are no subscribers. let _ = self.slot_broadcast.send(slot.clone()); - self.schedule_slot(slot).await; + self.schedule_slot(slot, ct.clone()).await; }, } } @@ -177,7 +159,43 @@ impl Scheduler { Ok(()) } - async fn schedule_slot(&mut self, slot: types::Slot) { + /// Subscribes a callback function for triggered slots. + /// Note this should be called *before* [`Scheduler::run`]. + pub async fn subscribe_slots( + &mut self, + f: impl Fn(&types::Slot) -> Result<()> + Send + 'static, + label: impl AsRef + Send + 'static, + ) { + let mut rx = self.slot_broadcast.subscribe(); + + tokio::spawn(async move { + while let Ok(slot) = rx.recv().await { + if let Err(err) = f(&slot) { + tracing::error!(err = ?err, slot = %slot.slot, label = label.as_ref(), "Emit scheduled slot event"); + } + } + }); + } + + /// Subscribes a callback function for triggered duties. + /// NOTE: this should be called *before* [`Scheduler::run`]. 
+ pub async fn subscribe_duties( + &mut self, + f: impl Fn(&types::Duty, &types::DutyDefinitionSet) -> Result<()> + Send + 'static, + label: impl AsRef + Send + 'static, + ) { + let mut rx = self.duty_broadcast.subscribe(); + + tokio::spawn(async move { + while let Ok((duty, set)) = rx.recv().await { + if let Err(err) = f(&duty, &set) { + tracing::error!(err = ?err, label = label.as_ref(), "Trigger duty subscriber error"); + } + } + }); + } + + async fn schedule_slot(&mut self, slot: types::Slot, ct: CancellationToken) { let resolved_epoch = self.storage.lock().await.resolved_epoch; if resolved_epoch != slot.epoch() { tracing::debug!(slot = %slot.slot, epoch = %slot.epoch(), "Resolving duties for slot"); @@ -193,13 +211,34 @@ impl Scheduler { slot: slot.slot, }; - let Some(def_set) = self.storage.lock().await.duties.get(&duty) else { - // Nothing for this duty. - continue; + let def_set = { + let storage = self.storage.lock().await; + let Some(def_set) = storage.duties.get(&duty) else { + // Nothing for this duty. + continue; + }; + + def_set.clone() }; - // TODO: - // Trigger duty async + let ct = ct.clone(); + let slot = slot.clone(); + let broadcast = self.duty_broadcast.clone(); + tokio::spawn(async move { + if let None = delay_slot_offset(&slot, &duty) + .with_cancellation_token_owned(ct) + .await + { + // Cancelled early + return; + } + + // TODO: + // instrument_duty(duty, def_set); + + // NOTE: Ignore send errors, it means that there are no subscribers. + let _ = broadcast.send((duty.clone(), def_set.clone())); + }); } if slot.last_in_epoch() { @@ -403,3 +442,15 @@ async fn wait_beacon_sync(client: &pluto_eth2api::client::EthBeaconNodeApiClient Ok(()) } + +/// Blocks until the slot offset for the duty has been reached. 
+async fn delay_slot_offset(slot: &types::Slot, duty: &types::Duty) { + let to_sleep = match duty.duty_type { + types::DutyType::Attester => slot.slot_duration.div(3) * 1, + types::DutyType::Aggregator => slot.slot_duration.div(3) * 2, + types::DutyType::SyncContribution => slot.slot_duration.div(3) * 2, + _ => return, + }; + + tokio::time::sleep(to_sleep.to_std().unwrap_or_default()).await; +} diff --git a/crates/core/src/types.rs b/crates/core/src/types.rs index 30546d10..dfc122fc 100644 --- a/crates/core/src/types.rs +++ b/crates/core/src/types.rs @@ -2,7 +2,13 @@ //! Types for the Charon core. -use std::{any::Any, collections::HashMap, fmt::Display, iter}; +use std::{ + any::Any, + collections::HashMap, + fmt::Display, + iter, + ops::{Deref, DerefMut}, +}; use chrono::{DateTime, Duration, Utc}; use dyn_clone::DynClone; @@ -430,13 +436,29 @@ impl AsRef<[u8]> for PubKey { // todo: add toEth2Format for the pub key // https://github.com/ObolNetwork/charon/blob/b3008103c5429b031b63518195f4c49db4e9a68d/core/types.go#L311 +#[derive(Debug, Clone, PartialEq, Eq)] pub enum DutyDefinition { Attester(), Proposer(), SyncCommittee(), } -pub type DutyDefinitionSet = HashMap; +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct DutyDefinitionSet(HashMap); + +impl Deref for DutyDefinitionSet { + type Target = HashMap; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for DutyDefinitionSet { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} /// Unsigned data type #[derive(Debug, Clone, PartialEq, Eq)] From ef1e93286883ed245bac0dd1ebc9315458aaed87 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Tue, 26 May 2026 17:58:00 -0300 Subject: [PATCH 11/48] Ignore `dead_code` during development --- crates/core/src/scheduler.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 864d729e..8529b6d6 100644 --- a/crates/core/src/scheduler.rs +++ 
b/crates/core/src/scheduler.rs @@ -1,3 +1,5 @@ +#![allow(dead_code, reason = "wip")] + use std::{ collections::{HashMap, hash_map::Entry}, ops::Div, From 60154661a4babbddf52bdab639c4121de22b7799 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Tue, 26 May 2026 17:59:29 -0300 Subject: [PATCH 12/48] Cleanup TODOs --- crates/core/src/scheduler.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 8529b6d6..49bf6692 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -147,8 +147,8 @@ impl Scheduler { Some(slot) = slot_ticker.recv() => { tracing::debug!(slot = %slot.slot, "Slot ticked"); - // TODO: metrics - // instrumentSlot(slot) + // TODO: + // instrument_slot(slot) // NOTE: Ignore send errors, it means that there are no subscribers. let _ = self.slot_broadcast.send(slot.clone()); @@ -248,8 +248,6 @@ impl Scheduler { tracing::warn!(err = ?err, slot = %slot.slot, "Resolving duties error (retrying next slot)"); } } - - todo!() } async fn resolve_duties(&mut self, slot: types::Slot) -> Result<()> { @@ -334,8 +332,8 @@ async fn resolve_active_validators( for (index, val) in complete.iter() { let pubkey = types::PubKey::try_from(val.validator.pubkey.as_str())?; - // TODO: Support `submitter` - // submitter(pubkey, v.Balance, val.status.to_string()) + // TODO: + // submitter(pubkey, val.balance, val.status.to_string()) // Check for active validators for the given epoch. 
// The activation epoch needs to be checked in cases where this function is From 0ccdd1e06589c99d5cd3657d3107469251122e17 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Wed, 27 May 2026 17:21:07 -0300 Subject: [PATCH 13/48] Resolve Attester duties --- crates/core/src/scheduler.rs | 155 ++++++++++++++++++++++++++++++++++- crates/core/src/types.rs | 54 +++++++++++- 2 files changed, 204 insertions(+), 5 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 49bf6692..eef982de 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -35,6 +35,15 @@ pub enum SchedulerError { /// Invalid epoch error. #[error("Invalid epoch")] InvalidEpoch(#[from] std::num::ParseIntError), + + /// Invalid attester duty pubkey. + #[error("Invalid attester duty pubkey: expected {expected}, got {actual}")] + InvalidAttesterDutyPubkey { + /// Expected public key. + expected: types::PubKey, + /// Actual public key. + actual: types::PubKey, + }, } type Result = std::result::Result; @@ -251,7 +260,73 @@ impl Scheduler { } async fn resolve_duties(&mut self, slot: types::Slot) -> Result<()> { - todo!() + async fn inner(s: &mut Scheduler, slot: types::Slot) -> Result<()> { + let vals = resolve_active_validators(slot.epoch(), &s.valcache).await?; + + // TODO: + // activeValsGauge.Set(float64(len(vals))) + + if vals.is_empty() { + tracing::info!(slot = %slot.slot, "No active validators for slot"); + s.storage.lock().await.resolved_epoch = slot.epoch(); + return Ok(()); + } + + // Resolve Attester duties + { + let att_duties = { + let mut att_duties = fetch_attester_duties(&slot, &vals, &s.client).await?; + att_duties.sort_by_key(|ad| ad.slot); + att_duties + }; + let mut storage = s.storage.lock().await; + + for att_duty in att_duties.into_iter() { + if !storage.set_duty_definition( + types::Duty::new_attester_duty(att_duty.slot), + slot.epoch(), + att_duty.pubkey, + types::DutyDefinition::Attester(att_duty.clone()), + ) { + continue; 
+ } + + tracing::info!( + slot = %att_duty.slot, + vidx = %att_duty.v_idx, + pubkey = %att_duty.pubkey, + epoch = %slot.epoch(), + "Resolved attester duty" + ); + + // Schedule Aggregator duty as well + let agg_duty = types::Duty::new_aggregator_duty(att_duty.slot); + if !storage.set_duty_definition( + agg_duty, + slot.epoch(), + att_duty.pubkey, + types::DutyDefinition::Attester(att_duty), + ) { + continue; + } + } + } + + todo!(); + + let mut storage = s.storage.lock().await; + storage.resolved_epoch = slot.epoch(); + storage.trim_duties(slot.epoch() - TRIM_EPOCH_OFFSET); + + Ok(()) + } + + // TODO: Improve the poor-man's `defer` + self.storage.lock().await.resolving_epoch = slot.epoch(); + let res = inner(self, slot).await; + self.storage.lock().await.resolving_epoch = u64::MAX; + + res } } @@ -454,3 +529,81 @@ async fn delay_slot_offset(slot: &types::Slot, duty: &types::Duty) { tokio::time::sleep(to_sleep.to_std().unwrap_or_default()).await; } + +/// Fetches the attester duties for the given slot and validators, and validates +/// that the returned duties match the expected validators. 
+async fn fetch_attester_duties( + slot: &types::Slot, + validators: &Vec, + client: &client::EthBeaconNodeApiClient, +) -> Result> { + let req = pluto_eth2api::GetAttesterDutiesRequest::builder() + .epoch(slot.epoch().to_string()) + .body(validators.iter().map(|v| v.v_idx.to_string()).collect()) + .build() + .map_err(pluto_eth2api::EthBeaconNodeApiClientError::RequestError)?; + let resp = client + .get_attester_duties(req) + .await + .map_err(pluto_eth2api::EthBeaconNodeApiClientError::RequestError)?; + + let att_duties: Vec = match resp { + pluto_eth2api::GetAttesterDutiesResponse::Ok(duties) => duties + .data + .into_iter() + .map(|d| { + d.try_into() + .map_err(|_| pluto_eth2api::EthBeaconNodeApiClientError::UnexpectedResponse) + }) + .collect::, _>>(), + _ => Err(pluto_eth2api::EthBeaconNodeApiClientError::UnexpectedResponse), + }?; + + let mut remaining = validators + .iter() + .map(|v| (v.v_idx, true)) + .collect::>(); + + let mut result = vec![]; + for att_duty in att_duties.into_iter() { + remaining.remove(&att_duty.v_idx); + + if att_duty.slot < slot.slot { + // Skip duties for earlier slots in initial epoch. 
+ continue; + } + + let Some(pubkey) = validators + .iter() + .find(|v| v.v_idx == att_duty.v_idx) + .map(|v| v.pubkey) + else { + tracing::warn!( + vidx = att_duty.v_idx, + slot = %slot.slot, + "Ignoring unexpected attester duty" + ); + continue; + }; + + if pubkey != att_duty.pubkey { + return Err(SchedulerError::InvalidAttesterDutyPubkey { + expected: pubkey, + actual: att_duty.pubkey, + }); + } + + result.push(att_duty); + } + + if remaining.len() > 0 { + tracing::warn!( + slot = %slot.slot, + epoch = %slot.epoch(), + validator_indexes = ?remaining, + "Missing attester duties", + ); + } + + Ok(result) +} diff --git a/crates/core/src/types.rs b/crates/core/src/types.rs index dfc122fc..71ae718c 100644 --- a/crates/core/src/types.rs +++ b/crates/core/src/types.rs @@ -161,7 +161,7 @@ impl TryFrom for DutyType { } /// SlotNumber struct -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct SlotNumber(u64); impl Display for SlotNumber { @@ -436,14 +436,60 @@ impl AsRef<[u8]> for PubKey { // todo: add toEth2Format for the pub key // https://github.com/ObolNetwork/charon/blob/b3008103c5429b031b63518195f4c49db4e9a68d/core/types.go#L311 -#[derive(Debug, Clone, PartialEq, Eq)] +/// Errors in [`DutyDefinition`]. +#[derive(Debug, thiserror::Error)] +pub enum DutyDefinitionError { + /// Invalid field when parsing from a response. 
+ #[error("invalid field `{field}` in duty definition")] + InvalidField { field: &'static str }, +} + +#[derive(Debug, Clone, PartialEq)] +pub struct AttesterDutyDefinition { + pub pubkey: PubKey, + pub v_idx: u64, + pub slot: SlotNumber, + + inner: pluto_eth2api::types::GetAttesterDutiesResponseResponseDatum, +} + +impl TryInto + for pluto_eth2api::types::GetAttesterDutiesResponseResponseDatum +{ + type Error = DutyDefinitionError; + + fn try_into(self) -> Result { + let pubkey = PubKey::try_from(self.pubkey.as_str()) + .map_err(|_| DutyDefinitionError::InvalidField { field: "pubkey" })?; + let v_idx = + self.validator_index + .parse::() + .map_err(|_| DutyDefinitionError::InvalidField { + field: "validator_index", + })?; + let slot = SlotNumber::from( + self.slot + .parse::() + .map_err(|_| DutyDefinitionError::InvalidField { field: "slot" })?, + ); + + Ok(AttesterDutyDefinition { + pubkey, + v_idx, + slot, + inner: self, + }) + } +} + +#[derive(Debug, Clone, PartialEq)] pub enum DutyDefinition { - Attester(), + Attester(AttesterDutyDefinition), Proposer(), SyncCommittee(), } -#[derive(Debug, Clone, PartialEq, Eq, Default)] +#[derive(Debug, Clone, PartialEq, Default)] pub struct DutyDefinitionSet(HashMap); impl Deref for DutyDefinitionSet { From 3f70aaf750f48391a997eae7736466b879ac5156 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Wed, 27 May 2026 17:42:19 -0300 Subject: [PATCH 14/48] Resolve Proposer duties --- crates/core/src/scheduler.rs | 125 ++++++++++++++++++++++++++++++----- crates/core/src/types.rs | 40 ++++++++++- 2 files changed, 146 insertions(+), 19 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index eef982de..4d9fa9a8 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -36,9 +36,9 @@ pub enum SchedulerError { #[error("Invalid epoch")] InvalidEpoch(#[from] std::num::ParseIntError), - /// Invalid attester duty pubkey. 
- #[error("Invalid attester duty pubkey: expected {expected}, got {actual}")] - InvalidAttesterDutyPubkey { + /// Invalid duty pubkey. + #[error("Invalid duty pubkey: expected {expected}, got {actual}")] + InvalidDutyPubkey { /// Expected public key. expected: types::PubKey, /// Actual public key. @@ -262,25 +262,31 @@ impl Scheduler { async fn resolve_duties(&mut self, slot: types::Slot) -> Result<()> { async fn inner(s: &mut Scheduler, slot: types::Slot) -> Result<()> { let vals = resolve_active_validators(slot.epoch(), &s.valcache).await?; - - // TODO: - // activeValsGauge.Set(float64(len(vals))) - if vals.is_empty() { tracing::info!(slot = %slot.slot, "No active validators for slot"); s.storage.lock().await.resolved_epoch = slot.epoch(); return Ok(()); } + let att_duties = { + let mut att_duties = fetch_attester_duties(&slot, &vals, &s.client).await?; + att_duties.sort_by_key(|ad| ad.slot); + att_duties + }; + + let pro_duties = { + let mut pro_duties = fetch_proposer_duties(&slot, &vals, &s.client).await?; + pro_duties.sort_by_key(|pd| pd.slot); + pro_duties + }; + + // TODO: + // activeValsGauge.Set(float64(len(vals))) + + let mut storage = s.storage.lock().await; + // Resolve Attester duties { - let att_duties = { - let mut att_duties = fetch_attester_duties(&slot, &vals, &s.client).await?; - att_duties.sort_by_key(|ad| ad.slot); - att_duties - }; - let mut storage = s.storage.lock().await; - for att_duty in att_duties.into_iter() { if !storage.set_duty_definition( types::Duty::new_attester_duty(att_duty.slot), @@ -301,20 +307,42 @@ impl Scheduler { // Schedule Aggregator duty as well let agg_duty = types::Duty::new_aggregator_duty(att_duty.slot); - if !storage.set_duty_definition( + storage.set_duty_definition( agg_duty, slot.epoch(), att_duty.pubkey, types::DutyDefinition::Attester(att_duty), + ); + } + } + + // Resolve Proposer duties + { + for pro_duty in pro_duties.into_iter() { + if !storage.set_duty_definition( + 
types::Duty::new_proposer_duty(pro_duty.slot), + slot.epoch(), + pro_duty.pubkey, + types::DutyDefinition::Proposer(pro_duty.clone()), ) { continue; } + + tracing::info!( + slot = %pro_duty.slot, + vidx = %pro_duty.v_idx, + pubkey = %pro_duty.pubkey, + epoch = %slot.epoch(), + "Resolved proposer duty" + ); } } - todo!(); + // Resolve Sync Committee duties + { + todo!() + } - let mut storage = s.storage.lock().await; storage.resolved_epoch = slot.epoch(); storage.trim_duties(slot.epoch() - TRIM_EPOCH_OFFSET); @@ -587,7 +615,7 @@ async fn fetch_attester_duties( }; if pubkey != att_duty.pubkey { - return Err(SchedulerError::InvalidAttesterDutyPubkey { + return Err(SchedulerError::InvalidDutyPubkey { expected: pubkey, actual: att_duty.pubkey, }); @@ -607,3 +635,64 @@ async fn fetch_attester_duties( Ok(result) } + +/// Fetches the proposer duties for the given slot and validators, and validates +/// that the returned duties match the expected validators. +async fn fetch_proposer_duties( + slot: &types::Slot, + validators: &Vec, + client: &client::EthBeaconNodeApiClient, +) -> Result> { + let req = pluto_eth2api::GetProposerDutiesRequest::builder() + .epoch(slot.epoch().to_string()) + .build() + .map_err(pluto_eth2api::EthBeaconNodeApiClientError::RequestError)?; + let resp = client + .get_proposer_duties(req) + .await + .map_err(pluto_eth2api::EthBeaconNodeApiClientError::RequestError)?; + + let pro_duties: Vec = match resp { + pluto_eth2api::GetProposerDutiesResponse::Ok(duties) => duties + .data + .into_iter() + .map(|d| { + d.try_into() + .map_err(|_| pluto_eth2api::EthBeaconNodeApiClientError::UnexpectedResponse) + }) + .collect::, _>>(), + _ => Err(pluto_eth2api::EthBeaconNodeApiClientError::UnexpectedResponse), + }?; + + let mut result = vec![]; + for pro_duty in pro_duties.into_iter() { + if pro_duty.slot < slot.slot { + // Skip duties for earlier slots in initial epoch. 
+ continue; + } + + let Some(pubkey) = validators + .iter() + .find(|v| v.v_idx == pro_duty.v_idx) + .map(|v| v.pubkey) + else { + tracing::warn!( + vidx = pro_duty.v_idx, + slot = %slot.slot, + "Ignoring unexpected proposer duty" + ); + continue; + }; + + if pubkey != pro_duty.pubkey { + return Err(SchedulerError::InvalidDutyPubkey { + expected: pubkey, + actual: pro_duty.pubkey, + }); + } + + result.push(pro_duty); + } + + Ok(result) +} diff --git a/crates/core/src/types.rs b/crates/core/src/types.rs index 71ae718c..3159b83c 100644 --- a/crates/core/src/types.rs +++ b/crates/core/src/types.rs @@ -482,10 +482,48 @@ impl TryInto } } +#[derive(Debug, Clone, PartialEq)] +pub struct ProposerDutyDefinition { + pub pubkey: PubKey, + pub v_idx: u64, + pub slot: SlotNumber, + + inner: pluto_eth2api::types::GetProposerDutiesResponseResponseDatum, +} + +impl TryInto + for pluto_eth2api::types::GetProposerDutiesResponseResponseDatum +{ + type Error = DutyDefinitionError; + + fn try_into(self) -> Result { + let pubkey = PubKey::try_from(self.pubkey.as_str()) + .map_err(|_| DutyDefinitionError::InvalidField { field: "pubkey" })?; + let v_idx = + self.validator_index + .parse::() + .map_err(|_| DutyDefinitionError::InvalidField { + field: "validator_index", + })?; + let slot = SlotNumber::from( + self.slot + .parse::() + .map_err(|_| DutyDefinitionError::InvalidField { field: "slot" })?, + ); + + Ok(ProposerDutyDefinition { + pubkey, + v_idx, + slot, + inner: self, + }) + } +} + #[derive(Debug, Clone, PartialEq)] pub enum DutyDefinition { Attester(AttesterDutyDefinition), - Proposer(), + Proposer(ProposerDutyDefinition), SyncCommittee(), } From 624cf495cb10d292acd6c12d17381d6f2d0c0255 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Wed, 27 May 2026 18:34:49 -0300 Subject: [PATCH 15/48] Resolve Sync Committee duties --- crates/core/src/scheduler.rs | 96 +++++++++++++++++++++++++++++++----- crates/core/src/types.rs | 45 ++++++++++++++++- 2 files changed, 127 insertions(+), 
14 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 4d9fa9a8..4a3ef329 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -268,18 +268,6 @@ impl Scheduler { return Ok(()); } - let att_duties = { - let mut att_duties = fetch_attester_duties(&slot, &vals, &s.client).await?; - att_duties.sort_by_key(|ad| ad.slot); - att_duties - }; - - let pro_duties = { - let mut pro_duties = fetch_proposer_duties(&slot, &vals, &s.client).await?; - pro_duties.sort_by_key(|pd| pd.slot); - pro_duties - }; - // TODO: // activeValsGauge.Set(float64(len(vals))) @@ -287,6 +275,7 @@ impl Scheduler { // Resolve Attester duties { + let att_duties = fetch_attester_duties(&slot, &vals, &s.client).await?; for att_duty in att_duties.into_iter() { if !storage.set_duty_definition( types::Duty::new_attester_duty(att_duty.slot), @@ -318,6 +307,7 @@ impl Scheduler { // Resolve Proposer duties { + let pro_duties = fetch_proposer_duties(&slot, &vals, &s.client).await?; for pro_duty in pro_duties.into_iter() { if !storage.set_duty_definition( types::Duty::new_proposer_duty(pro_duty.slot), @@ -340,7 +330,30 @@ impl Scheduler { // Resolve Sync Committee duties { - todo!() + let sync_duties = fetch_sync_committee_duties(&slot, &vals, &s.client).await?; + for sync_duty in sync_duties.into_iter() { + // TODO(charon): sync committee duties start in the slot before the sync + // committee period. 
+ // Refer: https://github.com/ethereum/consensus-specs/blob/dev/specs/altair/validator.md#sync-committee + for sl in slot + .iter() + .take_while(|other| other.epoch() == slot.epoch()) + { + storage.set_duty_definition( + types::Duty::new_sync_contribution_duty(sl.slot), + sl.epoch(), + sync_duty.pubkey, + types::DutyDefinition::SyncCommittee(sync_duty.clone()), + ); + } + + tracing::info!( + vidx = %&sync_duty.validator_index, + pubkey = %sync_duty.pubkey, + epoch = %slot.epoch(), + "Resolved sync committee duty" + ); + } } storage.resolved_epoch = slot.epoch(); @@ -696,3 +709,60 @@ async fn fetch_proposer_duties( Ok(result) } + +/// Fetches the sync committee duties for the given slot and validators, and +/// validates that the returned duties match the expected validators. +async fn fetch_sync_committee_duties( + slot: &types::Slot, + validators: &Vec, + client: &client::EthBeaconNodeApiClient, +) -> Result> { + let req = pluto_eth2api::GetSyncCommitteeDutiesRequest::builder() + .epoch(slot.epoch().to_string()) + .body(validators.iter().map(|v| v.v_idx.to_string()).collect()) + .build() + .map_err(pluto_eth2api::EthBeaconNodeApiClientError::RequestError)?; + let resp = client + .get_sync_committee_duties(req) + .await + .map_err(pluto_eth2api::EthBeaconNodeApiClientError::RequestError)?; + + let sync_duties: Vec = match resp { + pluto_eth2api::GetSyncCommitteeDutiesResponse::Ok(duties) => duties + .data + .into_iter() + .map(|d| { + d.try_into() + .map_err(|_| pluto_eth2api::EthBeaconNodeApiClientError::UnexpectedResponse) + }) + .collect::, _>>(), + _ => Err(pluto_eth2api::EthBeaconNodeApiClientError::UnexpectedResponse), + }?; + + let mut result = vec![]; + for sync_duty in sync_duties.into_iter() { + let Some(pubkey) = validators + .iter() + .find(|v| v.v_idx == sync_duty.validator_index) + .map(|v| v.pubkey) + else { + tracing::warn!( + vidx = sync_duty.validator_index, + slot = %slot.slot, + "Ignoring unexpected sync committee duty" + ); + continue; + }; + 
+ if pubkey != sync_duty.pubkey { + return Err(SchedulerError::InvalidDutyPubkey { + expected: pubkey, + actual: sync_duty.pubkey, + }); + } + + result.push(sync_duty); + } + + Ok(result) +} diff --git a/crates/core/src/types.rs b/crates/core/src/types.rs index 3159b83c..9c7097e1 100644 --- a/crates/core/src/types.rs +++ b/crates/core/src/types.rs @@ -520,11 +520,54 @@ impl TryInto } } +#[derive(Debug, Clone, PartialEq)] +pub struct SyncCommitteeDutyDefinition { + pub pubkey: PubKey, + pub validator_index: u64, + pub validator_sync_committee_indices: Vec, + + inner: pluto_eth2api::types::GetSyncCommitteeDutiesResponseResponseDatum, +} + +impl TryInto + for pluto_eth2api::types::GetSyncCommitteeDutiesResponseResponseDatum +{ + type Error = DutyDefinitionError; + + fn try_into(self) -> Result { + let pubkey = PubKey::try_from(self.pubkey.as_str()) + .map_err(|_| DutyDefinitionError::InvalidField { field: "pubkey" })?; + let validator_index = + self.validator_index + .parse::() + .map_err(|_| DutyDefinitionError::InvalidField { + field: "validator_index", + })?; + let validator_sync_committee_indices = self + .validator_sync_committee_indices + .iter() + .map(|idx| { + idx.parse::() + .map_err(|_| DutyDefinitionError::InvalidField { + field: "validator_sync_committee_indices", + }) + }) + .collect::, DutyDefinitionError>>()?; + + Ok(SyncCommitteeDutyDefinition { + pubkey, + validator_index, + validator_sync_committee_indices, + inner: self, + }) + } +} + #[derive(Debug, Clone, PartialEq)] pub enum DutyDefinition { Attester(AttesterDutyDefinition), Proposer(ProposerDutyDefinition), - SyncCommittee(), + SyncCommittee(SyncCommitteeDutyDefinition), } #[derive(Debug, Clone, PartialEq, Default)] From b83206706e3ea501094fa17afd483e66d45edd20 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Wed, 27 May 2026 18:56:03 -0300 Subject: [PATCH 16/48] Add `get_duty_definition` - TODO's regarding concurrent access while resolving duties --- crates/core/src/scheduler.rs | 56 
++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 4a3ef329..cdabcba2 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -44,6 +44,32 @@ pub enum SchedulerError { /// Actual public key. actual: types::PubKey, }, + + /// Attempted to use the deprecated [`types::DutyType::BuilderProposer`] + /// duty type. + #[error("Deprecated duty DutyType::BuilderProposer")] + DeprecatedDutyBuilderProposer, + + /// Attempted to get a duty definition for an epoch that has already been + /// trimmed. + #[error("Epoch {epoch} has already been trimmed")] + EpochAlreadyTrimmed { + /// Trimmed epoch + epoch: u64, + + /// Duty attempted to be accessed + duty: types::Duty, + }, + + /// Duty definition not found for a resolved epoch. + #[error("Duty {duty} definition set not found in the resolved epoch {epoch}")] + DutyNotFound { + /// The resolved epoch. + epoch: u64, + + /// Duty attempted to be accessed + duty: types::Duty, + }, } type Result = std::result::Result; @@ -206,6 +232,36 @@ impl Scheduler { }); } + /// Returns the definition for a duty if a definition exists for a resolved + /// epoch. + pub async fn get_duty_definition( + &mut self, + duty: types::Duty, + ) -> Result { + if duty.duty_type == types::DutyType::BuilderProposer { + return Err(SchedulerError::DeprecatedDutyBuilderProposer); + } + + let (_, slots_per_epoch) = self.client.fetch_slots_config().await?; + let epoch = duty.slot.inner() / slots_per_epoch; + + // TODO: The `is_resolving_epoch` and similar checks are a code smell. 
+ // Rewrite to an Actor design so that we don't have concurrent access to the + // storage + + let storage = self.storage.lock().await; + if storage.is_epoch_trimmed(epoch) { + return Err(SchedulerError::EpochAlreadyTrimmed { epoch, duty }); + } + + let def_set = storage + .duties + .get(&duty) + .ok_or_else(|| SchedulerError::DutyNotFound { epoch, duty })?; + + Ok(def_set.clone()) + } + async fn schedule_slot(&mut self, slot: types::Slot, ct: CancellationToken) { let resolved_epoch = self.storage.lock().await.resolved_epoch; if resolved_epoch != slot.epoch() { From 0b9bffca034faa2af51219fd7a9925335e5dbeeb Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Wed, 27 May 2026 19:03:16 -0300 Subject: [PATCH 17/48] Add `handle_chain_reorg` --- crates/core/src/scheduler.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index cdabcba2..18996c74 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -4,6 +4,7 @@ use std::{ collections::{HashMap, hash_map::Entry}, ops::Div, time::Duration, + u64, }; use backon::{BackoffBuilder, Retryable}; @@ -262,6 +263,26 @@ impl Scheduler { Ok(def_set.clone()) } + /// In case of a reorg of an already resolved epoch trim all duties. + /// + /// Duties will be resolved again in the next slot.
+ pub async fn handle_chain_reorg(&mut self, epoch: u64) { + // NOTE: The SSE feature check should be done by the caller + let mut storage = self.storage.lock().await; + + let resolved_epoch = storage.resolved_epoch; + if epoch < resolved_epoch { + storage.trim_duties(resolved_epoch); + storage.resolved_epoch = u64::MAX; + + tracing::info!( + reorg_epoch = epoch, + resolved_epoch, + "Chain reorg event handled, duties trimmed" + ) + } + } + async fn schedule_slot(&mut self, slot: types::Slot, ct: CancellationToken) { let resolved_epoch = self.storage.lock().await.resolved_epoch; if resolved_epoch != slot.epoch() { From 5ecb4d18a578b17aae09c3af45451ea68d961427 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Wed, 27 May 2026 19:27:18 -0300 Subject: [PATCH 18/48] Cleanup parsing errors - Try to reuse existing infrastructure --- crates/core/src/scheduler.rs | 8 +-- crates/core/src/types.rs | 104 +++++++++++------------------------ 2 files changed, 34 insertions(+), 78 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 18996c74..722f882d 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -532,11 +532,9 @@ async fn resolve_active_validators( // The activation epoch needs to be checked in cases where this function is // called before the epoch starts. if !val.status.is_active() { - let activation_epoch = val - .validator - .activation_epoch - .parse::() - .map_err(SchedulerError::InvalidEpoch)?; + let activation_epoch = val.validator.activation_epoch.parse::().map_err(|_| { + pluto_eth2api::EthBeaconNodeApiClientError::ParseError("activation_epoch".into()) + })?; if activation_epoch != epoch { continue; diff --git a/crates/core/src/types.rs b/crates/core/src/types.rs index 9c7097e1..7ceb8d16 100644 --- a/crates/core/src/types.rs +++ b/crates/core/src/types.rs @@ -2,13 +2,7 @@ //! Types for the Charon core. 
-use std::{ - any::Any, - collections::HashMap, - fmt::Display, - iter, - ops::{Deref, DerefMut}, -}; +use std::{any::Any, collections::HashMap, fmt::Display, iter}; use chrono::{DateTime, Duration, Utc}; use dyn_clone::DynClone; @@ -433,17 +427,6 @@ impl AsRef<[u8]> for PubKey { } } -// todo: add toEth2Format for the pub key -// https://github.com/ObolNetwork/charon/blob/b3008103c5429b031b63518195f4c49db4e9a68d/core/types.go#L311 - -/// Errors in [`DutyDefinition`]. -#[derive(Debug, thiserror::Error)] -pub enum DutyDefinitionError { - /// Invalid field when parsing from a response. - #[error("invalid field `{field}` in duty definition")] - InvalidField { field: &'static str }, -} - #[derive(Debug, Clone, PartialEq)] pub struct AttesterDutyDefinition { pub pubkey: PubKey, @@ -456,22 +439,18 @@ pub struct AttesterDutyDefinition { impl TryInto for pluto_eth2api::types::GetAttesterDutiesResponseResponseDatum { - type Error = DutyDefinitionError; + type Error = pluto_eth2api::EthBeaconNodeApiClientError; fn try_into(self) -> Result { let pubkey = PubKey::try_from(self.pubkey.as_str()) - .map_err(|_| DutyDefinitionError::InvalidField { field: "pubkey" })?; - let v_idx = - self.validator_index - .parse::() - .map_err(|_| DutyDefinitionError::InvalidField { - field: "validator_index", - })?; - let slot = SlotNumber::from( - self.slot - .parse::() - .map_err(|_| DutyDefinitionError::InvalidField { field: "slot" })?, - ); + .map_err(|_| pluto_eth2api::EthBeaconNodeApiClientError::ParseError("pubkey".into()))?; + let v_idx = self.validator_index.parse::().map_err(|_| { + pluto_eth2api::EthBeaconNodeApiClientError::ParseError("validator_index".into()) + })?; + let slot = + SlotNumber::from(self.slot.parse::().map_err(|_| { + pluto_eth2api::EthBeaconNodeApiClientError::ParseError("slot".into()) + })?); Ok(AttesterDutyDefinition { pubkey, @@ -494,22 +473,18 @@ pub struct ProposerDutyDefinition { impl TryInto for pluto_eth2api::types::GetProposerDutiesResponseResponseDatum { - 
type Error = DutyDefinitionError; + type Error = pluto_eth2api::EthBeaconNodeApiClientError; fn try_into(self) -> Result { let pubkey = PubKey::try_from(self.pubkey.as_str()) - .map_err(|_| DutyDefinitionError::InvalidField { field: "pubkey" })?; - let v_idx = - self.validator_index - .parse::() - .map_err(|_| DutyDefinitionError::InvalidField { - field: "validator_index", - })?; - let slot = SlotNumber::from( - self.slot - .parse::() - .map_err(|_| DutyDefinitionError::InvalidField { field: "slot" })?, - ); + .map_err(|_| pluto_eth2api::EthBeaconNodeApiClientError::ParseError("pubkey".into()))?; + let v_idx = self.validator_index.parse::().map_err(|_| { + pluto_eth2api::EthBeaconNodeApiClientError::ParseError("validator_index".into()) + })?; + let slot = + SlotNumber::from(self.slot.parse::().map_err(|_| { + pluto_eth2api::EthBeaconNodeApiClientError::ParseError("slot".into()) + })?); Ok(ProposerDutyDefinition { pubkey, @@ -532,27 +507,25 @@ pub struct SyncCommitteeDutyDefinition { impl TryInto for pluto_eth2api::types::GetSyncCommitteeDutiesResponseResponseDatum { - type Error = DutyDefinitionError; + type Error = pluto_eth2api::EthBeaconNodeApiClientError; fn try_into(self) -> Result { let pubkey = PubKey::try_from(self.pubkey.as_str()) - .map_err(|_| DutyDefinitionError::InvalidField { field: "pubkey" })?; - let validator_index = - self.validator_index - .parse::() - .map_err(|_| DutyDefinitionError::InvalidField { - field: "validator_index", - })?; + .map_err(|_| pluto_eth2api::EthBeaconNodeApiClientError::ParseError("pubkey".into()))?; + let validator_index = self.validator_index.parse::().map_err(|_| { + pluto_eth2api::EthBeaconNodeApiClientError::ParseError("validator_index".into()) + })?; let validator_sync_committee_indices = self .validator_sync_committee_indices .iter() .map(|idx| { - idx.parse::() - .map_err(|_| DutyDefinitionError::InvalidField { - field: "validator_sync_committee_indices", - }) + idx.parse::().map_err(|_| { + 
pluto_eth2api::EthBeaconNodeApiClientError::ParseError( + "validator_sync_committee_indices".into(), + ) + }) }) - .collect::, DutyDefinitionError>>()?; + .collect::, _>>()?; Ok(SyncCommitteeDutyDefinition { pubkey, @@ -570,22 +543,7 @@ pub enum DutyDefinition { SyncCommittee(SyncCommitteeDutyDefinition), } -#[derive(Debug, Clone, PartialEq, Default)] -pub struct DutyDefinitionSet(HashMap); - -impl Deref for DutyDefinitionSet { - type Target = HashMap; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl DerefMut for DutyDefinitionSet { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} +pub type DutyDefinitionSet = HashMap; /// Unsigned data type #[derive(Debug, Clone, PartialEq, Eq)] From 28d0b85832c6455f9cb31442ad80256192882360 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 16:48:30 -0300 Subject: [PATCH 19/48] Port to Actor model (WIP) --- crates/core/src/scheduler.rs | 357 ++++++++++++++++++++++++++++++++++- 1 file changed, 349 insertions(+), 8 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 722f882d..c1e92e8e 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -1,4 +1,5 @@ #![allow(dead_code, reason = "wip")] +#![allow(missing_docs)] use std::{ collections::{HashMap, hash_map::Entry}, @@ -9,11 +10,348 @@ use std::{ use backon::{BackoffBuilder, Retryable}; use pluto_eth2api::{EthBeaconNodeApiClientError, client}; -use tokio::sync::Mutex; +use tokio::sync::{self, Mutex}; use tokio_util::{future::FutureExt, sync::CancellationToken}; use crate::{types, valcache}; +pub struct Builder { + slot_broadcast: sync::broadcast::Sender, + duty_broadcast: sync::broadcast::Sender<(types::Duty, types::DutyDefinitionSet)>, + reorg_rx: sync::mpsc::Receiver, +} + +impl Builder { + pub fn new() -> Self { + Builder { + slot_broadcast: sync::broadcast::channel(100).0, + duty_broadcast: sync::broadcast::channel(100).0, + reorg_rx: sync::mpsc::channel(100).1, 
// A channel that never receives + } + } + + /// Subscribes a callback function for triggered slots. + pub fn subscribe_slot( + &mut self, + f: impl Fn(&types::Slot) -> Result<()> + Send + 'static, + label: impl AsRef + Send + 'static, + ) { + let mut rx = self.slot_broadcast.subscribe(); + + // TODO: We might want to return a handle so clients can `.abort()` them to drop + // the subscription + tokio::spawn(async move { + while let Ok(slot) = rx.recv().await { + if let Err(err) = f(&slot) { + tracing::error!(err = ?err, slot = %slot.slot, label = label.as_ref(), "Emit scheduled slot event"); + } + } + }); + } + + /// Subscribes a callback function for triggered duties. + pub fn subscribe_duty( + &mut self, + f: impl Fn(&types::Duty, &types::DutyDefinitionSet) -> Result<()> + Send + 'static, + label: impl AsRef + Send + 'static, + ) { + let mut rx = self.duty_broadcast.subscribe(); + + tokio::spawn(async move { + while let Ok((duty, set)) = rx.recv().await { + if let Err(err) = f(&duty, &set) { + tracing::error!(err = ?err, label = label.as_ref(), "Trigger duty subscriber error"); + } + } + }); + } + + pub fn with_chain_reorgs(&mut self, reorg_rx: sync::mpsc::Receiver) { + // NOTE: The SSE feature check should be done by the caller + self.reorg_rx = reorg_rx; + } + + fn build(self) -> Result { + todo!() + } +} + +enum Message { + GetDutyDefinition { + duty: types::Duty, + resp: sync::oneshot::Sender>, + }, +} + +struct Handle { + sender: sync::mpsc::Sender, +} + +impl Handle { + /// Returns the definition for a duty if a definition exists for a resolved + /// epoch. + async fn get_duty_definition(&self, duty: types::Duty) -> Result { + let (tx, rx) = sync::oneshot::channel(); + let msg = Message::GetDutyDefinition { duty, resp: tx }; + + self.sender + .send(msg) + .await + .map_err(|_| SchedulerError::Terminated)?; + + // TODO: In Charon, this call has a default timeout of 100 ms while the epoch is + // being resolved. I don't like that approach. 
+ rx.await.map_err(|_| SchedulerError::Terminated)? + } +} + +struct Actor { + client: client::EthBeaconNodeApiClient, + valcache: valcache::ValidatorCache, + + slot_broadcast: sync::broadcast::Sender, + duty_broadcast: sync::broadcast::Sender<(types::Duty, types::DutyDefinitionSet)>, + + // TODO: Flatten + inner: Inner, +} + +impl Actor { + async fn run( + mut self, + mut slot_rx: sync::mpsc::Receiver, + mut msg_rx: sync::mpsc::Receiver, + mut reorg_rx: sync::mpsc::Receiver, + ct: CancellationToken, + ) { + loop { + tokio::select! { + biased; + + _ = ct.cancelled() => break, + + Some(epoch) = reorg_rx.recv() => { + self.handle_chain_reorg(epoch).await; + }, + + Some(slot) = slot_rx.recv() => { + tracing::debug!(slot = %slot.slot, "Slot ticked"); + + // TODO: + // instrument_slot(slot) + + // NOTE: Ignore send errors, it means that there are no subscribers. + let _ = self.slot_broadcast.send(slot.clone()); + + self.schedule_slot(slot, ct.clone()).await; + }, + + Some(msg) = msg_rx.recv() => match msg { + Message::GetDutyDefinition { duty, resp } => { + let result = self.get_duty_definition(duty).await; + let _ = resp.send(result); + }, + } + } + } + } + + /// Returns the definition for a duty if a definition exists for a resolved + /// epoch. + async fn get_duty_definition(&mut self, duty: types::Duty) -> Result { + if duty.duty_type == types::DutyType::BuilderProposer { + return Err(SchedulerError::DeprecatedDutyBuilderProposer); + } + + let (_, slots_per_epoch) = self.client.fetch_slots_config().await?; + let epoch = duty.slot.inner() / slots_per_epoch; + + // TODO: Cleanup + let storage = &self.inner; + if storage.is_epoch_trimmed(epoch) { + return Err(SchedulerError::EpochAlreadyTrimmed { epoch, duty }); + } + + let def_set = storage + .duties + .get(&duty) + .ok_or_else(|| SchedulerError::DutyNotFound { epoch, duty })?; + + Ok(def_set.clone()) + } + + /// In case of a reorg of an already resolved epoch trim all duties. 
+ /// + /// Duties will be resolved again in the nex slot. + pub async fn handle_chain_reorg(&mut self, epoch: u64) { + let resolved_epoch = self.inner.resolved_epoch; + if epoch < resolved_epoch { + self.inner.trim_duties(resolved_epoch); + self.inner.resolved_epoch = u64::MAX; + + tracing::info!( + reorg_epoch = epoch, + resolved_epoch, + "Chain reorg event handled, duties trimmed" + ) + } + } + + /// TODO: Add docs + async fn schedule_slot(&mut self, slot: types::Slot, ct: CancellationToken) { + if self.inner.resolved_epoch != slot.epoch() { + tracing::debug!(slot = %slot.slot, epoch = %slot.epoch(), "Resolving duties for slot"); + + if let Err(err) = self.resolve_duties(slot.clone()).await { + tracing::warn!(err = ?err, slot = %slot.slot, "Resolving duties error (retrying next slot)"); + } + } + + for duty_type in types::DutyType::all() { + let duty = types::Duty { + duty_type, + slot: slot.slot, + }; + + let def_set = { + let storage = &self.inner; + let Some(def_set) = storage.duties.get(&duty) else { + // Nothing for this duty. + continue; + }; + + def_set.clone() + }; + + let ct = ct.clone(); + let slot = slot.clone(); + let broadcast = self.duty_broadcast.clone(); + tokio::spawn(async move { + if let None = delay_slot_offset(&slot, &duty) + .with_cancellation_token_owned(ct) + .await + { + // Cancelled early + return; + } + + // TODO: + // instrument_duty(duty, def_set); + + // NOTE: Ignore send errors, it means that there are no subscribers. 
+ let _ = broadcast.send((duty.clone(), def_set.clone())); + }); + } + + if slot.last_in_epoch() { + if let Err(err) = self.resolve_duties(slot.next_slot()).await { + tracing::warn!(err = ?err, slot = %slot.slot, "Resolving duties error (retrying next slot)"); + } + } + } + + /// TODO: Add docs + async fn resolve_duties(&mut self, slot: types::Slot) -> Result<()> { + let vals = resolve_active_validators(slot.epoch(), &self.valcache).await?; + if vals.is_empty() { + tracing::info!(slot = %slot.slot, "No active validators for slot"); + self.inner.resolved_epoch = slot.epoch(); + return Ok(()); + } + + // TODO: + // activeValsGauge.Set(float64(len(vals))) + + // Resolve Attester duties + { + let att_duties = fetch_attester_duties(&slot, &vals, &self.client).await?; + for att_duty in att_duties.into_iter() { + if !self.inner.set_duty_definition( + types::Duty::new_attester_duty(att_duty.slot), + slot.epoch(), + att_duty.pubkey, + types::DutyDefinition::Attester(att_duty.clone()), + ) { + continue; + } + + tracing::info!( + slot = %att_duty.slot, + vidx = %att_duty.v_idx, + pubkey = %att_duty.pubkey, + epoch = %slot.epoch(), + "Resolved attester duty" + ); + + // Schedule Aggregator duty as well + let agg_duty = types::Duty::new_aggregator_duty(att_duty.slot); + self.inner.set_duty_definition( + agg_duty, + slot.epoch(), + att_duty.pubkey, + types::DutyDefinition::Attester(att_duty), + ); + } + } + + // Resolve Proposer duties + { + let pro_duties = fetch_proposer_duties(&slot, &vals, &self.client).await?; + for pro_duty in pro_duties.into_iter() { + if !self.inner.set_duty_definition( + types::Duty::new_proposer_duty(pro_duty.slot), + slot.epoch(), + pro_duty.pubkey, + types::DutyDefinition::Proposer(pro_duty.clone()), + ) { + continue; + } + + tracing::info!( + slot = %pro_duty.slot, + vidx = %pro_duty.v_idx, + pubkey = %pro_duty.pubkey, + epoch = %slot.epoch(), + "Resolved proposer duty" + ); + } + } + + // Resolve Sync Committee duties + { + let sync_duties = 
fetch_sync_committee_duties(&slot, &vals, &self.client).await?; + for sync_duty in sync_duties.into_iter() { + // TODO(charon): sync committee duties start in the slot before the sync + // committee period. + // Refer: https://github.com/ethereum/consensus-specs/blob/dev/specs/altair/validator.md#sync-committee + for sl in slot + .iter() + .take_while(|other| other.epoch() == slot.epoch()) + { + self.inner.set_duty_definition( + types::Duty::new_sync_contribution_duty(sl.slot), + sl.epoch(), + sync_duty.pubkey, + types::DutyDefinition::SyncCommittee(sync_duty.clone()), + ); + } + + tracing::info!( + vidx = %&sync_duty.validator_index, + pubkey = %sync_duty.pubkey, + epoch = %slot.epoch(), + "Resolved sync committee duty" + ); + } + } + + self.inner.resolved_epoch = slot.epoch(); + self.inner.trim_duties(slot.epoch() - TRIM_EPOCH_OFFSET); + + Ok(()) + } +} + // Trim cached duties after 3 epochs. Note inclusion delay calculation requires // now-32 slot duties. const TRIM_EPOCH_OFFSET: u64 = 3; @@ -71,6 +409,9 @@ pub enum SchedulerError { /// Duty attempted to be accessed duty: types::Duty, }, + + #[error("Scheduler actor has been terminated")] + Terminated, } type Result = std::result::Result; @@ -79,8 +420,8 @@ struct Scheduler { client: client::EthBeaconNodeApiClient, valcache: valcache::ValidatorCache, - slot_broadcast: tokio::sync::broadcast::Sender, - duty_broadcast: tokio::sync::broadcast::Sender<(types::Duty, types::DutyDefinitionSet)>, + slot_broadcast: sync::broadcast::Sender, + duty_broadcast: sync::broadcast::Sender<(types::Duty, types::DutyDefinitionSet)>, storage: Mutex, } @@ -159,8 +500,8 @@ impl Scheduler { Scheduler { client, valcache, - slot_broadcast: tokio::sync::broadcast::channel(100).0, - duty_broadcast: tokio::sync::broadcast::channel(100).0, + slot_broadcast: sync::broadcast::channel(100).0, + duty_broadcast: sync::broadcast::channel(100).0, storage: Mutex::new(Inner { resolved_epoch: u64::MAX, resolving_epoch: u64::MAX, @@ -198,7 +539,7 @@ 
impl Scheduler { } /// Subscribes a callback function for triggered slots. - /// Note this should be called *before* [`Scheduler::run`]. + /// NOTE: this should be called *before* [`Scheduler::run`]. pub async fn subscribe_slots( &mut self, f: impl Fn(&types::Slot) -> Result<()> + Send + 'static, @@ -456,7 +797,7 @@ impl Scheduler { async fn new_slot_ticker( client: &client::EthBeaconNodeApiClient, ct: CancellationToken, -) -> Result> { +) -> Result> { let genesis_time = client.fetch_genesis_time().await?; let (slot_duration, slots_per_epoch) = client.fetch_slots_config().await?; let slot_duration = chrono::Duration::from_std(slot_duration).unwrap(); @@ -475,7 +816,7 @@ async fn new_slot_ticker( } }; - let (tx, rx) = tokio::sync::mpsc::channel(100); + let (tx, rx) = sync::mpsc::channel(100); tokio::spawn(async move { let mut slot = current_slot(); From 601d8a9d484c21af58b6caef29e21bb7abe23238 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 17:20:27 -0300 Subject: [PATCH 20/48] Inline inner --- crates/core/src/scheduler.rs | 113 ++++++++++++++++++++++++++++------- 1 file changed, 90 insertions(+), 23 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index c1e92e8e..f58cdcbb 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -71,8 +71,35 @@ impl Builder { self.reorg_rx = reorg_rx; } - fn build(self) -> Result { - todo!() + async fn build( + self, + client: client::EthBeaconNodeApiClient, + ct: CancellationToken, + ) -> Result { + wait_chain_start(&client).await?; + wait_beacon_sync(&client).await?; + + let slot_rx = new_slot_ticker(&client.clone(), ct.clone()).await?; + + let actor = Actor { + client: client.clone(), + // TODO: Figure out what to pass as `pub_keys`. 
+ // In Charon, these are not used (dead code) + valcache: valcache::ValidatorCache::new(client.clone(), Vec::new()), + + slot_broadcast: self.slot_broadcast, + duty_broadcast: self.duty_broadcast, + + resolved_epoch: u64::MAX, + duties: HashMap::new(), + duties_by_epoch: HashMap::new(), + }; + + let (msg_tx, msg_rx) = sync::mpsc::channel(100); + let handle = Handle { sender: msg_tx }; + tokio::spawn(actor.run(slot_rx, msg_rx, self.reorg_rx, ct)); + + Ok(handle) } } @@ -112,8 +139,9 @@ struct Actor { slot_broadcast: sync::broadcast::Sender, duty_broadcast: sync::broadcast::Sender<(types::Duty, types::DutyDefinitionSet)>, - // TODO: Flatten - inner: Inner, + resolved_epoch: u64, + duties: HashMap, + duties_by_epoch: HashMap>, } impl Actor { @@ -166,13 +194,11 @@ impl Actor { let (_, slots_per_epoch) = self.client.fetch_slots_config().await?; let epoch = duty.slot.inner() / slots_per_epoch; - // TODO: Cleanup - let storage = &self.inner; - if storage.is_epoch_trimmed(epoch) { + if self.is_epoch_trimmed(epoch) { return Err(SchedulerError::EpochAlreadyTrimmed { epoch, duty }); } - let def_set = storage + let def_set = self .duties .get(&duty) .ok_or_else(|| SchedulerError::DutyNotFound { epoch, duty })?; @@ -184,10 +210,10 @@ impl Actor { /// /// Duties will be resolved again in the nex slot. 
pub async fn handle_chain_reorg(&mut self, epoch: u64) { - let resolved_epoch = self.inner.resolved_epoch; + let resolved_epoch = self.resolved_epoch; if epoch < resolved_epoch { - self.inner.trim_duties(resolved_epoch); - self.inner.resolved_epoch = u64::MAX; + self.trim_duties(resolved_epoch); + self.resolved_epoch = u64::MAX; tracing::info!( reorg_epoch = epoch, @@ -197,9 +223,8 @@ impl Actor { } } - /// TODO: Add docs async fn schedule_slot(&mut self, slot: types::Slot, ct: CancellationToken) { - if self.inner.resolved_epoch != slot.epoch() { + if self.resolved_epoch != slot.epoch() { tracing::debug!(slot = %slot.slot, epoch = %slot.epoch(), "Resolving duties for slot"); if let Err(err) = self.resolve_duties(slot.clone()).await { @@ -214,8 +239,7 @@ impl Actor { }; let def_set = { - let storage = &self.inner; - let Some(def_set) = storage.duties.get(&duty) else { + let Some(def_set) = self.duties.get(&duty) else { // Nothing for this duty. continue; }; @@ -250,12 +274,11 @@ impl Actor { } } - /// TODO: Add docs async fn resolve_duties(&mut self, slot: types::Slot) -> Result<()> { let vals = resolve_active_validators(slot.epoch(), &self.valcache).await?; if vals.is_empty() { tracing::info!(slot = %slot.slot, "No active validators for slot"); - self.inner.resolved_epoch = slot.epoch(); + self.resolved_epoch = slot.epoch(); return Ok(()); } @@ -266,7 +289,7 @@ impl Actor { { let att_duties = fetch_attester_duties(&slot, &vals, &self.client).await?; for att_duty in att_duties.into_iter() { - if !self.inner.set_duty_definition( + if !self.set_duty_definition( types::Duty::new_attester_duty(att_duty.slot), slot.epoch(), att_duty.pubkey, @@ -285,7 +308,7 @@ impl Actor { // Schedule Aggregator duty as well let agg_duty = types::Duty::new_aggregator_duty(att_duty.slot); - self.inner.set_duty_definition( + self.set_duty_definition( agg_duty, slot.epoch(), att_duty.pubkey, @@ -298,7 +321,7 @@ impl Actor { { let pro_duties = fetch_proposer_duties(&slot, &vals, 
&self.client).await?; for pro_duty in pro_duties.into_iter() { - if !self.inner.set_duty_definition( + if !self.set_duty_definition( types::Duty::new_proposer_duty(pro_duty.slot), slot.epoch(), pro_duty.pubkey, @@ -328,7 +351,7 @@ impl Actor { .iter() .take_while(|other| other.epoch() == slot.epoch()) { - self.inner.set_duty_definition( + self.set_duty_definition( types::Duty::new_sync_contribution_duty(sl.slot), sl.epoch(), sync_duty.pubkey, @@ -345,11 +368,55 @@ impl Actor { } } - self.inner.resolved_epoch = slot.epoch(); - self.inner.trim_duties(slot.epoch() - TRIM_EPOCH_OFFSET); + self.resolved_epoch = slot.epoch(); + self.trim_duties(slot.epoch() - TRIM_EPOCH_OFFSET); Ok(()) } + + /// Inserts a duty definition for a given pubkey. + /// + /// Returns true if it's set, false if it was already set. + fn set_duty_definition( + &mut self, + duty: types::Duty, + epoch: u64, + pub_key: types::PubKey, + definition: types::DutyDefinition, + ) -> bool { + let def_set = self.duties.entry(duty.clone()).or_default(); + match def_set.entry(pub_key) { + Entry::Occupied(_) => return false, + Entry::Vacant(entry) => { + entry.insert(definition); + } + }; + self.duties_by_epoch + .entry(epoch) + .or_insert(Vec::new()) + .push(duty); + + true + } + + fn trim_duties(&mut self, epoch: u64) { + let duties = self.duties_by_epoch.remove(&epoch); + if let Some(duties) = duties + && duties.len() > 0 + { + for duty in duties { + self.duties.remove(&duty); + } + } + } + + fn is_epoch_trimmed(&self, epoch: u64) -> bool { + if self.resolved_epoch == u64::MAX { + return false; + } + + epoch >= self.resolved_epoch + TRIM_EPOCH_OFFSET + } } // Trim cached duties after 3 epochs. 
Note inclusion delay calculation requires From 6117781e1b327e3a267653527e9bc82544a35aba Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 17:35:03 -0300 Subject: [PATCH 21/48] Remove old Scheduler code - Preserve only Actor API --- crates/core/src/scheduler.rs | 529 +++++++---------------------------- 1 file changed, 95 insertions(+), 434 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index f58cdcbb..e7f6a239 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -1,4 +1,3 @@ -#![allow(dead_code, reason = "wip")] #![allow(missing_docs)] use std::{ @@ -10,11 +9,76 @@ use std::{ use backon::{BackoffBuilder, Retryable}; use pluto_eth2api::{EthBeaconNodeApiClientError, client}; -use tokio::sync::{self, Mutex}; +use tokio::sync; use tokio_util::{future::FutureExt, sync::CancellationToken}; use crate::{types, valcache}; +// Trim cached duties after 3 epochs. Note inclusion delay calculation requires +// now-32 slot duties. +const TRIM_EPOCH_OFFSET: u64 = 3; + +/// Errors that can occur during the scheduling process. +#[derive(Debug, thiserror::Error)] +pub enum SchedulerError { + /// Beacon Node API client error. + #[error("Error while fetching data from the Eth2 API: {0}")] + EthBeaconNodeApiClientError(#[from] EthBeaconNodeApiClientError), + + /// Validator cache error. + #[error("Error while accessing the validator cache: {0}")] + ValidatorCacheError(#[from] valcache::ValidatorCacheError), + + /// Public key error. + #[error("Error while processing public key: {0}")] + PubKeyError(#[from] types::PubKeyError), + + /// Invalid duty pubkey. + #[error("Invalid duty pubkey: expected {expected}, got {actual}")] + InvalidDutyPubkey { + /// Expected public key. + expected: types::PubKey, + /// Actual public key. + actual: types::PubKey, + }, + + /// Attempted to use the deprecated [`types::DutyType::BuilderProposer`] + /// duty type. 
+ #[error("Deprecated duty DutyType::BuilderProposer")] + DeprecatedDutyBuilderProposer, + + /// Attempted to get a duty definition for an epoch that has already been + /// trimmed. + #[error("Epoch {epoch} has already been trimmed")] + EpochAlreadyTrimmed { + /// Trimmed epoch + epoch: u64, + + /// Duty attempted to be accessed + duty: types::Duty, + }, + + /// Attempted to get a duty definition for an epoch that has not been + /// resolved yet. + #[error("Epoch {epoch} has not been resolved yet")] + EpochNotResolved { epoch: u64 }, + + /// Duty definition not found for a resolved epoch. + #[error("Duty {duty} definition set not found in the resolved epoch {epoch}")] + DutyNotFound { + /// The resolved epoch. + epoch: u64, + + /// Duty attempted to be accessed + duty: types::Duty, + }, + + #[error("Scheduler actor has been terminated")] + Terminated, +} + +type Result = std::result::Result; + pub struct Builder { slot_broadcast: sync::broadcast::Sender, duty_broadcast: sync::broadcast::Sender<(types::Duty, types::DutyDefinitionSet)>, @@ -22,6 +86,7 @@ pub struct Builder { } impl Builder { + /// Construct a default [`Builder`] with no chain reorg handling. pub fn new() -> Self { Builder { slot_broadcast: sync::broadcast::channel(100).0, @@ -66,12 +131,23 @@ impl Builder { }); } + /// Add a source of chain reorgs to the scheduler. + /// + /// Disabled by default. pub fn with_chain_reorgs(&mut self, reorg_rx: sync::mpsc::Receiver) { // NOTE: The SSE feature check should be done by the caller self.reorg_rx = reorg_rx; } - async fn build( + /// Construct a new Scheduler which runs in the background. This operation + /// will block until the chain has started and the beacon node is synced. + /// + /// Listeners for duties and slots should be registered before calling this + /// function. + /// + /// The returned [`Handle`] can be used to query the scheduler for duty + /// definitions. 
+ pub async fn build( self, client: client::EthBeaconNodeApiClient, ct: CancellationToken, @@ -110,14 +186,14 @@ enum Message { }, } -struct Handle { +pub struct Handle { sender: sync::mpsc::Sender, } impl Handle { /// Returns the definition for a duty if a definition exists for a resolved /// epoch. - async fn get_duty_definition(&self, duty: types::Duty) -> Result { + pub async fn get_duty_definition(&self, duty: types::Duty) -> Result { let (tx, rx) = sync::oneshot::channel(); let msg = Message::GetDutyDefinition { duty, resp: tx }; @@ -198,6 +274,10 @@ impl Actor { return Err(SchedulerError::EpochAlreadyTrimmed { epoch, duty }); } + if !self.is_epoch_resolved(epoch) { + return Err(SchedulerError::EpochNotResolved { epoch }); + } + let def_set = self .duties .get(&duty) @@ -223,6 +303,8 @@ impl Actor { } } + /// Resolves upcoming duties and triggers resolved duties for the given + /// slot. async fn schedule_slot(&mut self, slot: types::Slot, ct: CancellationToken) { if self.resolved_epoch != slot.epoch() { tracing::debug!(slot = %slot.slot, epoch = %slot.epoch(), "Resolving duties for slot"); @@ -274,7 +356,12 @@ impl Actor { } } + /// Resolves the duties for the slot's epoch, storing the results. async fn resolve_duties(&mut self, slot: types::Slot) -> Result<()> { + // NOTE: Resolving duties requires fetching data from a Beacon node. + // During this time the Scheduler actor is blocked. + // This is the same behavior as in Charon, but it might not be desirable. + let vals = resolve_active_validators(slot.epoch(), &self.valcache).await?; if vals.is_empty() { tracing::info!(slot = %slot.slot, "No active validators for slot"); @@ -399,6 +486,7 @@ impl Actor { true } + /// Deletes all duties for the given epoch. 
fn trim_duties(&mut self, epoch: u64) { let duties = self.duties_by_epoch.remove(&epoch); if let Some(duties) = duties @@ -410,6 +498,7 @@ impl Actor { } } + /// Returns true if the epoch's duties have been trimmed fn is_epoch_trimmed(&self, epoch: u64) -> bool { if self.resolved_epoch == u64::MAX { return false; @@ -417,98 +506,8 @@ impl Actor { epoch >= self.resolved_epoch + TRIM_EPOCH_OFFSET } -} - -// Trim cached duties after 3 epochs. Note inclusion delay calculation requires -// now-32 slot duties. -const TRIM_EPOCH_OFFSET: u64 = 3; - -/// Errors that can occur during the scheduling process. -#[derive(Debug, thiserror::Error)] -pub enum SchedulerError { - /// Beacon Node API client error. - #[error("Error while fetching data from the Eth2 API: {0}")] - EthBeaconNodeApiClientError(#[from] EthBeaconNodeApiClientError), - - /// Validator cache error. - #[error("Error while accessing the validator cache: {0}")] - ValidatorCacheError(#[from] valcache::ValidatorCacheError), - - /// Public key error. - #[error("Error while processing public key: {0}")] - PubKeyError(#[from] types::PubKeyError), - - /// Invalid epoch error. - #[error("Invalid epoch")] - InvalidEpoch(#[from] std::num::ParseIntError), - - /// Invalid duty pubkey. - #[error("Invalid duty pubkey: expected {expected}, got {actual}")] - InvalidDutyPubkey { - /// Expected public key. - expected: types::PubKey, - /// Actual public key. - actual: types::PubKey, - }, - - /// Attempted to use the deprecated [`types::DutyType::BuilderProposer`] - /// duty type. - #[error("Deprecated duty DutyType::BuilderProposer")] - DeprecatedDutyBuilderProposer, - - /// Attempted to get a duty definition for an epoch that has already been - /// trimmed. - #[error("Epoch {epoch} has already been trimmed")] - EpochAlreadyTrimmed { - /// Trimmed epoch - epoch: u64, - - /// Duty attempted to be accessed - duty: types::Duty, - }, - - /// Duty definition not found for a resolved epoch. 
- #[error("Duty {duty} definition set not found in the resolved epoch {epoch}")] - DutyNotFound { - /// The resolved epoch. - epoch: u64, - - /// Duty attempted to be accessed - duty: types::Duty, - }, - - #[error("Scheduler actor has been terminated")] - Terminated, -} - -type Result = std::result::Result; - -struct Scheduler { - client: client::EthBeaconNodeApiClient, - valcache: valcache::ValidatorCache, - - slot_broadcast: sync::broadcast::Sender, - duty_broadcast: sync::broadcast::Sender<(types::Duty, types::DutyDefinitionSet)>, - - storage: Mutex, -} - -struct Inner { - resolved_epoch: u64, - resolving_epoch: u64, - duties: HashMap, - duties_by_epoch: HashMap>, -} - -impl Inner { - fn is_resolving_epoch(&self, epoch: u64) -> bool { - if self.resolving_epoch == u64::MAX { - return false; - } - - self.resolving_epoch == epoch - } + /// Returns true if the epoch is resolved fn is_epoch_resolved(&self, epoch: u64) -> bool { if self.resolved_epoch == u64::MAX { return false; @@ -516,344 +515,6 @@ impl Inner { self.resolved_epoch >= epoch } - - fn is_epoch_trimmed(&self, epoch: u64) -> bool { - if self.resolved_epoch == u64::MAX { - return false; - } - - epoch >= self.resolved_epoch + TRIM_EPOCH_OFFSET - } - - fn trim_duties(&mut self, epoch: u64) { - let duties = self.duties_by_epoch.remove(&epoch); - if let Some(duties) = duties - && duties.len() > 0 - { - for duty in duties { - self.duties.remove(&duty); - } - } - } - - /// Inserts a duty definition for a given pubkey. - /// - /// Returns true if it's set, false if it was already set. 
- fn set_duty_definition( - &mut self, - duty: types::Duty, - epoch: u64, - pub_key: types::PubKey, - definition: types::DutyDefinition, - ) -> bool { - let def_set = self.duties.entry(duty.clone()).or_default(); - match def_set.entry(pub_key) { - Entry::Occupied(_) => return false, - Entry::Vacant(entry) => { - entry.insert(definition); - } - }; - self.duties_by_epoch - .entry(epoch) - .or_insert(Vec::new()) - .push(duty); - - true - } -} - -impl Scheduler { - pub fn new(client: client::EthBeaconNodeApiClient, valcache: valcache::ValidatorCache) -> Self { - Scheduler { - client, - valcache, - slot_broadcast: sync::broadcast::channel(100).0, - duty_broadcast: sync::broadcast::channel(100).0, - storage: Mutex::new(Inner { - resolved_epoch: u64::MAX, - resolving_epoch: u64::MAX, - duties: HashMap::new(), - duties_by_epoch: HashMap::new(), - }), - } - } - - pub async fn run(&mut self, ct: CancellationToken) -> Result<()> { - wait_chain_start(&self.client).await?; - wait_beacon_sync(&self.client).await?; - - let mut slot_ticker = new_slot_ticker(&self.client, ct.clone()).await?; - - loop { - tokio::select! { - _ = ct.cancelled() => break, - - Some(slot) = slot_ticker.recv() => { - tracing::debug!(slot = %slot.slot, "Slot ticked"); - - // TODO: - // instrument_slot(slot) - - // NOTE: Ignore send errors, it means that there are no subscribers. - let _ = self.slot_broadcast.send(slot.clone()); - - self.schedule_slot(slot, ct.clone()).await; - }, - } - } - - Ok(()) - } - - /// Subscribes a callback function for triggered slots. - /// NOTE: this should be called *before* [`Scheduler::run`]. 
- pub async fn subscribe_slots( - &mut self, - f: impl Fn(&types::Slot) -> Result<()> + Send + 'static, - label: impl AsRef + Send + 'static, - ) { - let mut rx = self.slot_broadcast.subscribe(); - - tokio::spawn(async move { - while let Ok(slot) = rx.recv().await { - if let Err(err) = f(&slot) { - tracing::error!(err = ?err, slot = %slot.slot, label = label.as_ref(), "Emit scheduled slot event"); - } - } - }); - } - - /// Subscribes a callback function for triggered duties. - /// NOTE: this should be called *before* [`Scheduler::run`]. - pub async fn subscribe_duties( - &mut self, - f: impl Fn(&types::Duty, &types::DutyDefinitionSet) -> Result<()> + Send + 'static, - label: impl AsRef + Send + 'static, - ) { - let mut rx = self.duty_broadcast.subscribe(); - - tokio::spawn(async move { - while let Ok((duty, set)) = rx.recv().await { - if let Err(err) = f(&duty, &set) { - tracing::error!(err = ?err, label = label.as_ref(), "Trigger duty subscriber error"); - } - } - }); - } - - /// Returns the definition for a duty if a definition exists for a resolved - /// epoch. - pub async fn get_duty_definition( - &mut self, - duty: types::Duty, - ) -> Result { - if duty.duty_type == types::DutyType::BuilderProposer { - return Err(SchedulerError::DeprecatedDutyBuilderProposer); - } - - let (_, slots_per_epoch) = self.client.fetch_slots_config().await?; - let epoch = duty.slot.inner() / slots_per_epoch; - - // TODO: The `is_resolving_epoch` and similar checks are a code smell. - // Rewrite to an Actor design so that we don't have concurrent access to the - // storage - - let storage = self.storage.lock().await; - if storage.is_epoch_trimmed(epoch) { - return Err(SchedulerError::EpochAlreadyTrimmed { epoch, duty }); - } - - let def_set = storage - .duties - .get(&duty) - .ok_or_else(|| SchedulerError::DutyNotFound { epoch, duty })?; - - Ok(def_set.clone()) - } - - /// In case of a reorg of an already resolved epoch trim all duties. 
- /// - /// Duties will be resolved again in the nex slot. - pub async fn handle_chain_reorg(&mut self, epoch: u64) { - // NOTE: The SSE feature check should be done by the caller - let mut storage = self.storage.lock().await; - - let resolved_epoch = storage.resolved_epoch; - if epoch < resolved_epoch { - storage.trim_duties(resolved_epoch); - storage.resolved_epoch = u64::MAX; - - tracing::info!( - reorg_epoch = epoch, - resolved_epoch, - "Chain reorg event handled, duties trimmed" - ) - } - } - - async fn schedule_slot(&mut self, slot: types::Slot, ct: CancellationToken) { - let resolved_epoch = self.storage.lock().await.resolved_epoch; - if resolved_epoch != slot.epoch() { - tracing::debug!(slot = %slot.slot, epoch = %slot.epoch(), "Resolving duties for slot"); - - if let Err(err) = self.resolve_duties(slot.clone()).await { - tracing::warn!(err = ?err, slot = %slot.slot, "Resolving duties error (retrying next slot)"); - } - } - - for duty_type in types::DutyType::all() { - let duty = types::Duty { - duty_type, - slot: slot.slot, - }; - - let def_set = { - let storage = self.storage.lock().await; - let Some(def_set) = storage.duties.get(&duty) else { - // Nothing for this duty. - continue; - }; - - def_set.clone() - }; - - let ct = ct.clone(); - let slot = slot.clone(); - let broadcast = self.duty_broadcast.clone(); - tokio::spawn(async move { - if let None = delay_slot_offset(&slot, &duty) - .with_cancellation_token_owned(ct) - .await - { - // Cancelled early - return; - } - - // TODO: - // instrument_duty(duty, def_set); - - // NOTE: Ignore send errors, it means that there are no subscribers. 
- let _ = broadcast.send((duty.clone(), def_set.clone())); - }); - } - - if slot.last_in_epoch() { - if let Err(err) = self.resolve_duties(slot.next_slot()).await { - tracing::warn!(err = ?err, slot = %slot.slot, "Resolving duties error (retrying next slot)"); - } - } - } - - async fn resolve_duties(&mut self, slot: types::Slot) -> Result<()> { - async fn inner(s: &mut Scheduler, slot: types::Slot) -> Result<()> { - let vals = resolve_active_validators(slot.epoch(), &s.valcache).await?; - if vals.is_empty() { - tracing::info!(slot = %slot.slot, "No active validators for slot"); - s.storage.lock().await.resolved_epoch = slot.epoch(); - return Ok(()); - } - - // TODO: - // activeValsGauge.Set(float64(len(vals))) - - let mut storage = s.storage.lock().await; - - // Resolve Attester duties - { - let att_duties = fetch_attester_duties(&slot, &vals, &s.client).await?; - for att_duty in att_duties.into_iter() { - if !storage.set_duty_definition( - types::Duty::new_attester_duty(att_duty.slot), - slot.epoch(), - att_duty.pubkey, - types::DutyDefinition::Attester(att_duty.clone()), - ) { - continue; - } - - tracing::info!( - slot = %att_duty.slot, - vidx = %att_duty.v_idx, - pubkey = %att_duty.pubkey, - epoch = %slot.epoch(), - "Resolved attester duty" - ); - - // Schedule Aggregator duty as well - let agg_duty = types::Duty::new_aggregator_duty(att_duty.slot); - storage.set_duty_definition( - agg_duty, - slot.epoch(), - att_duty.pubkey, - types::DutyDefinition::Attester(att_duty), - ); - } - } - - // Resolve Proposer duties - { - let pro_duties = fetch_proposer_duties(&slot, &vals, &s.client).await?; - for pro_duty in pro_duties.into_iter() { - if !storage.set_duty_definition( - types::Duty::new_proposer_duty(pro_duty.slot), - slot.epoch(), - pro_duty.pubkey, - types::DutyDefinition::Proposer(pro_duty.clone()), - ) { - continue; - } - - tracing::info!( - slot = %pro_duty.slot, - vidx = %pro_duty.v_idx, - pubkey = %pro_duty.pubkey, - epoch = %slot.epoch(), - "Resolved 
proposer duty" - ); - } - } - - // Resolve Sync Committee duties - { - let sync_duties = fetch_sync_committee_duties(&slot, &vals, &s.client).await?; - for sync_duty in sync_duties.into_iter() { - // TODO(charon): sync committee duties start in the slot before the sync - // committee period. - // Refer: https://github.com/ethereum/consensus-specs/blob/dev/specs/altair/validator.md#sync-committee - for sl in slot - .iter() - .take_while(|other| other.epoch() == slot.epoch()) - { - storage.set_duty_definition( - types::Duty::new_sync_contribution_duty(sl.slot), - sl.epoch(), - sync_duty.pubkey, - types::DutyDefinition::SyncCommittee(sync_duty.clone()), - ); - } - - tracing::info!( - vidx = %&sync_duty.validator_index, - pubkey = %sync_duty.pubkey, - epoch = %slot.epoch(), - "Resolved sync committee duty" - ); - } - } - - storage.resolved_epoch = slot.epoch(); - storage.trim_duties(slot.epoch() - TRIM_EPOCH_OFFSET); - - Ok(()) - } - - // TODO: Improve the poor-man's `defer` - self.storage.lock().await.resolving_epoch = slot.epoch(); - let res = inner(self, slot).await; - self.storage.lock().await.resolving_epoch = u64::MAX; - - res - } } /// Create a read channel that will be populated with new slots in real time. 
From 6f45227664b57c5180887e8a7848fbe2e4e0305d Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 17:42:48 -0300 Subject: [PATCH 22/48] Fix epoch trimmed logic --- crates/core/src/scheduler.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index e7f6a239..8f6941c0 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -504,7 +504,7 @@ impl Actor { return false; } - epoch >= self.resolved_epoch + TRIM_EPOCH_OFFSET + self.resolved_epoch >= epoch + TRIM_EPOCH_OFFSET } /// Returns true if the epoch is resolved From 012522f32e36f5812552e42c2f12548bfaa42f50 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 17:43:41 -0300 Subject: [PATCH 23/48] Use `saturating_sub` to prevent underflow --- crates/core/src/scheduler.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 8f6941c0..d22ef271 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -456,7 +456,7 @@ impl Actor { } self.resolved_epoch = slot.epoch(); - self.trim_duties(slot.epoch() - TRIM_EPOCH_OFFSET); + self.trim_duties(slot.epoch().saturating_sub(TRIM_EPOCH_OFFSET)); Ok(()) } From 8a6007aa35b9465650ce9653648fb4396b24e69c Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 17:44:32 -0300 Subject: [PATCH 24/48] Check for resolved then trimmed --- crates/core/src/scheduler.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index d22ef271..2bc7d015 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -270,14 +270,14 @@ impl Actor { let (_, slots_per_epoch) = self.client.fetch_slots_config().await?; let epoch = duty.slot.inner() / slots_per_epoch; - if self.is_epoch_trimmed(epoch) { - return Err(SchedulerError::EpochAlreadyTrimmed { epoch, duty 
}); - } - if !self.is_epoch_resolved(epoch) { return Err(SchedulerError::EpochNotResolved { epoch }); } + if self.is_epoch_trimmed(epoch) { + return Err(SchedulerError::EpochAlreadyTrimmed { epoch, duty }); + } + let def_set = self .duties .get(&duty) From a2e8f68232e9e8b4dd7b863f77891d7f551e3aff Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 17:51:32 -0300 Subject: [PATCH 25/48] Allow early shutdown when waiting for start --- crates/core/src/scheduler.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 2bc7d015..2ea0ca79 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -152,8 +152,14 @@ impl Builder { client: client::EthBeaconNodeApiClient, ct: CancellationToken, ) -> Result { - wait_chain_start(&client).await?; - wait_beacon_sync(&client).await?; + wait_chain_start(&client) + .with_cancellation_token(&ct) + .await + .ok_or(SchedulerError::Terminated)??; + wait_beacon_sync(&client) + .with_cancellation_token(&ct) + .await + .ok_or(SchedulerError::Terminated)??; let slot_rx = new_slot_ticker(&client.clone(), ct.clone()).await?; From 759628c9c38d0fce82c3be588903efbfb24ee903 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 17:55:20 -0300 Subject: [PATCH 26/48] Correctly wait until the slot offset --- crates/core/src/scheduler.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 2ea0ca79..5293577f 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -2,7 +2,6 @@ use std::{ collections::{HashMap, hash_map::Entry}, - ops::Div, time::Duration, u64, }; @@ -711,14 +710,17 @@ async fn wait_beacon_sync(client: &pluto_eth2api::client::EthBeaconNodeApiClient /// Blocks until the slot offset for the duty has been reached. 
async fn delay_slot_offset(slot: &types::Slot, duty: &types::Duty) { - let to_sleep = match duty.duty_type { - types::DutyType::Attester => slot.slot_duration.div(3) * 1, - types::DutyType::Aggregator => slot.slot_duration.div(3) * 2, - types::DutyType::SyncContribution => slot.slot_duration.div(3) * 2, + let offset = match duty.duty_type { + types::DutyType::Attester => slot.slot_duration / 3, + types::DutyType::Aggregator => slot.slot_duration * 2 / 3, + types::DutyType::SyncContribution => slot.slot_duration * 2 / 3, _ => return, }; - tokio::time::sleep(to_sleep.to_std().unwrap_or_default()).await; + // Wait until the absolute deadline + let deadline = slot.time + offset; + let wait = (deadline - chrono::Utc::now()).to_std().unwrap_or_default(); + tokio::time::sleep(wait).await; } /// Fetches the attester duties for the given slot and validators, and validates From f4762670288f1383099b9755240ba6f970805df3 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 18:03:28 -0300 Subject: [PATCH 27/48] Add guard for lagging subscribers --- crates/core/src/scheduler.rs | 41 ++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 5293577f..48593363 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -105,9 +105,24 @@ impl Builder { // TODO: We might want to return a handle so clients can `.abort()` them to drop // the subscription tokio::spawn(async move { - while let Ok(slot) = rx.recv().await { - if let Err(err) = f(&slot) { - tracing::error!(err = ?err, slot = %slot.slot, label = label.as_ref(), "Emit scheduled slot event"); + loop { + match rx.recv().await { + Ok(slot) => { + if let Err(err) = f(&slot) { + tracing::error!(err = ?err, slot = %slot.slot, label = label.as_ref(), "Emit scheduled slot event"); + } + } + // NOTE: A lagging subscriber requires further analysis. 
+ // Log the error and terminate the subscription. + Err(sync::broadcast::error::RecvError::Lagged(skipped)) => { + tracing::error!( + skipped, + label = label.as_ref(), + "Emit scheduled slot subscriber lagged" + ); + break; + } + Err(sync::broadcast::error::RecvError::Closed) => break, } } }); @@ -122,9 +137,23 @@ impl Builder { let mut rx = self.duty_broadcast.subscribe(); tokio::spawn(async move { - while let Ok((duty, set)) = rx.recv().await { - if let Err(err) = f(&duty, &set) { - tracing::error!(err = ?err, label = label.as_ref(), "Trigger duty subscriber error"); + loop { + match rx.recv().await { + Ok((duty, set)) => { + if let Err(err) = f(&duty, &set) { + tracing::error!(err = ?err, label = label.as_ref(), "Trigger duty subscriber error"); + } + } + // NOTE: Same as in `subscribe_slot`, a lagging subscriber requires further analysis. + Err(sync::broadcast::error::RecvError::Lagged(skipped)) => { + tracing::error!( + skipped, + label = label.as_ref(), + "Trigger duty subscriber lagged" + ); + break; + } + Err(sync::broadcast::error::RecvError::Closed) => break, } } }); From 76510f34ec6d0cca68faffdb163a0a528235efc0 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 18:54:34 -0300 Subject: [PATCH 28/48] Add metrics support --- crates/core/src/scheduler.rs | 43 +++++-- crates/core/src/scheduler/metrics.rs | 35 ++++++ scheduler-port-plan.md | 163 +++++++++++++++++++++++++++ 3 files changed, 229 insertions(+), 12 deletions(-) create mode 100644 crates/core/src/scheduler/metrics.rs create mode 100644 scheduler-port-plan.md diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 48593363..c9913026 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -11,7 +11,9 @@ use pluto_eth2api::{EthBeaconNodeApiClientError, client}; use tokio::sync; use tokio_util::{future::FutureExt, sync::CancellationToken}; -use crate::{types, valcache}; +use crate::{scheduler::metrics::SCHEDULER_METRICS, types, 
valcache}; + +mod metrics; // Trim cached duties after 3 epochs. Note inclusion delay calculation requires // now-32 slot duties. @@ -144,7 +146,7 @@ impl Builder { tracing::error!(err = ?err, label = label.as_ref(), "Trigger duty subscriber error"); } } - // NOTE: Same as in `subscribe_slot`, a lagging subscriber requires further analysis. + // NOTE: Same as in `subscribe_slot` Err(sync::broadcast::error::RecvError::Lagged(skipped)) => { tracing::error!( skipped, @@ -275,8 +277,8 @@ impl Actor { Some(slot) = slot_rx.recv() => { tracing::debug!(slot = %slot.slot, "Slot ticked"); - // TODO: - // instrument_slot(slot) + SCHEDULER_METRICS.current_slot.set(slot.slot.inner()); + SCHEDULER_METRICS.current_epoch.set(slot.epoch()); // NOTE: Ignore send errors, it means that there are no subscribers. let _ = self.slot_broadcast.send(slot.clone()); @@ -375,8 +377,8 @@ impl Actor { return; } - // TODO: - // instrument_duty(duty, def_set); + SCHEDULER_METRICS.duty_total[&duty.duty_type.to_string()] + .inc_by(def_set.len() as u64); // NOTE: Ignore send errors, it means that there are no subscribers. let _ = broadcast.send((duty.clone(), def_set.clone())); @@ -397,15 +399,15 @@ impl Actor { // This is the same behavior as in Charon, but it might not be desirable. 
let vals = resolve_active_validators(slot.epoch(), &self.valcache).await?; + + SCHEDULER_METRICS.validators_active.set(vals.len() as u64); + if vals.is_empty() { tracing::info!(slot = %slot.slot, "No active validators for slot"); self.resolved_epoch = slot.epoch(); return Ok(()); } - // TODO: - // activeValsGauge.Set(float64(len(vals))) - // Resolve Attester duties { let att_duties = fetch_attester_duties(&slot, &vals, &self.client).await?; @@ -593,7 +595,7 @@ async fn new_slot_ticker( if chrono::Utc::now() > slot.next_slot().time { let actual = current_slot(); tracing::warn!(actual_slot = %actual.slot, expect_slot = %slot.slot, "Slot(s) skipped"); - // skipCounter.inc() + SCHEDULER_METRICS.skipped_slots_total.inc(); slot = actual; } @@ -628,8 +630,25 @@ async fn resolve_active_validators( for (index, val) in complete.iter() { let pubkey = types::PubKey::try_from(val.validator.pubkey.as_str())?; - // TODO: - // submitter(pubkey, val.balance, val.status.to_string()) + // Submit validator balance and status metrics. + // Equivalent to Charon's `newMetricSubmitter` closure + let pubkey_full = pubkey.to_string(); + let pubkey_abbrev = pubkey.abbreviated(); + let balance = val.balance.parse::().unwrap_or_default(); + SCHEDULER_METRICS.validator_balance_gwei[&(pubkey_full.clone(), pubkey_abbrev.clone())] + .set(balance); + + // Emulate Charon's `statusGauge.Reset`: + // Vise's `Family` cannot delete series, so instead set any previously-reported + // status for this validator to 0 and the current one to 1. + let status = val.status.to_string(); + for ((full, abbrev, prev_status), gauge) in SCHEDULER_METRICS.validator_status.to_entries() + { + if full == pubkey_full && abbrev == pubkey_abbrev && prev_status != status { + gauge.set(0); + } + } + SCHEDULER_METRICS.validator_status[&(pubkey_full, pubkey_abbrev, status)].set(1); // Check for active validators for the given epoch. 
// The activation epoch needs to be checked in cases where this function is diff --git a/crates/core/src/scheduler/metrics.rs b/crates/core/src/scheduler/metrics.rs new file mode 100644 index 00000000..21b71913 --- /dev/null +++ b/crates/core/src/scheduler/metrics.rs @@ -0,0 +1,35 @@ +use vise::*; + +/// Metrics for the core scheduler. +#[derive(Debug, Metrics)] +#[metrics(prefix = "core_scheduler")] +pub struct SchedulerMetrics { + /// The current slot. + pub current_slot: Gauge, + + /// The current epoch. + pub current_epoch: Gauge, + + /// The total count of duties scheduled by type. + #[metrics(labels = ["duty"])] + pub duty_total: LabeledFamily, + + /// Number of active validators. + pub validators_active: Gauge, + + /// Total balance of a validator by public key. + #[metrics(labels = ["pubkey_full", "pubkey"])] + pub validator_balance_gwei: LabeledFamily<(String, String), Gauge, 2>, + + /// Gauge with validator pubkey and status as labels, value=1 is current + /// status, value=0 is previous. + #[metrics(labels = ["pubkey_full", "pubkey", "status"])] + pub validator_status: LabeledFamily<(String, String, String), Gauge, 3>, + + /// Total number of times slots were skipped. + pub skipped_slots_total: Counter, +} + +/// Global metrics for the core scheduler. +#[vise::register] +pub static SCHEDULER_METRICS: Global = Global::new(); diff --git a/scheduler-port-plan.md b/scheduler-port-plan.md new file mode 100644 index 00000000..643a2894 --- /dev/null +++ b/scheduler-port-plan.md @@ -0,0 +1,163 @@ +# Port `core/scheduler` from Charon to Pluto (Issue #176) + +## Context + +Charon's `core/scheduler` is the first stage of the duty pipeline: it resolves beacon-chain duties per epoch, ticks the slot clock, and fans duties out to downstream components (Fetcher, Consensus, DutyDB, etc.) via callbacks. Pluto currently has no scheduler — it's a blocker for end-to-end duty execution. 
The duty types it produces (`Attester`, `Aggregator`, `Proposer`, `SyncContribution`) need to be the *first* duties Pluto can emit before the rest of the pipeline can be exercised. + +Verdict: **port is feasible and can start immediately.** Pluto already has every dependency the scheduler needs (eth2api client, validator cache, deadline subsystem, cluster pubkeys, slot/epoch types, vise metrics, tokio task plumbing). The only foundational fix needed before scheduler code can compile cleanly is a one-line semantic correction to `DutyDefinitionSet`. + +## Reference (Charon Go source) + +`/home/emlautarom1/Development/Nethermind/charon/core/scheduler/` +- `scheduler.go` (808 lines) — main logic +- `offset.go` (24 lines) — intra-slot duty offsets (`1/3` for `Attester`, `2/3` for `Aggregator`/`SyncContribution`) +- `metrics.go` (87 lines) — Prometheus metrics +- `scheduler_test.go`, `scheduler_internal_test.go`, `testdata/*.golden` + +## Step 1 — Fix `DutyDefinitionSet` semantics (prerequisite) + +**File:** `crates/core/src/types.rs:411-462` + +The current type is `HashMap>`. Charon's analog (`core/types.go:334`) is `map[PubKey]DutyDefinition` — one definition per validator for a given duty. The scheduler stores duties as `map[Duty]DutyDefinitionSet`, where the outer key already encodes the `DutyType`, so keying the inner map by `DutyType` is wrong. + +Change to `HashMap>` and update method signatures (`get(&PubKey)`, `insert(PubKey, …)`, etc.). The type has exactly one caller (a test in `types.rs:1007-1011`); update it. Verify with `cargo check --workspace` after the change. This mirrors the existing `SignedDataSet(HashMap)` at `types.rs:715`, which is already correctly keyed. + +## Step 2 — Create the scheduler module + +**Location:** `crates/core/src/scheduler/` (new), exposed from `crates/core/src/lib.rs`. 
+ +``` +crates/core/src/scheduler/ + mod.rs — Scheduler struct, public API + offsets.rs — intra-slot duty offsets (port of offset.go) + resolve.rs — resolveAttDuties / resolveProDuties / resolveSyncCommDuties + ticker.rs — slot ticker (port of newSlotTicker) + startup.rs — waitChainStart, waitBeaconSync + metrics.rs — vise metrics (port of metrics.go) +``` + +### 2a — Type signatures (mirror Charon, idiomatic Rust) + +- `pub struct Scheduler { eth2_cl: EthBeaconNodeApiClient, pubkeys: Vec, builder_enabled: bool, … }` +- Cached state behind `Arc>`: + - `duties: HashMap>` + - `duties_by_epoch: HashMap>` + - `resolved_epoch: u64`, `resolving_epoch: u64` +- Subscriber lists: `duty_subs: Vec BoxFuture> + Send + Sync>>`, `slot_subs: Vec<…>`. +- Public methods (1:1 with Charon): + - `Scheduler::new(pubkeys, eth2_cl, builder_enabled)` + - `subscribe_duties(cb)`, `subscribe_slots(cb)` — must be called before `run` + - `run(cancel: CancellationToken) -> Result<()>` + - `get_duty_definition(duty) -> Result` + - `handle_chain_reorg_event(epoch)` — **always enabled** in Pluto (no featureset gating) + +### 2b — Concrete payload type for `DutyDefinition` + +Charon uses an interface; in Rust we'll need a concrete enum, e.g.: + +```rust +pub enum SchedulerDutyDefinition { + Attester(AttesterDutyDefinition), // from eth2api types + Proposer(ProposerDutyDefinition), + SyncContribution(SyncCommitteeDutyDefinition), + // Aggregator reuses the AttesterDutyDefinition payload (per Charon's derivation at scheduler.go:400) +} +``` + +Source the wire shapes from existing `pluto_eth2api::Get{Attester,Proposer,SyncCommittee}DutiesResponseResponseDatum`. + +### 2c — Beacon client access + +Per the user decision: keep the concrete `EthBeaconNodeApiClient` (no trait abstraction). 
All calls go through: +- `client.get_attester_duties(...)` (`eth2api/src/client.rs:1341`) +- `client.get_proposer_duties(...)` (`eth2api/src/client.rs:1368`) +- `client.get_sync_committee_duties(...)` (`eth2api/src/client.rs:1390`) +- `client.get_genesis(...)` for chain-start wait +- `client.fetch_slots_config()` for slot duration / `slots_per_epoch` (`eth2api/src/extensions.rs:273`) +- Node syncing endpoint — verify it exists or add it (Charon uses `eth2Cl.NodeSyncing`). + +Use **`ValidatorCache`** (`crates/app/src/eth2wrap/valcache.rs:69`) to mirror Charon's `resolveActiveValidators` / `CompleteValidators` logic — it already filters active validators per epoch and supports `trim()` on epoch boundary. + +### 2d — Slot ticker (`ticker.rs`) + +Port `newSlotTicker` (scheduler.go:629). Use `tokio::time::sleep_until` with the genesis-time + `slot * slot_duration` formula. Emit `core::Slot { slot, time, slot_duration, slots_per_epoch }` (already defined at `types.rs:763`). For tests, parameterize on a clock source — extend the existing pattern from `crates/app/src/retry.rs` (`time_fn: Arc DateTime>`). + +### 2e — Intra-slot offset delays (`offsets.rs`) + +Direct port of `offset.go`. Map: + +| Duty | Offset | +|------|--------| +| `Attester` | `slot_duration * 1/3` | +| `Aggregator` | `slot_duration * 2/3` | +| `SyncContribution` | `slot_duration * 2/3` | +| `Proposer` | none (fire at slot start) | + +### 2f — Resolve logic (`resolve.rs`) + +Port `resolveDuties` (scheduler.go:298) and its three sub-functions: +- `resolve_att_duties` — calls `get_attester_duties`, emits paired `DutyAttester` + `DutyAggregator` definitions +- `resolve_pro_duties` — calls `get_proposer_duties` +- `resolve_sync_comm_duties` — calls `get_sync_committee_duties`, expands across all slots in the epoch + +Each populates the `duties` cache and `duties_by_epoch` index. 
Use `expbackoff`-equivalent retry via `tokio_retry` or a small hand-rolled retry loop — keep it inline with how the codebase already does retries. + +### 2g — Lifecycle & trimming + +- On `run`: `wait_chain_start` → `wait_beacon_sync` → start ticker loop. +- Per slot: emit slot callbacks, then schedule duties asynchronously (`tokio::spawn` per subscriber per duty, after `delay_slot_offset`). +- On epoch boundary: trim duties older than `trim_epoch_offset = 3` (scheduler.go:28). +- Use `CancellationToken` from `tokio-util` (already a workspace dep) instead of Charon's `quit` channel. + +### 2h — Metrics (`metrics.rs`) + +Port via `vise` (workspace dep, already used in `crates/p2p/src/bandwidth.rs`): +- `slot_gauge`, `epoch_gauge`, `active_vals_gauge`, `skip_counter` +- `duty_counter{type=...}`, `balance_gauge{pubkey=...}`, `status_gauge{pubkey=..., status=...}` + +## Step 3 — Tests + +Per user decision: **port unit tests using `testcontainers`** to drive against a real beacon node, following the pattern in `crates/eth2api/src/integration.rs:1-80` (`BeaconNodeContainer::shared()`). + +Test files to create under `crates/core/src/scheduler/`: +- `mod.rs` `#[cfg(test)] mod tests` — ports of `scheduler_internal_test.go`: `TestResolveAttDuties`, `TestResolveProDuties`, `TestResolveSyncCommDuties`, `TestResolvingEpoch`. +- `tests/integration.rs` (or a sibling integration module) — ports of `scheduler_test.go`: `TestSchedulerDuties`, `TestScheduler_GetDuty`, `TestSchedulerWait`, `TestNoActive`, `TestHandleChainReorgEvent`. Skip `TestIntegration` itself (it's already a flag-gated live-network test in Go and is covered by the testcontainer setup). + +Translate the `*.golden` JSON files from `charon/core/scheduler/testdata/` into Rust fixtures (either inline `serde_json::json!` or check the JSON files in alongside the test module and read via `include_str!`). 
+ +## Critical files + +**Modify (prerequisite):** +- `crates/core/src/types.rs` — re-key `DutyDefinitionSet` to `HashMap>` (lines 411–462) + fix test at 1007–1011. + +**Create:** +- `crates/core/src/scheduler/` (full new module as outlined above). + +**Touch:** +- `crates/core/src/lib.rs` — `pub mod scheduler;` +- `crates/core/Cargo.toml` — add deps as needed (`tokio-util` for `CancellationToken`, `vise` for metrics, possibly `tokio-retry`). + +## Reused existing infrastructure + +- `pluto_core::types::{Duty, DutyType, PubKey, Slot, SlotNumber, DutyDefinition, DutyDefinitionSet}` — `crates/core/src/types.rs` +- `pluto_eth2api::EthBeaconNodeApiClient` + `extensions::{fetch_genesis_time, fetch_slots_config}` — `crates/eth2api/src/` +- `pluto_app::eth2wrap::valcache::{ValidatorCache, ActiveValidators}` — `crates/app/src/eth2wrap/valcache.rs` +- `pluto_core::deadline::{Deadliner, DeadlinerTask}` — `crates/core/src/deadline/mod.rs` (consumer side; scheduler feeds it) +- `pluto_cluster::lock::Lock` — for sourcing the initial `pubkeys: Vec` +- `tokio_util::sync::CancellationToken` — shutdown +- `vise` — metrics +- Testcontainer pattern in `crates/eth2api/src/integration.rs` — beacon node fixture for tests + +## Out of scope (deferred) + +- **SSE listener** that calls `handle_chain_reorg_event` — the method is implemented, but the SSE source isn't wired in this PR (no SSE infra in Pluto yet). +- **`featureset` system** — not needed since we always enable reorg handling. +- **`schedSlotFunc`** — Charon test-only hook; reproduce via test-side closure injection if needed, otherwise drop. + +## Verification + +1. `cargo +nightly fmt --all --check` +2. `cargo clippy --workspace --all-targets --all-features -- -D warnings` +3. `cargo test --workspace --all-features` — ensures the `DutyDefinitionSet` re-key doesn't break anything else and scheduler unit tests pass. +4. 
Scheduler integration tests via testcontainer beacon node: `cargo test -p pluto-core scheduler::` — verify duty resolution against a real BN matches expected slot/validator counts. +5. Manual end-to-end: wire the scheduler in `crates/app/src/lib.rs` against a devnet beacon node, subscribe a logging callback, and confirm `DutyAttester` / `DutyProposer` events fire at the expected intra-slot offsets across two epochs. From e6158bca4f508e4aa00888201154894dd227ff18 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 19:06:31 -0300 Subject: [PATCH 29/48] Add original Charon timeout logic --- crates/core/src/scheduler.rs | 64 +++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index c9913026..d160931d 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -73,7 +73,12 @@ pub enum SchedulerError { /// Duty attempted to be accessed duty: types::Duty, }, + /// Timed out while waiting for the scheduler to respond with a duty + /// definition. + #[error("Timed out while waiting for a duty definition")] + TimeoutError, + /// The underlying scheduler actor has been terminated. #[error("Scheduler actor has been terminated")] Terminated, } @@ -229,6 +234,8 @@ pub struct Handle { impl Handle { /// Returns the definition for a duty if a definition exists for a resolved /// epoch. + /// + /// NOTE: this operation has a default timeout of 100 ms. pub async fn get_duty_definition(&self, duty: types::Duty) -> Result { let (tx, rx) = sync::oneshot::channel(); let msg = Message::GetDutyDefinition { duty, resp: tx }; @@ -238,9 +245,12 @@ impl Handle { .await .map_err(|_| SchedulerError::Terminated)?; - // TODO: In Charon, this call has a default timeout of 100 ms while the epoch is - // being resolved. I don't like that approach. - rx.await.map_err(|_| SchedulerError::Terminated)? 
+ // This has to be very rare event, when the requested epoch is being resolved. + // We wait for the epoch to be resolved before returning the duty definition. + tokio::time::timeout(Duration::from_millis(100), rx) + .await + .map_err(|_| SchedulerError::TimeoutError)? + .map_err(|_| SchedulerError::Terminated)? } } @@ -274,6 +284,13 @@ impl Actor { self.handle_chain_reorg(epoch).await; }, + Some(msg) = msg_rx.recv() => match msg { + Message::GetDutyDefinition { duty, resp } => { + let result = self.get_duty_definition(duty).await; + let _ = resp.send(result); + }, + }, + Some(slot) = slot_rx.recv() => { tracing::debug!(slot = %slot.slot, "Slot ticked"); @@ -285,17 +302,27 @@ impl Actor { self.schedule_slot(slot, ct.clone()).await; }, - - Some(msg) = msg_rx.recv() => match msg { - Message::GetDutyDefinition { duty, resp } => { - let result = self.get_duty_definition(duty).await; - let _ = resp.send(result); - }, - } } } } + /// In case of a reorg of an already resolved epoch trim all duties. + /// + /// Duties will be resolved again in the nex slot. + pub async fn handle_chain_reorg(&mut self, epoch: u64) { + let resolved_epoch = self.resolved_epoch; + if epoch < resolved_epoch { + self.trim_duties(resolved_epoch); + self.resolved_epoch = u64::MAX; + + tracing::info!( + reorg_epoch = epoch, + resolved_epoch, + "Chain reorg event handled, duties trimmed" + ) + } + } + /// Returns the definition for a duty if a definition exists for a resolved /// epoch. async fn get_duty_definition(&mut self, duty: types::Duty) -> Result { @@ -322,23 +349,6 @@ impl Actor { Ok(def_set.clone()) } - /// In case of a reorg of an already resolved epoch trim all duties. - /// - /// Duties will be resolved again in the nex slot. 
- pub async fn handle_chain_reorg(&mut self, epoch: u64) { - let resolved_epoch = self.resolved_epoch; - if epoch < resolved_epoch { - self.trim_duties(resolved_epoch); - self.resolved_epoch = u64::MAX; - - tracing::info!( - reorg_epoch = epoch, - resolved_epoch, - "Chain reorg event handled, duties trimmed" - ) - } - } - /// Resolves upcoming duties and triggers resolved duties for the given /// slot. async fn schedule_slot(&mut self, slot: types::Slot, ct: CancellationToken) { From a1020a5f53473bcc22c1f81d36bf344922b66cc6 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 19:09:11 -0300 Subject: [PATCH 30/48] Add `Default` to builder --- crates/core/src/scheduler.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index d160931d..50397ae3 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -3,7 +3,6 @@ use std::{ collections::{HashMap, hash_map::Entry}, time::Duration, - u64, }; use backon::{BackoffBuilder, Retryable}; @@ -220,6 +219,11 @@ impl Builder { } } +impl Default for Builder { + fn default() -> Self { + Self::new() + } +} enum Message { GetDutyDefinition { duty: types::Duty, From 0993ff95fef647bba087d6c0d51e1011b64a0294 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 19:17:45 -0300 Subject: [PATCH 31/48] Fix clippy lints --- crates/core/src/scheduler.rs | 80 +++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 28 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 50397ae3..09500418 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -335,7 +335,11 @@ impl Actor { } let (_, slots_per_epoch) = self.client.fetch_slots_config().await?; - let epoch = duty.slot.inner() / slots_per_epoch; + let epoch = duty + .slot + .inner() + .checked_div(slots_per_epoch) + .expect("non-zero"); if !self.is_epoch_resolved(epoch) { return 
Err(SchedulerError::EpochNotResolved { epoch }); @@ -383,9 +387,10 @@ impl Actor { let slot = slot.clone(); let broadcast = self.duty_broadcast.clone(); tokio::spawn(async move { - if let None = delay_slot_offset(&slot, &duty) + if delay_slot_offset(&slot, &duty) .with_cancellation_token_owned(ct) .await + .is_none() { // Cancelled early return; @@ -399,10 +404,10 @@ impl Actor { }); } - if slot.last_in_epoch() { - if let Err(err) = self.resolve_duties(slot.next_slot()).await { - tracing::warn!(err = ?err, slot = %slot.slot, "Resolving duties error (retrying next slot)"); - } + if slot.last_in_epoch() + && let Err(err) = self.resolve_duties(slot.next_slot()).await + { + tracing::warn!(err = ?err, slot = %slot.slot, "Resolving duties error (retrying next slot)"); } } @@ -528,10 +533,7 @@ impl Actor { entry.insert(definition); } }; - self.duties_by_epoch - .entry(epoch) - .or_insert(Vec::new()) - .push(duty); + self.duties_by_epoch.entry(epoch).or_default().push(duty); true } @@ -540,7 +542,7 @@ impl Actor { fn trim_duties(&mut self, epoch: u64) { let duties = self.duties_by_epoch.remove(&epoch); if let Some(duties) = duties - && duties.len() > 0 + && !duties.is_empty() { for duty in duties { self.duties.remove(&duty); @@ -554,7 +556,7 @@ impl Actor { return false; } - self.resolved_epoch >= epoch + TRIM_EPOCH_OFFSET + self.resolved_epoch >= epoch.saturating_add(TRIM_EPOCH_OFFSET) } /// Returns true if the epoch is resolved @@ -578,16 +580,23 @@ async fn new_slot_ticker( ) -> Result> { let genesis_time = client.fetch_genesis_time().await?; let (slot_duration, slots_per_epoch) = client.fetch_slots_config().await?; - let slot_duration = chrono::Duration::from_std(slot_duration).unwrap(); + let slot_duration = chrono::Duration::from_std(slot_duration).expect("withing range"); let current_slot = move || { - let chain_age = chrono::Utc::now() - genesis_time; + let chain_age = chrono::Utc::now().signed_duration_since(genesis_time); let slot_ms = 
slot_duration.num_milliseconds(); - let slot = chain_age.num_milliseconds() / slot_ms; - let start_time = genesis_time + chrono::Duration::milliseconds(slot * slot_ms); + let slot = chain_age + .num_milliseconds() + .checked_div(slot_ms) + .expect("non-zero"); + let start_offset = + chrono::Duration::milliseconds(slot.checked_mul(slot_ms).expect("within range")); + let start_time = genesis_time + .checked_add_signed(start_offset) + .expect("within range"); types::Slot { - slot: types::SlotNumber::new(slot as u64), + slot: types::SlotNumber::new(slot.cast_unsigned()), time: start_time, slots_per_epoch, slot_duration, @@ -599,7 +608,9 @@ async fn new_slot_ticker( let mut slot = current_slot(); loop { - let wait = (slot.time - chrono::Utc::now()) + let wait = slot + .time + .signed_duration_since(chrono::Utc::now()) .to_std() .unwrap_or_default(); tokio::time::sleep(wait).await; @@ -726,7 +737,10 @@ async fn wait_chain_start(client: &pluto_eth2api::client::EthBeaconNodeApiClient let now = chrono::Utc::now(); if now < genesis_time { - let delta = (genesis_time - now).to_std().unwrap_or_default(); + let delta = genesis_time + .signed_duration_since(now) + .to_std() + .unwrap_or_default(); tracing::info!(genesis_time = %genesis_time, sleep = ?delta, "Sleeping until genesis time"); tokio::time::sleep(delta).await; } @@ -772,16 +786,23 @@ async fn wait_beacon_sync(client: &pluto_eth2api::client::EthBeaconNodeApiClient /// Blocks until the slot offset for the duty has been reached. async fn delay_slot_offset(slot: &types::Slot, duty: &types::Duty) { + // A slot duration is small (~12s), so these never overflow chrono's range. 
let offset = match duty.duty_type { - types::DutyType::Attester => slot.slot_duration / 3, - types::DutyType::Aggregator => slot.slot_duration * 2 / 3, - types::DutyType::SyncContribution => slot.slot_duration * 2 / 3, + types::DutyType::Attester => slot.slot_duration.checked_div(3).expect("within range"), + types::DutyType::Aggregator | types::DutyType::SyncContribution => slot + .slot_duration + .checked_mul(2) + .and_then(|d| d.checked_div(3)) + .expect("within range"), _ => return, }; // Wait until the absolute deadline - let deadline = slot.time + offset; - let wait = (deadline - chrono::Utc::now()).to_std().unwrap_or_default(); + let deadline = slot.time.checked_add_signed(offset).expect("within range"); + let wait = deadline + .signed_duration_since(chrono::Utc::now()) + .to_std() + .unwrap_or_default(); tokio::time::sleep(wait).await; } @@ -789,9 +810,10 @@ async fn delay_slot_offset(slot: &types::Slot, duty: &types::Duty) { /// that the returned duties match the expected validators. async fn fetch_attester_duties( slot: &types::Slot, - validators: &Vec, + validators: impl AsRef<[Validator]>, client: &client::EthBeaconNodeApiClient, ) -> Result> { + let validators = validators.as_ref(); let req = pluto_eth2api::GetAttesterDutiesRequest::builder() .epoch(slot.epoch().to_string()) .body(validators.iter().map(|v| v.v_idx.to_string()).collect()) @@ -851,7 +873,7 @@ async fn fetch_attester_duties( result.push(att_duty); } - if remaining.len() > 0 { + if !remaining.is_empty() { tracing::warn!( slot = %slot.slot, epoch = %slot.epoch(), @@ -867,9 +889,10 @@ async fn fetch_attester_duties( /// that the returned duties match the expected validators. 
async fn fetch_proposer_duties( slot: &types::Slot, - validators: &Vec, + validators: impl AsRef<[Validator]>, client: &client::EthBeaconNodeApiClient, ) -> Result> { + let validators = validators.as_ref(); let req = pluto_eth2api::GetProposerDutiesRequest::builder() .epoch(slot.epoch().to_string()) .build() @@ -928,9 +951,10 @@ async fn fetch_proposer_duties( /// validates that the returned duties match the expected validators. async fn fetch_sync_committee_duties( slot: &types::Slot, - validators: &Vec, + validators: impl AsRef<[Validator]>, client: &client::EthBeaconNodeApiClient, ) -> Result> { + let validators = validators.as_ref(); let req = pluto_eth2api::GetSyncCommitteeDutiesRequest::builder() .epoch(slot.epoch().to_string()) .body(validators.iter().map(|v| v.v_idx.to_string()).collect()) From 9606d1a96a4ceb107d495332b99924ea27f16c07 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 29 May 2026 20:00:20 -0300 Subject: [PATCH 32/48] Cleanup - Add docs - Simplify error messages --- crates/core/src/scheduler.rs | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 09500418..b7af7a08 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -1,5 +1,3 @@ -#![allow(missing_docs)] - use std::{ collections::{HashMap, hash_map::Entry}, time::Duration, @@ -61,7 +59,10 @@ pub enum SchedulerError { /// Attempted to get a duty definition for an epoch that has not been /// resolved yet. #[error("Epoch {epoch} has not been resolved yet")] - EpochNotResolved { epoch: u64 }, + EpochNotResolved { + /// The unresolved epoch. + epoch: u64, + }, /// Duty definition not found for a resolved epoch. #[error("Duty {duty} definition set not found in the resolved epoch {epoch}")] @@ -84,6 +85,12 @@ pub enum SchedulerError { type Result = std::result::Result; +/// A builder for the Scheduler. 
+/// +/// Allows setting up subscriptions for slot and duty events, as well as +/// well as setting up a source of chain reorg events. +/// +/// The Scheduler can be started by calling [`Builder::build`]. pub struct Builder { slot_broadcast: sync::broadcast::Sender, duty_broadcast: sync::broadcast::Sender<(types::Duty, types::DutyDefinitionSet)>, @@ -231,6 +238,11 @@ enum Message { }, } +/// A handle to interact with the Scheduler actor. +/// +/// Cloning the handle is cheap and allows sending messages to the actor from +/// multiple tasks. +#[derive(Clone)] pub struct Handle { sender: sync::mpsc::Sender, } From 80436587ef6fddc6868d63687d83ba7fae338a94 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Mon, 1 Jun 2026 13:33:32 -0300 Subject: [PATCH 33/48] Add test suite - Fix edge case when trimming small epochs --- crates/core/src/scheduler.rs | 507 ++++++++++++++++++++++++++++++++++- 1 file changed, 506 insertions(+), 1 deletion(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index b7af7a08..f191dc24 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -523,7 +523,12 @@ impl Actor { } self.resolved_epoch = slot.epoch(); - self.trim_duties(slot.epoch().saturating_sub(TRIM_EPOCH_OFFSET)); + // Only trim once there is an epoch old enough to trim. + // NOTE: Charon relies on `uint64` underflow wrapping to a huge (absent) epoch + // for epochs < 3. 
`checked_sub` reproduces that no-op + if let Some(trim_epoch) = slot.epoch().checked_sub(TRIM_EPOCH_OFFSET) { + self.trim_duties(trim_epoch); + } Ok(()) } @@ -1016,3 +1021,503 @@ async fn fetch_sync_committee_duties( Ok(result) } + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use pluto_eth2api::{ + GetStateValidatorsResponseResponse, GetStateValidatorsResponseResponseDatum, + }; + use pluto_testutil::{BeaconMock, ValidatorSet}; + use wiremock::{ + Mock, ResponseTemplate, + matchers::{method, path}, + }; + + use super::*; + use crate::testutils::random_core_pub_key; + + /// Builds a beacon mock seeded with `ValidatorSetA` and deterministic + /// duties for every duty type + async fn duties_mock(slots_per_epoch: u64) -> BeaconMock { + BeaconMock::builder() + .validator_set(ValidatorSet::validator_set_a()) + .deterministic_attester_duties(0) + .deterministic_proposer_duties(0) + .deterministic_sync_comm_duties((2, 2)) + .slots_per_epoch(slots_per_epoch) + .slot_duration(std::time::Duration::from_secs(12)) + .build() + .await + .expect("build beacon mock") + } + + /// The `ValidatorSetA` validators as `/states/head/validators`. + /// + /// NOTE: the default mock only serves this endpoint over GET, but + /// `valcache::get_by_head` queries it over POST. 
+ fn validator_set_a_datums() -> Vec { + ValidatorSet::validator_set_a() + .validators() + .into_iter() + .map(|v| GetStateValidatorsResponseResponseDatum { + index: v.index.to_string(), + balance: v.balance.to_string(), + status: v.status, + validator: v.validator, + }) + .collect() + } + + /// `ValidatorSetA` validators with their real indexes but random pubkeys, + /// to force the `InvalidDutyPubkey` mismatch path + fn validator_set_a_mismatched() -> Vec { + ValidatorSet::validator_set_a() + .validators() + .into_iter() + .map(|v| Validator { + pubkey: random_core_pub_key(), + v_idx: v.index, + }) + .collect() + } + + /// Mounts the POST `/states/head/validators` endpoint used by + /// `valcache::get_by_head`. + async fn mount_head_validators( + mock: &BeaconMock, + data: Vec, + ) { + Mock::given(method("POST")) + .and(path("/eth/v1/beacon/states/head/validators")) + .respond_with(ResponseTemplate::new(200).set_body_json( + GetStateValidatorsResponseResponse { + execution_optimistic: false, + finalized: true, + data, + }, + )) + .mount(mock.server()) + .await; + } + + /// Builds an initial `Actor` wired to the mock's client. No epoch resolved + /// yet. + fn test_actor(mock: &BeaconMock) -> Actor { + let client = mock.client().clone(); + Actor { + client: client.clone(), + valcache: valcache::ValidatorCache::new(client, Vec::new()), + slot_broadcast: sync::broadcast::channel(100).0, + duty_broadcast: sync::broadcast::channel(100).0, + resolved_epoch: u64::MAX, + duties: HashMap::new(), + duties_by_epoch: HashMap::new(), + } + } + + /// A `Slot` dated far in the past so `delay_slot_offset` deadlines have + /// already elapsed and duty broadcasts fire immediately. 
+ fn test_past_slot(slot: u64, slots_per_epoch: u64) -> types::Slot { + types::Slot { + slot: types::SlotNumber::new(slot), + time: chrono::Utc::now() + .checked_sub_signed(chrono::Duration::days(1)) + .expect("within chrono range"), + slot_duration: chrono::Duration::seconds(12), + slots_per_epoch, + } + } + + /// Builds an attester duty definition for tests. + fn test_attester_def(pubkey: types::PubKey, v_idx: u64, slot: u64) -> types::DutyDefinition { + let datum = pluto_eth2api::types::GetAttesterDutiesResponseResponseDatum { + pubkey: pubkey.to_string(), + validator_index: v_idx.to_string(), + slot: slot.to_string(), + ..Default::default() + }; + let def: types::AttesterDutyDefinition = datum.try_into().expect("valid attester datum"); + types::DutyDefinition::Attester(def) + } + + /// Drives the actor's `run` loop with test-controlled channels. + struct Harness { + slot_tx: sync::mpsc::Sender, + reorg_tx: sync::mpsc::Sender, + handle: Handle, + slot_sub: sync::broadcast::Receiver, + duty_sub: sync::broadcast::Receiver<(types::Duty, types::DutyDefinitionSet)>, + ct: CancellationToken, + } + + fn spawn_actor(mock: &BeaconMock) -> Harness { + let client = mock.client().clone(); + let slot_broadcast = sync::broadcast::channel(100).0; + let duty_broadcast = sync::broadcast::channel(100).0; + let slot_sub = slot_broadcast.subscribe(); + let duty_sub = duty_broadcast.subscribe(); + + let actor = Actor { + client: client.clone(), + valcache: valcache::ValidatorCache::new(client, Vec::new()), + slot_broadcast, + duty_broadcast, + resolved_epoch: u64::MAX, + duties: HashMap::new(), + duties_by_epoch: HashMap::new(), + }; + + let (slot_tx, slot_rx) = sync::mpsc::channel(100); + let (msg_tx, msg_rx) = sync::mpsc::channel(100); + let (reorg_tx, reorg_rx) = sync::mpsc::channel(100); + let ct = CancellationToken::new(); + + tokio::spawn(actor.run(slot_rx, msg_rx, reorg_rx, ct.clone())); + + Harness { + slot_tx, + reorg_tx, + handle: Handle { sender: msg_tx }, + slot_sub, + 
duty_sub, + ct, + } + } + + #[tokio::test] + async fn fetch_attester_duties_rejects_mismatched_pubkey() { + let mock = duties_mock(1).await; + let err = fetch_attester_duties( + &test_past_slot(0, 1), + validator_set_a_mismatched(), + mock.client(), + ) + .await + .expect_err("mismatched pubkey should be rejected"); + assert!(matches!(err, SchedulerError::InvalidDutyPubkey { .. })); + } + + #[tokio::test] + async fn fetch_proposer_duties_rejects_mismatched_pubkey() { + let mock = duties_mock(1).await; + let err = fetch_proposer_duties( + &test_past_slot(0, 1), + validator_set_a_mismatched(), + mock.client(), + ) + .await + .expect_err("mismatched pubkey should be rejected"); + assert!(matches!(err, SchedulerError::InvalidDutyPubkey { .. })); + } + + #[tokio::test] + async fn fetch_sync_committee_duties_rejects_mismatched_pubkey() { + let mock = duties_mock(1).await; + let err = fetch_sync_committee_duties( + &test_past_slot(0, 1), + validator_set_a_mismatched(), + mock.client(), + ) + .await + .expect_err("mismatched pubkey should be rejected"); + assert!(matches!(err, SchedulerError::InvalidDutyPubkey { .. })); + } + + #[tokio::test] + async fn epoch_resolved_and_trimmed_boundaries() { + let mock = BeaconMock::builder().build().await.expect("build mock"); + let mut actor = test_actor(&mock); + + // Sentinel: nothing resolved yet. + assert!(!actor.is_epoch_resolved(5)); + assert!(!actor.is_epoch_trimmed(5)); + + actor.resolved_epoch = 10; + assert!(actor.is_epoch_resolved(9)); + assert!(actor.is_epoch_resolved(10)); + assert!(!actor.is_epoch_resolved(11)); + + // Trimmed iff resolved_epoch >= epoch + TRIM_EPOCH_OFFSET (epoch <= 7). 
+ assert!(actor.is_epoch_trimmed(7)); + assert!(!actor.is_epoch_trimmed(8)); + } + + #[tokio::test] + async fn set_duty_definition_dedups_and_trim_removes() { + let mock = BeaconMock::builder().build().await.expect("build mock"); + let mut actor = test_actor(&mock); + + let duty = types::Duty::new_attester_duty(types::SlotNumber::new(0)); + let pk = random_core_pub_key(); + let def = test_attester_def(pk, 1, 0); + + assert!(actor.set_duty_definition(duty.clone(), 0, pk, def.clone())); + // Same pubkey for the same duty is a no-op and reports `false`. + assert!(!actor.set_duty_definition(duty.clone(), 0, pk, def)); + assert!(actor.duties.contains_key(&duty)); + + actor.trim_duties(0); + assert!(!actor.duties.contains_key(&duty)); + } + + #[tokio::test] + async fn get_duty_definition_variants() { + let mock = BeaconMock::builder() + .slots_per_epoch(1) + .build() + .await + .expect("build mock"); + let mut actor = test_actor(&mock); + let slot0 = types::SlotNumber::new(0); + + // Deprecated builder-proposer duty (checked before any network call). + let builder = types::Duty::new(slot0, types::DutyType::BuilderProposer); + assert!(matches!( + actor.get_duty_definition(builder).await, + Err(SchedulerError::DeprecatedDutyBuilderProposer) + )); + + // Epoch not resolved yet (resolved_epoch == u64::MAX). + let att = types::Duty::new_attester_duty(slot0); + assert!(matches!( + actor.get_duty_definition(att.clone()).await, + Err(SchedulerError::EpochNotResolved { epoch: 0 }) + )); + + // Resolved but no duty stored. + actor.resolved_epoch = 0; + assert!(matches!( + actor.get_duty_definition(att.clone()).await, + Err(SchedulerError::DutyNotFound { epoch: 0, .. }) + )); + + // Resolved and present: returns a clone of the definition set. 
+ let pk = random_core_pub_key(); + actor.set_duty_definition(att.clone(), 0, pk, test_attester_def(pk, 1, 0)); + let set = actor + .get_duty_definition(att.clone()) + .await + .expect("resolved duty is returned"); + assert!(set.contains_key(&pk)); + + // Advance resolved_epoch so epoch 0 is now trimmed. + actor.resolved_epoch = TRIM_EPOCH_OFFSET; + assert!(matches!( + actor.get_duty_definition(att).await, + Err(SchedulerError::EpochAlreadyTrimmed { epoch: 0, .. }) + )); + } + + #[tokio::test] + async fn resolve_duties_stores_all_duty_types() { + let mock = duties_mock(16).await; + mount_head_validators(&mock, validator_set_a_datums()).await; + let mut actor = test_actor(&mock); + + actor + .resolve_duties(test_past_slot(0, 16)) + .await + .expect("resolve duties"); + + assert_eq!(actor.resolved_epoch, 0); + + let slot0 = types::SlotNumber::new(0); + // Attester duty plus its paired aggregator duty. + assert!( + actor + .duties + .contains_key(&types::Duty::new_attester_duty(slot0)) + ); + assert!( + actor + .duties + .contains_key(&types::Duty::new_aggregator_duty(slot0)) + ); + // Proposer and sync-contribution duties. + assert!( + actor + .duties + .contains_key(&types::Duty::new_proposer_duty(slot0)) + ); + assert!( + actor + .duties + .contains_key(&types::Duty::new_sync_contribution_duty(slot0)) + ); + } + + #[tokio::test] + async fn resolve_duties_no_active_validators() { + let mock = BeaconMock::builder() + .slots_per_epoch(1) + .build() + .await + .expect("build mock"); + mount_head_validators(&mock, Vec::new()).await; + let mut actor = test_actor(&mock); + + actor + .resolve_duties(test_past_slot(0, 1)) + .await + .expect("resolve duties"); + + assert_eq!(actor.resolved_epoch, 0); + assert!(matches!( + actor + .get_duty_definition(types::Duty::new_attester_duty(types::SlotNumber::new(0))) + .await, + Err(SchedulerError::DutyNotFound { .. 
}) + )); + } + + #[tokio::test] + async fn handle_chain_reorg_trims_and_resets() { + let mock = BeaconMock::builder().build().await.expect("build mock"); + let mut actor = test_actor(&mock); + + // Seed a resolved epoch 5 holding one duty. + let duty = types::Duty::new_attester_duty(types::SlotNumber::new(5)); + let pk = random_core_pub_key(); + actor.set_duty_definition(duty.clone(), 5, pk, test_attester_def(pk, 1, 5)); + actor.resolved_epoch = 5; + + // A reorg at/after the resolved epoch is a no-op. + actor.handle_chain_reorg(5).await; + assert_eq!(actor.resolved_epoch, 5); + assert!(actor.duties.contains_key(&duty)); + + // A reorg before the resolved epoch trims duties and resets the epoch. + actor.handle_chain_reorg(4).await; + assert_eq!(actor.resolved_epoch, u64::MAX); + assert!(!actor.duties.contains_key(&duty)); + } + + // ---- 6. Channel-driven actor (the full run loop) ---------------------- + + #[test_case::test_case(0 ; "first slot in epoch 0 triggers duties")] + #[test_case::test_case(16 ; "first slot in epoch 1 triggers duties")] + #[tokio::test] + async fn first_slot_broadcasts_slot_and_triggers_duties(slot_number: u64) { + let mock = duties_mock(16).await; + mount_head_validators(&mock, validator_set_a_datums()).await; + let mut h = spawn_actor(&mock); + + h.slot_tx + .send(test_past_slot(slot_number, 16)) + .await + .expect("send slot"); + + // The slot itself is broadcast immediately. + let slot = tokio::time::timeout(Duration::from_secs(2), h.slot_sub.recv()) + .await + .expect("slot broadcast within timeout") + .expect("slot value"); + assert_eq!(slot.slot.inner(), slot_number); + + // Past-dated slot => duties broadcast (near-)immediately. Collect the + // four expected duty types triggered for the given slot. 
+ let mut seen = HashSet::new(); + while seen.len() < 4 { + let (duty, set) = tokio::time::timeout(Duration::from_secs(2), h.duty_sub.recv()) + .await + .expect("duty broadcast within timeout") + .expect("duty value"); + assert!(!set.is_empty()); + seen.insert(duty.duty_type); + } + assert!(seen.contains(&types::DutyType::Attester)); + assert!(seen.contains(&types::DutyType::Aggregator)); + assert!(seen.contains(&types::DutyType::Proposer)); + assert!(seen.contains(&types::DutyType::SyncContribution)); + + h.ct.cancel(); + } + + #[test_case::test_case(1 ; "mid-epoch slot 1 triggers only sync contribution duties")] + #[test_case::test_case(5 ; "mid-epoch slot 5 triggers only sync contribution duties")] + #[test_case::test_case(15 ; "mid-epoch slot 15 triggers only sync contribution duties")] + #[tokio::test] + async fn mid_epoch_slot_broadcasts_slot_and_triggers_only_sync_contribution_duty( + slot_number: u64, + ) { + let mock = duties_mock(16).await; + mount_head_validators(&mock, validator_set_a_datums()).await; + let mut h = spawn_actor(&mock); + + // Slot is mid-epoch (epoch 0 spans slots 0..=15). With the deterministic + // Beacon setup: + // - Attester duties are only included in the first slot of an epoch + // - The paired Aggregator duties are not included either + // - Proposer duties are only included in the first slot of an epoch + // - Sync-committee contribution duties are included in every slot of an epoch + h.slot_tx + .send(test_past_slot(slot_number, 16)) + .await + .expect("send slot"); + + // The slot itself is broadcast immediately. + let slot = tokio::time::timeout(Duration::from_secs(2), h.slot_sub.recv()) + .await + .expect("slot broadcast within timeout") + .expect("slot value"); + assert_eq!(slot.slot.inner(), slot_number); + + // The only duty triggered for a mid-epoch slot is the sync-committee + // contribution. 
+ let (duty, set) = tokio::time::timeout(Duration::from_secs(2), h.duty_sub.recv()) + .await + .expect("duty broadcast within timeout") + .expect("duty value"); + assert_eq!(duty.duty_type, types::DutyType::SyncContribution); + assert!(!set.is_empty()); + + // No attester/proposer/aggregator duty is broadcast for this slot. + let next = tokio::time::timeout(Duration::from_millis(200), h.duty_sub.recv()).await; + assert!( + next.is_err(), + "expected no further duty broadcasts, got {next:?}" + ); + + h.ct.cancel(); + } + + #[tokio::test] + async fn get_duty_success_then_reorg_then_get_duty_fails() { + let mock = duties_mock(16).await; + mount_head_validators(&mock, validator_set_a_datums()).await; + let mut h = spawn_actor(&mock); + + // Drive a slot in epoch 1 and wait for a duty broadcast, which only + // happens once `resolve_duties` has completed for the epoch. + h.slot_tx + .send(test_past_slot(16, 16)) + .await + .expect("send slot"); + tokio::time::timeout(Duration::from_secs(2), h.duty_sub.recv()) + .await + .expect("duty broadcast within timeout") + .expect("duty value"); + + // The handle can now read the resolved attester duty. + let att = types::Duty::new_attester_duty(types::SlotNumber::new(16)); + let set = h + .handle + .get_duty_definition(att.clone()) + .await + .expect("resolved duty"); + assert!(!set.is_empty()); + + // A reorg before the resolved epoch trims duties; the handle then + // reports the epoch as unresolved. The reorg is handled first so an immediate + // read observes the reset. + h.reorg_tx.send(0).await.expect("send reorg"); + assert!(matches!( + h.handle.get_duty_definition(att).await, + Err(SchedulerError::EpochNotResolved { .. 
}) + )); + + h.ct.cancel(); + } +} From f19dc979cfe58e4cd8c4075781b88617ba3c6c86 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Mon, 1 Jun 2026 13:45:44 -0300 Subject: [PATCH 34/48] Delete planning artifact --- scheduler-port-plan.md | 163 ----------------------------------------- 1 file changed, 163 deletions(-) delete mode 100644 scheduler-port-plan.md diff --git a/scheduler-port-plan.md b/scheduler-port-plan.md deleted file mode 100644 index 643a2894..00000000 --- a/scheduler-port-plan.md +++ /dev/null @@ -1,163 +0,0 @@ -# Port `core/scheduler` from Charon to Pluto (Issue #176) - -## Context - -Charon's `core/scheduler` is the first stage of the duty pipeline: it resolves beacon-chain duties per epoch, ticks the slot clock, and fans duties out to downstream components (Fetcher, Consensus, DutyDB, etc.) via callbacks. Pluto currently has no scheduler — it's a blocker for end-to-end duty execution. The duty types it produces (`Attester`, `Aggregator`, `Proposer`, `SyncContribution`) need to be the *first* duties Pluto can emit before the rest of the pipeline can be exercised. - -Verdict: **port is feasible and can start immediately.** Pluto already has every dependency the scheduler needs (eth2api client, validator cache, deadline subsystem, cluster pubkeys, slot/epoch types, vise metrics, tokio task plumbing). The only foundational fix needed before scheduler code can compile cleanly is a one-line semantic correction to `DutyDefinitionSet`. 
- -## Reference (Charon Go source) - -`/home/emlautarom1/Development/Nethermind/charon/core/scheduler/` -- `scheduler.go` (808 lines) — main logic -- `offset.go` (24 lines) — intra-slot duty offsets (`1/3` for `Attester`, `2/3` for `Aggregator`/`SyncContribution`) -- `metrics.go` (87 lines) — Prometheus metrics -- `scheduler_test.go`, `scheduler_internal_test.go`, `testdata/*.golden` - -## Step 1 — Fix `DutyDefinitionSet` semantics (prerequisite) - -**File:** `crates/core/src/types.rs:411-462` - -The current type is `HashMap>`. Charon's analog (`core/types.go:334`) is `map[PubKey]DutyDefinition` — one definition per validator for a given duty. The scheduler stores duties as `map[Duty]DutyDefinitionSet`, where the outer key already encodes the `DutyType`, so keying the inner map by `DutyType` is wrong. - -Change to `HashMap>` and update method signatures (`get(&PubKey)`, `insert(PubKey, …)`, etc.). The type has exactly one caller (a test in `types.rs:1007-1011`); update it. Verify with `cargo check --workspace` after the change. This mirrors the existing `SignedDataSet(HashMap)` at `types.rs:715`, which is already correctly keyed. - -## Step 2 — Create the scheduler module - -**Location:** `crates/core/src/scheduler/` (new), exposed from `crates/core/src/lib.rs`. 
- -``` -crates/core/src/scheduler/ - mod.rs — Scheduler struct, public API - offsets.rs — intra-slot duty offsets (port of offset.go) - resolve.rs — resolveAttDuties / resolveProDuties / resolveSyncCommDuties - ticker.rs — slot ticker (port of newSlotTicker) - startup.rs — waitChainStart, waitBeaconSync - metrics.rs — vise metrics (port of metrics.go) -``` - -### 2a — Type signatures (mirror Charon, idiomatic Rust) - -- `pub struct Scheduler { eth2_cl: EthBeaconNodeApiClient, pubkeys: Vec, builder_enabled: bool, … }` -- Cached state behind `Arc>`: - - `duties: HashMap>` - - `duties_by_epoch: HashMap>` - - `resolved_epoch: u64`, `resolving_epoch: u64` -- Subscriber lists: `duty_subs: Vec BoxFuture> + Send + Sync>>`, `slot_subs: Vec<…>`. -- Public methods (1:1 with Charon): - - `Scheduler::new(pubkeys, eth2_cl, builder_enabled)` - - `subscribe_duties(cb)`, `subscribe_slots(cb)` — must be called before `run` - - `run(cancel: CancellationToken) -> Result<()>` - - `get_duty_definition(duty) -> Result` - - `handle_chain_reorg_event(epoch)` — **always enabled** in Pluto (no featureset gating) - -### 2b — Concrete payload type for `DutyDefinition` - -Charon uses an interface; in Rust we'll need a concrete enum, e.g.: - -```rust -pub enum SchedulerDutyDefinition { - Attester(AttesterDutyDefinition), // from eth2api types - Proposer(ProposerDutyDefinition), - SyncContribution(SyncCommitteeDutyDefinition), - // Aggregator reuses the AttesterDutyDefinition payload (per Charon's derivation at scheduler.go:400) -} -``` - -Source the wire shapes from existing `pluto_eth2api::Get{Attester,Proposer,SyncCommittee}DutiesResponseResponseDatum`. - -### 2c — Beacon client access - -Per the user decision: keep the concrete `EthBeaconNodeApiClient` (no trait abstraction). 
All calls go through: -- `client.get_attester_duties(...)` (`eth2api/src/client.rs:1341`) -- `client.get_proposer_duties(...)` (`eth2api/src/client.rs:1368`) -- `client.get_sync_committee_duties(...)` (`eth2api/src/client.rs:1390`) -- `client.get_genesis(...)` for chain-start wait -- `client.fetch_slots_config()` for slot duration / `slots_per_epoch` (`eth2api/src/extensions.rs:273`) -- Node syncing endpoint — verify it exists or add it (Charon uses `eth2Cl.NodeSyncing`). - -Use **`ValidatorCache`** (`crates/app/src/eth2wrap/valcache.rs:69`) to mirror Charon's `resolveActiveValidators` / `CompleteValidators` logic — it already filters active validators per epoch and supports `trim()` on epoch boundary. - -### 2d — Slot ticker (`ticker.rs`) - -Port `newSlotTicker` (scheduler.go:629). Use `tokio::time::sleep_until` with the genesis-time + `slot * slot_duration` formula. Emit `core::Slot { slot, time, slot_duration, slots_per_epoch }` (already defined at `types.rs:763`). For tests, parameterize on a clock source — extend the existing pattern from `crates/app/src/retry.rs` (`time_fn: Arc DateTime>`). - -### 2e — Intra-slot offset delays (`offsets.rs`) - -Direct port of `offset.go`. Map: - -| Duty | Offset | -|------|--------| -| `Attester` | `slot_duration * 1/3` | -| `Aggregator` | `slot_duration * 2/3` | -| `SyncContribution` | `slot_duration * 2/3` | -| `Proposer` | none (fire at slot start) | - -### 2f — Resolve logic (`resolve.rs`) - -Port `resolveDuties` (scheduler.go:298) and its three sub-functions: -- `resolve_att_duties` — calls `get_attester_duties`, emits paired `DutyAttester` + `DutyAggregator` definitions -- `resolve_pro_duties` — calls `get_proposer_duties` -- `resolve_sync_comm_duties` — calls `get_sync_committee_duties`, expands across all slots in the epoch - -Each populates the `duties` cache and `duties_by_epoch` index. 
Use `expbackoff`-equivalent retry via `tokio_retry` or a small hand-rolled retry loop — keep it inline with how the codebase already does retries. - -### 2g — Lifecycle & trimming - -- On `run`: `wait_chain_start` → `wait_beacon_sync` → start ticker loop. -- Per slot: emit slot callbacks, then schedule duties asynchronously (`tokio::spawn` per subscriber per duty, after `delay_slot_offset`). -- On epoch boundary: trim duties older than `trim_epoch_offset = 3` (scheduler.go:28). -- Use `CancellationToken` from `tokio-util` (already a workspace dep) instead of Charon's `quit` channel. - -### 2h — Metrics (`metrics.rs`) - -Port via `vise` (workspace dep, already used in `crates/p2p/src/bandwidth.rs`): -- `slot_gauge`, `epoch_gauge`, `active_vals_gauge`, `skip_counter` -- `duty_counter{type=...}`, `balance_gauge{pubkey=...}`, `status_gauge{pubkey=..., status=...}` - -## Step 3 — Tests - -Per user decision: **port unit tests using `testcontainers`** to drive against a real beacon node, following the pattern in `crates/eth2api/src/integration.rs:1-80` (`BeaconNodeContainer::shared()`). - -Test files to create under `crates/core/src/scheduler/`: -- `mod.rs` `#[cfg(test)] mod tests` — ports of `scheduler_internal_test.go`: `TestResolveAttDuties`, `TestResolveProDuties`, `TestResolveSyncCommDuties`, `TestResolvingEpoch`. -- `tests/integration.rs` (or a sibling integration module) — ports of `scheduler_test.go`: `TestSchedulerDuties`, `TestScheduler_GetDuty`, `TestSchedulerWait`, `TestNoActive`, `TestHandleChainReorgEvent`. Skip `TestIntegration` itself (it's already a flag-gated live-network test in Go and is covered by the testcontainer setup). - -Translate the `*.golden` JSON files from `charon/core/scheduler/testdata/` into Rust fixtures (either inline `serde_json::json!` or check the JSON files in alongside the test module and read via `include_str!`). 
- -## Critical files - -**Modify (prerequisite):** -- `crates/core/src/types.rs` — re-key `DutyDefinitionSet` to `HashMap>` (lines 411–462) + fix test at 1007–1011. - -**Create:** -- `crates/core/src/scheduler/` (full new module as outlined above). - -**Touch:** -- `crates/core/src/lib.rs` — `pub mod scheduler;` -- `crates/core/Cargo.toml` — add deps as needed (`tokio-util` for `CancellationToken`, `vise` for metrics, possibly `tokio-retry`). - -## Reused existing infrastructure - -- `pluto_core::types::{Duty, DutyType, PubKey, Slot, SlotNumber, DutyDefinition, DutyDefinitionSet}` — `crates/core/src/types.rs` -- `pluto_eth2api::EthBeaconNodeApiClient` + `extensions::{fetch_genesis_time, fetch_slots_config}` — `crates/eth2api/src/` -- `pluto_app::eth2wrap::valcache::{ValidatorCache, ActiveValidators}` — `crates/app/src/eth2wrap/valcache.rs` -- `pluto_core::deadline::{Deadliner, DeadlinerTask}` — `crates/core/src/deadline/mod.rs` (consumer side; scheduler feeds it) -- `pluto_cluster::lock::Lock` — for sourcing the initial `pubkeys: Vec` -- `tokio_util::sync::CancellationToken` — shutdown -- `vise` — metrics -- Testcontainer pattern in `crates/eth2api/src/integration.rs` — beacon node fixture for tests - -## Out of scope (deferred) - -- **SSE listener** that calls `handle_chain_reorg_event` — the method is implemented, but the SSE source isn't wired in this PR (no SSE infra in Pluto yet). -- **`featureset` system** — not needed since we always enable reorg handling. -- **`schedSlotFunc`** — Charon test-only hook; reproduce via test-side closure injection if needed, otherwise drop. - -## Verification - -1. `cargo +nightly fmt --all --check` -2. `cargo clippy --workspace --all-targets --all-features -- -D warnings` -3. `cargo test --workspace --all-features` — ensures the `DutyDefinitionSet` re-key doesn't break anything else and scheduler unit tests pass. -4. 
Scheduler integration tests via testcontainer beacon node: `cargo test -p pluto-core scheduler::` — verify duty resolution against a real BN matches expected slot/validator counts. -5. Manual end-to-end: wire the scheduler in `crates/app/src/lib.rs` against a devnet beacon node, subscribe a logging callback, and confirm `DutyAttester` / `DutyProposer` events fire at the expected intra-slot offsets across two epochs. From aa16bc67c3df8311768e53d2cf85c8bf0efe7d33 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Mon, 1 Jun 2026 13:53:35 -0300 Subject: [PATCH 35/48] Add missing docs --- crates/core/src/types.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/crates/core/src/types.rs b/crates/core/src/types.rs index 7f818a6f..a3fb270a 100644 --- a/crates/core/src/types.rs +++ b/crates/core/src/types.rs @@ -1,5 +1,3 @@ -#![allow(missing_docs)] - //! Types for the Charon core. use std::{any::Any, collections::HashMap, fmt::Display, iter}; @@ -428,10 +426,14 @@ impl AsRef<[u8]> for PubKey { } } +/// Attestation duties to be performed by validators for a particular epoch. #[derive(Debug, Clone, PartialEq)] pub struct AttesterDutyDefinition { + /// The validator's BLS public key pub pubkey: PubKey, + /// Index of validator in validator registry pub v_idx: u64, + /// The slot at which the validator must attest. pub slot: SlotNumber, inner: pluto_eth2api::types::GetAttesterDutiesResponseResponseDatum, @@ -462,10 +464,14 @@ impl TryInto } } +/// Indicates that a validator must propose a block in a given epoch #[derive(Debug, Clone, PartialEq)] pub struct ProposerDutyDefinition { + /// The validator's BLS public key pub pubkey: PubKey, + /// Index of validator in validator registry. pub v_idx: u64, + /// The slot at which the validator must propose a block. 
pub slot: SlotNumber, inner: pluto_eth2api::types::GetProposerDutiesResponseResponseDatum, @@ -496,10 +502,14 @@ impl TryInto } } +/// Sync committee duties for a particular epoch #[derive(Debug, Clone, PartialEq)] pub struct SyncCommitteeDutyDefinition { + /// The validator's BLS public key pub pubkey: PubKey, + /// Index of validator in validator registry. pub validator_index: u64, + /// The indices of the validator in the sync committee. pub validator_sync_committee_indices: Vec, inner: pluto_eth2api::types::GetSyncCommitteeDutiesResponseResponseDatum, @@ -537,13 +547,19 @@ impl TryInto } } +/// All duty definitions for a validator in a given epoch. #[derive(Debug, Clone, PartialEq)] pub enum DutyDefinition { + /// Attester duty definition. Attester(AttesterDutyDefinition), + /// Proposer duty definition. Proposer(ProposerDutyDefinition), + /// Sync committee duty definition. SyncCommittee(SyncCommitteeDutyDefinition), } +/// A set of duty definitions for all validators in a given epoch, indexed by +/// public key. pub type DutyDefinitionSet = HashMap; /// Unsigned data type From 04acd467d508e34a48b18446e17a0cf781e8fa9a Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Mon, 1 Jun 2026 13:54:12 -0300 Subject: [PATCH 36/48] Remove inner fields - Not needed in practice, and they are large structs that would be expensive to clone. --- crates/core/src/types.rs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/crates/core/src/types.rs b/crates/core/src/types.rs index a3fb270a..d35ff2c0 100644 --- a/crates/core/src/types.rs +++ b/crates/core/src/types.rs @@ -435,8 +435,6 @@ pub struct AttesterDutyDefinition { pub v_idx: u64, /// The slot at which the validator must attest. 
pub slot: SlotNumber, - - inner: pluto_eth2api::types::GetAttesterDutiesResponseResponseDatum, } impl TryInto @@ -459,7 +457,6 @@ impl TryInto pubkey, v_idx, slot, - inner: self, }) } } @@ -473,8 +470,6 @@ pub struct ProposerDutyDefinition { pub v_idx: u64, /// The slot at which the validator must propose a block. pub slot: SlotNumber, - - inner: pluto_eth2api::types::GetProposerDutiesResponseResponseDatum, } impl TryInto @@ -497,7 +492,6 @@ impl TryInto pubkey, v_idx, slot, - inner: self, }) } } @@ -511,8 +505,6 @@ pub struct SyncCommitteeDutyDefinition { pub validator_index: u64, /// The indices of the validator in the sync committee. pub validator_sync_committee_indices: Vec, - - inner: pluto_eth2api::types::GetSyncCommitteeDutiesResponseResponseDatum, } impl TryInto @@ -542,7 +534,6 @@ impl TryInto pubkey, validator_index, validator_sync_committee_indices, - inner: self, }) } } From 36fc4bb9f54dc6a7d979f615f81e220f31863b48 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Mon, 1 Jun 2026 14:03:28 -0300 Subject: [PATCH 37/48] Rename definitions --- crates/core/src/scheduler.rs | 68 ++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index f191dc24..2cb32664 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -90,17 +90,17 @@ type Result = std::result::Result; /// Allows setting up subscriptions for slot and duty events, as well as /// well as setting up a source of chain reorg events. /// -/// The Scheduler can be started by calling [`Builder::build`]. -pub struct Builder { +/// The Scheduler can be started by calling [`SchedulerBuilder::build`]. +pub struct SchedulerBuilder { slot_broadcast: sync::broadcast::Sender, duty_broadcast: sync::broadcast::Sender<(types::Duty, types::DutyDefinitionSet)>, reorg_rx: sync::mpsc::Receiver, } -impl Builder { - /// Construct a default [`Builder`] with no chain reorg handling. 
+impl SchedulerBuilder { + /// Construct a default [`SchedulerBuilder`] with no chain reorg handling. pub fn new() -> Self { - Builder { + SchedulerBuilder { slot_broadcast: sync::broadcast::channel(100).0, duty_broadcast: sync::broadcast::channel(100).0, reorg_rx: sync::mpsc::channel(100).1, // A channel that never receives @@ -186,13 +186,13 @@ impl Builder { /// Listeners for duties and slots should be registered before calling this /// function. /// - /// The returned [`Handle`] can be used to query the scheduler for duty - /// definitions. + /// The returned [`SchedulerHandle`] can be used to query the scheduler for + /// duty definitions. pub async fn build( self, client: client::EthBeaconNodeApiClient, ct: CancellationToken, - ) -> Result { + ) -> Result { wait_chain_start(&client) .with_cancellation_token(&ct) .await @@ -204,7 +204,7 @@ impl Builder { let slot_rx = new_slot_ticker(&client.clone(), ct.clone()).await?; - let actor = Actor { + let actor = SchedulerActor { client: client.clone(), // TODO: Figure out what to pass as `pub_keys`. // In Charon, these are not used (dead code) @@ -219,19 +219,19 @@ impl Builder { }; let (msg_tx, msg_rx) = sync::mpsc::channel(100); - let handle = Handle { sender: msg_tx }; + let handle = SchedulerHandle { sender: msg_tx }; tokio::spawn(actor.run(slot_rx, msg_rx, self.reorg_rx, ct)); Ok(handle) } } -impl Default for Builder { +impl Default for SchedulerBuilder { fn default() -> Self { Self::new() } } -enum Message { +enum SchedulerMessage { GetDutyDefinition { duty: types::Duty, resp: sync::oneshot::Sender>, @@ -243,18 +243,18 @@ enum Message { /// Cloning the handle is cheap and allows sending messages to the actor from /// multiple tasks. #[derive(Clone)] -pub struct Handle { - sender: sync::mpsc::Sender, +pub struct SchedulerHandle { + sender: sync::mpsc::Sender, } -impl Handle { +impl SchedulerHandle { /// Returns the definition for a duty if a definition exists for a resolved /// epoch. 
/// /// NOTE: this operation has a default timeout of 100 ms. pub async fn get_duty_definition(&self, duty: types::Duty) -> Result { let (tx, rx) = sync::oneshot::channel(); - let msg = Message::GetDutyDefinition { duty, resp: tx }; + let msg = SchedulerMessage::GetDutyDefinition { duty, resp: tx }; self.sender .send(msg) @@ -270,7 +270,7 @@ impl Handle { } } -struct Actor { +struct SchedulerActor { client: client::EthBeaconNodeApiClient, valcache: valcache::ValidatorCache, @@ -282,11 +282,11 @@ struct Actor { duties_by_epoch: HashMap>, } -impl Actor { +impl SchedulerActor { async fn run( mut self, mut slot_rx: sync::mpsc::Receiver, - mut msg_rx: sync::mpsc::Receiver, + mut msg_rx: sync::mpsc::Receiver, mut reorg_rx: sync::mpsc::Receiver, ct: CancellationToken, ) { @@ -301,7 +301,7 @@ impl Actor { }, Some(msg) = msg_rx.recv() => match msg { - Message::GetDutyDefinition { duty, resp } => { + SchedulerMessage::GetDutyDefinition { duty, resp } => { let result = self.get_duty_definition(duty).await; let _ = resp.send(result); }, @@ -1056,7 +1056,7 @@ mod tests { /// The `ValidatorSetA` validators as `/states/head/validators`. /// /// NOTE: the default mock only serves this endpoint over GET, but - /// `valcache::get_by_head` queries it over POST. + /// [`valcache::ValidatorCache::get_by_head`] queries it over POST. fn validator_set_a_datums() -> Vec { ValidatorSet::validator_set_a() .validators() @@ -1071,7 +1071,7 @@ mod tests { } /// `ValidatorSetA` validators with their real indexes but random pubkeys, - /// to force the `InvalidDutyPubkey` mismatch path + /// to force the [`SchedulerError::InvalidDutyPubkey`] mismatch path fn validator_set_a_mismatched() -> Vec { ValidatorSet::validator_set_a() .validators() @@ -1084,7 +1084,7 @@ mod tests { } /// Mounts the POST `/states/head/validators` endpoint used by - /// `valcache::get_by_head`. + /// [`valcache::ValidatorCache::get_by_head`]. 
async fn mount_head_validators( mock: &BeaconMock, data: Vec, @@ -1102,11 +1102,11 @@ mod tests { .await; } - /// Builds an initial `Actor` wired to the mock's client. No epoch resolved - /// yet. - fn test_actor(mock: &BeaconMock) -> Actor { + /// Builds an initial [`SchedulerActor`] wired to the mock's client. No + /// epoch resolved yet. + fn test_actor(mock: &BeaconMock) -> SchedulerActor { let client = mock.client().clone(); - Actor { + SchedulerActor { client: client.clone(), valcache: valcache::ValidatorCache::new(client, Vec::new()), slot_broadcast: sync::broadcast::channel(100).0, @@ -1117,7 +1117,7 @@ mod tests { } } - /// A `Slot` dated far in the past so `delay_slot_offset` deadlines have + /// A [`types::Slot`] dated far in the past so `delay_slot_offset` deadlines have /// already elapsed and duty broadcasts fire immediately. fn test_past_slot(slot: u64, slots_per_epoch: u64) -> types::Slot { types::Slot { @@ -1143,23 +1143,23 @@ mod tests { } /// Drives the actor's `run` loop with test-controlled channels. 
- struct Harness { + struct TestHarness { slot_tx: sync::mpsc::Sender, reorg_tx: sync::mpsc::Sender, - handle: Handle, + handle: SchedulerHandle, slot_sub: sync::broadcast::Receiver, duty_sub: sync::broadcast::Receiver<(types::Duty, types::DutyDefinitionSet)>, ct: CancellationToken, } - fn spawn_actor(mock: &BeaconMock) -> Harness { + fn spawn_actor(mock: &BeaconMock) -> TestHarness { let client = mock.client().clone(); let slot_broadcast = sync::broadcast::channel(100).0; let duty_broadcast = sync::broadcast::channel(100).0; let slot_sub = slot_broadcast.subscribe(); let duty_sub = duty_broadcast.subscribe(); - let actor = Actor { + let actor = SchedulerActor { client: client.clone(), valcache: valcache::ValidatorCache::new(client, Vec::new()), slot_broadcast, @@ -1176,10 +1176,10 @@ mod tests { tokio::spawn(actor.run(slot_rx, msg_rx, reorg_rx, ct.clone())); - Harness { + TestHarness { slot_tx, reorg_tx, - handle: Handle { sender: msg_tx }, + handle: SchedulerHandle { sender: msg_tx }, slot_sub, duty_sub, ct, From 7257c8e77e269df80a966cce68161e08ce0f2396 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Mon, 1 Jun 2026 14:07:21 -0300 Subject: [PATCH 38/48] Formatting --- crates/core/src/scheduler.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 2cb32664..d53d3054 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -1117,8 +1117,8 @@ mod tests { } } - /// A [`types::Slot`] dated far in the past so `delay_slot_offset` deadlines have - /// already elapsed and duty broadcasts fire immediately. + /// A [`types::Slot`] dated far in the past so `delay_slot_offset` deadlines + /// have already elapsed and duty broadcasts fire immediately. 
fn test_past_slot(slot: u64, slots_per_epoch: u64) -> types::Slot { types::Slot { slot: types::SlotNumber::new(slot), From b82ffb8df359b533c304574db60456f71f026848 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Tue, 2 Jun 2026 15:08:32 -0300 Subject: [PATCH 39/48] Revert `anyhow` change --- crates/core/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 3bdf78c4..06925d9c 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -38,9 +38,9 @@ pluto-featureset.workspace = true pluto-ssz.workspace = true ssz.workspace = true tree_hash.workspace = true -anyhow.workspace = true [dev-dependencies] +anyhow.workspace = true alloy.workspace = true clap.workspace = true rand.workspace = true From a5b8810a4c33b9fcb61efd7969717ff5f1428bc4 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Tue, 2 Jun 2026 15:22:46 -0300 Subject: [PATCH 40/48] Support async callbacks --- crates/core/src/scheduler.rs | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index d53d3054..b3917a36 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -108,11 +108,12 @@ impl SchedulerBuilder { } /// Subscribes a callback function for triggered slots. 
- pub fn subscribe_slot( - &mut self, - f: impl Fn(&types::Slot) -> Result<()> + Send + 'static, - label: impl AsRef + Send + 'static, - ) { + pub fn subscribe_slot(&mut self, f: F, label: impl AsRef + Send + 'static) + where + F: Fn(&types::Slot) -> Fut + Send + 'static, + Fut: std::future::Future> + Send + 'static, + E: std::error::Error + Send + Sync + 'static, + { let mut rx = self.slot_broadcast.subscribe(); // TODO: We might want to return a handle so clients can `.abort()` them to drop @@ -121,7 +122,7 @@ impl SchedulerBuilder { loop { match rx.recv().await { Ok(slot) => { - if let Err(err) = f(&slot) { + if let Err(err) = f(&slot).await { tracing::error!(err = ?err, slot = %slot.slot, label = label.as_ref(), "Emit scheduled slot event"); } } @@ -142,18 +143,19 @@ impl SchedulerBuilder { } /// Subscribes a callback function for triggered duties. - pub fn subscribe_duty( - &mut self, - f: impl Fn(&types::Duty, &types::DutyDefinitionSet) -> Result<()> + Send + 'static, - label: impl AsRef + Send + 'static, - ) { + pub fn subscribe_duty(&mut self, f: F, label: impl AsRef + Send + 'static) + where + F: Fn(&types::Duty, &types::DutyDefinitionSet) -> Fut + Send + 'static, + Fut: std::future::Future> + Send + 'static, + E: std::error::Error + Send + Sync + 'static, + { let mut rx = self.duty_broadcast.subscribe(); tokio::spawn(async move { loop { match rx.recv().await { Ok((duty, set)) => { - if let Err(err) = f(&duty, &set) { + if let Err(err) = f(&duty, &set).await { tracing::error!(err = ?err, label = label.as_ref(), "Trigger duty subscriber error"); } } From 670c9f2d050dac48d4b0b4b61facb075ee07b954 Mon Sep 17 00:00:00 2001 From: Denis Kolodin Date: Fri, 29 May 2026 12:23:17 +0200 Subject: [PATCH 41/48] Split p2p relay --- crates/p2p/src/relay.rs | 1594 ------------------------- crates/p2p/src/relay/dial.rs | 169 +++ crates/p2p/src/relay/event.rs | 110 ++ crates/p2p/src/relay/manager.rs | 702 +++++++++++ crates/p2p/src/relay/manager/tests.rs | 639 ++++++++++ 
crates/p2p/src/relay/mod.rs | 31 + 6 files changed, 1651 insertions(+), 1594 deletions(-) delete mode 100644 crates/p2p/src/relay.rs create mode 100644 crates/p2p/src/relay/dial.rs create mode 100644 crates/p2p/src/relay/event.rs create mode 100644 crates/p2p/src/relay/manager.rs create mode 100644 crates/p2p/src/relay/manager/tests.rs create mode 100644 crates/p2p/src/relay/mod.rs diff --git a/crates/p2p/src/relay.rs b/crates/p2p/src/relay.rs deleted file mode 100644 index 97d424a9..00000000 --- a/crates/p2p/src/relay.rs +++ /dev/null @@ -1,1594 +0,0 @@ -//! Relay reservation and cluster-peer routing. -//! -//! [`RelayManager`] is a libp2p [`NetworkBehaviour`] with three -//! responsibilities: -//! -//! 1. Subscribe to [`MutablePeer`] watch channels to receive relay address -//! updates as they're discovered. -//! 2. Manage each relay's reservation lifecycle (`Dialing → Established → -//! Reserved`) and redial with exponential backoff when transport connections -//! drop. -//! 3. Route known cluster peers through reserved relay circuits so peer-to-peer -//! traffic can traverse NATs that would otherwise block direct dials. - -use std::{ - collections::{HashMap, HashSet, VecDeque}, - convert::Infallible, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; - -use crate::{ - p2p_context::P2PContext, - peer::{MutablePeer, Peer}, -}; -use futures::{Stream, stream::StreamExt}; -use libp2p::{ - Multiaddr, PeerId, - core::{Endpoint, transport::PortUse}, - multiaddr::Protocol as MaProtocol, - swarm::{ - ConnectionDenied, ConnectionId, DialError, FromSwarm, NetworkBehaviour, THandler, - THandlerInEvent, ToSwarm, dial_opts::DialOpts, dummy, - }, -}; -use tokio::time::{Instant, Sleep, sleep_until}; -use tokio_stream::wrappers::WatchStream; - -/// Initial backoff delay before the first reconnect attempt. Matches Charon's -/// `DefaultConfig.BaseDelay`. -const RELAY_BACKOFF_BASE: Duration = Duration::from_secs(1); -/// Maximum backoff delay between reconnect attempts. 
Matches Charon's -/// `DefaultConfig.MaxDelay`. -const RELAY_BACKOFF_MAX: Duration = Duration::from_secs(120); -/// Jitter factor applied to backoff delays. Matches Charon's -/// `DefaultConfig.Jitter`. -const RELAY_BACKOFF_JITTER: f64 = 0.2; - -/// How long a relay may stay in `Established` (transport connected, no -/// reservation yet) before the watchdog force-closes the transport so a fresh -/// dial campaign can recover. Mirrors Charon's "no relay connection, -/// reconnecting" path (`charon/p2p/relay.go:73-92`). -const ESTABLISHED_STUCK_THRESHOLD: Duration = Duration::from_secs(60); -/// How often the watchdog re-evaluates stuck-in-Established relays. -const ESTABLISHED_WATCHDOG_TICK: Duration = Duration::from_secs(15); - -/// Libp2p [`NetworkBehaviour`] that reserves circuits on a configured set of -/// relays and routes known cluster peers through them. See the module-level -/// docs for the full responsibility breakdown. -pub struct RelayManager { - /// Events to emit to the swarm - events: VecDeque>, - - /// Streams of relay peer updates. Each stream yields the current value on - /// first poll, so initial peers are picked up automatically without a - /// separate bootstrap pass. - relay_subs: Vec>>, - - /// Dial states for each relay. - dial_states: HashMap, - - /// Connection states for each relay. - connection_states: HashMap, - - /// Latest known transport addresses for each relay. Persists across the - /// connection lifecycle so we can redial after `ConnectionClosed` without - /// waiting for another `MutablePeer` update. - relay_addrs: HashMap>, - - /// Tracks when each relay last entered `Established` without having since - /// reached `Reserved`. The watchdog uses this to identify relays whose - /// reservation never confirmed (or whose refresh was denied so libp2p's - /// relay client silently gave up) and force-close them so we redial fresh. - established_at: HashMap, - - /// Watchdog tick. 
Fires every `ESTABLISHED_WATCHDOG_TICK`; on fire we walk - /// `established_at` and emit `ToSwarm::CloseConnection` for any relay - /// stuck beyond `ESTABLISHED_STUCK_THRESHOLD`. Lazily initialised on the - /// first `poll` so `RelayManager::new` can be called outside a Tokio - /// runtime (e.g. in unit tests that exercise pure helpers). - watchdog: Option>>, - - /// Shared P2P context used to enumerate known cluster peers when routing - /// them through reserved relays. - p2p_context: P2PContext, -} - -/// Events emitted by [`RelayManager`] to the swarm. -/// -/// Mirrors the relay lifecycle (`Dialing → Established → Reserved`) plus the -/// outcomes of routing known cluster peers through reserved circuits. Consumers -/// can observe the full progression of a reservation, or pick out just the -/// events they care about (e.g. `RelayReserved` for "circuits are usable now"). -#[derive(Debug)] -pub enum RelayManagerEvent { - /// Transport connection to a relay is up. A circuit listener has been - /// requested but the reservation is not yet confirmed. - RelayConnected(PeerId), - /// Relay accepted the reservation; circuits through this relay are now - /// usable for routing cluster peers. - RelayReserved(PeerId), - /// Circuit listener for this relay expired; the relay has been demoted to - /// `Established`. libp2p's circuit client typically refreshes the - /// reservation shortly, which will re-emit `RelayReserved`. - RelayReservationLost(PeerId), - /// Last transport connection to the relay closed. A re-dial campaign with - /// exponential backoff has been queued. - RelayDisconnected(PeerId), - /// A cluster peer has been reached through one of the reserved relay - /// circuits. From here libp2p owns the connection; this event exists for - /// telemetry only. - PeerRoutedConnected(PeerId), - /// A dial attempt failed. The underlying [`RelayDialState`] self-rearms - /// with exponential backoff, so consumers don't need to take any action. 
- DialFailed { - /// Target peer id (a relay server, or a routed cluster peer). - peer_id: PeerId, - /// Whether this dial was targeting a relay or a routed peer. - target: RelayDialType, - /// Number of attempts so far (including this one). - retry_count: u32, - /// Categorised dial error. - error: RelayDialError, - }, -} - -/// Categorised dial error surfaced via [`RelayManagerEvent::DialFailed`]. -/// -/// Translated from libp2p's [`DialError`] so consumers can match on variants -/// without depending on libp2p's swarm types directly. Free-form details are -/// preserved as strings on the variants where they carry diagnostic value. -#[derive(Debug, Clone, thiserror::Error)] -pub enum RelayDialError { - /// Attempted to dial our own peer id. - #[error("local peer id")] - LocalPeerId, - /// No transport addresses were available for the target. - #[error("no addresses")] - NoAddresses, - /// Dial was skipped because of a peer condition (already - /// connected/dialing). - #[error("dial skipped: peer condition not met")] - Skipped, - /// Pending connection attempt was aborted (e.g. swarm shutdown, or a newer - /// dial superseded it). - #[error("aborted")] - Aborted, - /// Connected, but the remote reported a peer id different from the - /// expected one. - #[error("wrong peer id")] - WrongPeerId, - /// Connection was denied by a behaviour or upgrade step. - #[error("denied: {0}")] - Denied(String), - /// All transport attempts failed; details preserved as `addr: err`, - /// joined by `; `. - #[error("transport: {0}")] - Transport(String), -} - -impl From<&DialError> for RelayDialError { - fn from(err: &DialError) -> Self { - match err { - DialError::LocalPeerId { .. } => Self::LocalPeerId, - DialError::NoAddresses => Self::NoAddresses, - DialError::DialPeerConditionFalse(_) => Self::Skipped, - DialError::Aborted => Self::Aborted, - DialError::WrongPeerId { .. 
} => Self::WrongPeerId, - DialError::Denied { cause } => Self::Denied(cause.to_string()), - DialError::Transport(errors) => Self::Transport( - errors - .iter() - .map(|(addr, e)| format!("{addr}: {e}")) - .collect::>() - .join("; "), - ), - } - } -} - -/// Whether a [`RelayDialState`] is targeting a relay server or a cluster peer -/// reached through reserved relay circuits. -#[derive(Debug, Clone, Copy)] -pub enum RelayDialType { - /// Dial a known cluster peer via reserved relay circuits. - ClusterPeer, - /// Dial a relay server directly. - RelayServer, -} - -/// State of an in-flight dial campaign, polled to produce a `ToSwarm::Dial` -/// event each time its backoff elapses. -struct RelayDialState { - /// Kind of target this campaign is dialing. - ty: RelayDialType, - /// Target peer id for the dial. - peer_id: PeerId, - /// Transport (for `RelayServer`) or circuit (for `ClusterPeer`) addresses - /// to try. - addrs: Vec, - /// Number of dial attempts so far, used to compute the next backoff. - retry_count: u32, - /// Sleeps until the next dial is due. Boxed-and-pinned so the struct stays - /// `Unpin` and can be stored in a `HashMap`; the inner `Sleep` is `!Unpin`. - sleep: Pin>, -} - -impl RelayDialState { - /// Creates a fresh dial state armed to fire after the base backoff. - fn new(ty: RelayDialType, peer_id: PeerId, addrs: Vec) -> Self { - Self { - ty, - peer_id, - addrs, - retry_count: 0, - sleep: Box::pin(sleep_until(Instant::now())), - } - } -} - -/// Lifecycle of a relay reservation. -/// -/// - `Dialing`: a [`RelayDialState`] is in flight; no transport connection to -/// the relay yet. -/// - `Established`: transport connection to the relay is up; the swarm has been -/// asked to listen on the circuit address(es) but no reservation has been -/// confirmed yet. -/// - `Reserved`: the swarm has emitted `NewListenAddr` for the circuit address, -/// meaning the relay accepted our reservation and we can route peers through -/// it. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum RelayConnectionState { - /// Dial campaign in flight; no transport connection to the relay yet. - Dialing, - /// Transport connection up; reservation not yet confirmed. - Established, - /// Reservation confirmed; circuits through this relay are usable. - Reserved, -} - -impl Stream for RelayDialState { - type Item = ToSwarm; - - /// Drives the dial schedule. Yields a `Dial` event when the next attempt - /// is due, then self-rearms with an exponential backoff so subsequent - /// `poll_next` calls produce later retries. The stream never terminates. - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - std::task::ready!(self.sleep.as_mut().poll(cx)); - - let next_delay = backoff_delay(self.retry_count); - self.retry_count = self.retry_count.saturating_add(1); - let next_deadline = Instant::now() - .checked_add(next_delay) - .unwrap_or_else(Instant::now); - self.sleep.as_mut().reset(next_deadline); - - let opts = DialOpts::peer_id(self.peer_id) - .condition(libp2p::swarm::dial_opts::PeerCondition::DisconnectedAndNotDialing) - .addresses(self.addrs.clone()) - .build(); - - Poll::Ready(Some(ToSwarm::Dial { opts })) - } -} - -/// Returns true if both slices contain the same multiaddrs (order-independent). -/// Used to decide whether a routing refresh actually expanded the available -/// circuit paths to a peer — if it did, the dial state's backoff is reset. -fn addr_sets_equal(a: &[Multiaddr], b: &[Multiaddr]) -> bool { - if a.len() != b.len() { - return false; - } - let a_set: HashSet<&Multiaddr> = a.iter().collect(); - b.iter().all(|x| a_set.contains(x)) -} - -/// Exponential backoff delay for a given retry count. -/// -/// Mirrors Charon's `expbackoff.DefaultConfig`: base=1s, multiplier=1.6, -/// jitter=0.2, max=120s. `retry_count == 0` returns the base delay with no -/// jitter, matching Go's early-return path. 
For `retry_count > 0`, ±20% -/// jitter is applied after capping so nodes don't retry in lockstep. -fn backoff_delay(retry_count: u32) -> Duration { - if retry_count == 0 { - return RELAY_BACKOFF_BASE; - } - let mut delay = RELAY_BACKOFF_BASE.as_secs_f64(); - let max = RELAY_BACKOFF_MAX.as_secs_f64(); - for _ in 0..retry_count { - delay *= 1.6; - if delay >= max { - delay = max; - break; - } - } - let rand_val = rand::random::(); - delay *= 1.0 + RELAY_BACKOFF_JITTER * (rand_val * 2.0 - 1.0); - if delay < 0.0 { - return Duration::ZERO; - } - Duration::from_secs_f64(delay) -} - -impl RelayManager { - /// Creates a new relay manager: reserves circuits on the supplied relays - /// and routes known cluster peers through them. - pub fn new(mutable_peers: Vec, p2p_context: P2PContext) -> Self { - let relay_subs = mutable_peers - .iter() - .map(|mp| WatchStream::new(mp.subscribe())) - .collect(); - - Self { - events: VecDeque::new(), - relay_subs, - dial_states: HashMap::new(), - connection_states: HashMap::new(), - relay_addrs: HashMap::new(), - established_at: HashMap::new(), - watchdog: None, - p2p_context, - } - } - - /// Builds circuit listen addresses for a relay from its transport - /// addresses: `/ip4/.../tcp/.../p2p//p2p-circuit`. - fn circuit_addrs(relay_id: PeerId, addrs: &[Multiaddr]) -> Vec { - addrs - .iter() - .map(|addr| { - let mut circuit: Multiaddr = addr - .iter() - .filter(|p| !matches!(p, MaProtocol::P2p(_))) - .collect(); - circuit.push(MaProtocol::P2p(relay_id)); - circuit.push(MaProtocol::P2pCircuit); - circuit - }) - .collect() - } - - /// Extracts the relay peer id from a circuit listen address of the form - /// `/.../p2p//p2p-circuit`. Returns `None` if the address is not - /// a relay circuit address. 
- fn relay_id_from_circuit_addr(addr: &Multiaddr) -> Option { - let mut last_p2p: Option = None; - for proto in addr.iter() { - match proto { - MaProtocol::P2p(id) => last_p2p = Some(id), - MaProtocol::P2pCircuit => return last_p2p, - _ => {} - } - } - None - } - - /// Applies a relay address update from a [`MutablePeer`]: refreshes - /// tracked addresses and, if this is the first time we've seen this - /// relay, kicks off a new dial campaign. - fn queue_relay_update(&mut self, relay: Peer) { - self.relay_addrs.insert(relay.id, relay.addresses.clone()); - - // In-flight dial campaign: refresh its address list without resetting - // the backoff schedule. - if let Some(dial_state) = self.dial_states.get_mut(&relay.id) { - dial_state.addrs = relay.addresses; - return; - } - - // Already connected (Established or Reserved): nothing to do now; - // `relay_addrs` is updated and the next disconnect will pick it up. - if self.connection_states.contains_key(&relay.id) { - return; - } - - // First time we see this relay: start the dial campaign. - self.dial_states.insert( - relay.id, - RelayDialState::new(RelayDialType::RelayServer, relay.id, relay.addresses), - ); - self.set_relay_state(relay.id, RelayConnectionState::Dialing); - } - - /// Updates the connection state for a relay, logging the transition and - /// maintaining the `established_at` watchdog timestamp. - fn set_relay_state(&mut self, relay_id: PeerId, next: RelayConnectionState) { - let prev = self.connection_states.insert(relay_id, next); - if prev != Some(next) { - tracing::debug!( - relay_peer_id = %relay_id, - ?prev, - ?next, - "Relay connection state transition" - ); - } - match next { - // Entering or refreshing the no-reservation-yet state: start (or - // restart, on demote from Reserved) the stuck-Established timer. 
- RelayConnectionState::Established => { - if prev != Some(RelayConnectionState::Established) { - self.established_at.insert(relay_id, Instant::now()); - } - } - // Promoted to Reserved or back to Dialing: the relay isn't stuck - // in Established anymore, so clear its watchdog timestamp. - RelayConnectionState::Reserved | RelayConnectionState::Dialing => { - self.established_at.remove(&relay_id); - } - } - } - - /// Polls every active dial state once, queuing a `ToSwarm::Dial` event for - /// any whose backoff has elapsed. Wakers for the remaining (pending) ones - /// are registered via the underlying `Sleep` futures. - fn process_relay_dials(&mut self, cx: &mut Context<'_>) { - for state in self.dial_states.values_mut() { - if let Poll::Ready(Some(event)) = state.poll_next_unpin(cx) { - self.events.push_back(event); - } - } - } - - /// Watchdog for relays stuck in `Established`. - /// - /// Libp2p's relay client owns reservation refresh; if a relay denies a - /// refresh (overloaded, quota exhausted, version mismatch), the client - /// typically gives up silently — no further `NewListenAddr` is emitted - /// and the transport stays up, so `on_connection_closed` never fires. - /// Without intervention the relay would stay in `Established` forever. - /// - /// On each tick, any relay that has been `Established` for longer than - /// [`ESTABLISHED_STUCK_THRESHOLD`] gets a `ToSwarm::CloseConnection`; the - /// resulting `FromSwarm::ConnectionClosed` drives `on_connection_closed` - /// → `redial_relay`, mirroring Charon's "no relay connection, - /// reconnecting" recovery path (`charon/p2p/relay.go:73-92`). 
- fn process_established_watchdog(&mut self, cx: &mut Context<'_>) { - let watchdog = self.watchdog.get_or_insert_with(|| { - let deadline = Instant::now() - .checked_add(ESTABLISHED_WATCHDOG_TICK) - .unwrap_or_else(Instant::now); - Box::pin(sleep_until(deadline)) - }); - if watchdog.as_mut().poll(cx).is_pending() { - return; - } - - let now = Instant::now(); - let stuck: Vec = self - .established_at - .iter() - .filter(|(id, since)| { - now.saturating_duration_since(**since) >= ESTABLISHED_STUCK_THRESHOLD - && matches!( - self.connection_states.get(id), - Some(RelayConnectionState::Established) - ) - }) - .map(|(id, _)| *id) - .collect(); - - for relay_id in stuck { - tracing::warn!( - relay_peer_id = %relay_id, - threshold = ?ESTABLISHED_STUCK_THRESHOLD, - "Relay stuck in Established without reservation; force-closing for redial" - ); - // Clear the timestamp so we don't re-fire CloseConnection on the - // next tick while ConnectionClosed is in flight; on_connection_closed - // will eventually transition us back to Dialing. - self.established_at.remove(&relay_id); - self.events.push_back(ToSwarm::CloseConnection { - peer_id: relay_id, - connection: libp2p::swarm::CloseConnection::All, - }); - } - - let next_deadline = now - .checked_add(ESTABLISHED_WATCHDOG_TICK) - .unwrap_or_else(Instant::now); - // Watchdog is Some by construction inside this function — we just - // initialised or polled it above. - if let Some(watchdog) = self.watchdog.as_mut() { - watchdog.as_mut().reset(next_deadline); - } - } - - /// Returns the peer ids of relays whose circuit reservation has been - /// confirmed (i.e. swarm has issued `NewListenAddr` for the circuit). - fn reserved_relay_ids(&self) -> Vec { - self.connection_states - .iter() - .filter(|(_, s)| matches!(s, RelayConnectionState::Reserved)) - .map(|(id, _)| *id) - .collect() - } - - /// Builds circuit dial addresses for reaching `target` through every - /// currently reserved relay: - /// `/.../p2p//p2p-circuit/p2p/`. 
- fn peer_circuit_addrs(&self, target: &PeerId) -> Vec { - let mut addrs = Vec::new(); - for relay_id in self.reserved_relay_ids() { - let Some(relay_addrs) = self.relay_addrs.get(&relay_id) else { - continue; - }; - for relay_addr in relay_addrs { - let mut circuit: Multiaddr = relay_addr - .iter() - .filter(|p| !matches!(p, MaProtocol::P2p(_))) - .collect(); - circuit.push(MaProtocol::P2p(relay_id)); - circuit.push(MaProtocol::P2pCircuit); - circuit.push(MaProtocol::P2p(*target)); - addrs.push(circuit); - } - } - addrs - } - - /// Ensures every known cluster peer (≠ self) has a dial state armed to - /// reach it through the current set of reserved relays. - fn route_known_peers(&mut self) { - let local = self.p2p_context.local_peer_id(); - let targets: Vec = self - .p2p_context - .known_peers() - .iter() - .copied() - .filter(|id| Some(*id) != local) - .collect(); - - for target in targets { - self.upsert_peer_dial(target); - } - } - - /// Inserts or refreshes a dial state for `target` using the current circuit - /// addrs. - /// - /// If the address set changed (or there was no dial state yet) the backoff - /// schedule is reset so the new route is tried immediately. If the address - /// set is unchanged, the existing dial state is left alone — its backoff - /// schedule survives so we don't hammer peers that have been unreachable - /// just because re-routing was re-evaluated. If no reserved relay can - /// currently reach `target`, any pre-existing dial state is removed so we - /// don't keep firing `Dial` events at circuits through unreserved relays. 
- fn upsert_peer_dial(&mut self, target: PeerId) { - let addrs = self.peer_circuit_addrs(&target); - if addrs.is_empty() { - self.dial_states.remove(&target); - return; - } - - if let Some(existing) = self.dial_states.get(&target) - && addr_sets_equal(&existing.addrs, &addrs) - { - return; - } - - self.dial_states.insert( - target, - RelayDialState::new(RelayDialType::ClusterPeer, target, addrs), - ); - } - - /// Re-evaluates every active cluster-peer dial state against the current - /// set of reserved relays. Called when a relay leaves `Reserved` so that - /// peer dial campaigns stop self-rearming through circuits that no longer - /// exist. - fn refresh_peer_dials(&mut self) { - let peer_targets: Vec = self - .dial_states - .iter() - .filter(|(_, s)| matches!(s.ty, RelayDialType::ClusterPeer)) - .map(|(id, _)| *id) - .collect(); - for target in peer_targets { - self.upsert_peer_dial(target); - } - } - - /// Reacts to a new transport connection on a peer we previously dialed. - /// Relay dials transition into `Established` and queue circuit listeners; - /// peer routing dials just drop their dial state — libp2p takes it from - /// here. 
- fn on_connection_established(&mut self, peer_id: PeerId) { - let Some(dial_state) = self.dial_states.remove(&peer_id) else { - return; - }; - - match dial_state.ty { - RelayDialType::RelayServer => { - self.events - .push_back(ToSwarm::GenerateEvent(RelayManagerEvent::RelayConnected( - peer_id, - ))); - self.set_relay_state(peer_id, RelayConnectionState::Established); - - for circuit_addr in Self::circuit_addrs(peer_id, &dial_state.addrs) { - tracing::debug!( - relay_peer_id = %peer_id, - %circuit_addr, - "Requesting circuit listener on relay" - ); - self.events.push_back(ToSwarm::ListenOn { - opts: libp2p::swarm::ListenOpts::new(circuit_addr), - }); - } - } - RelayDialType::ClusterPeer => { - tracing::debug!( - peer_id = %peer_id, - "Routed peer connection established" - ); - self.events.push_back(ToSwarm::GenerateEvent( - RelayManagerEvent::PeerRoutedConnected(peer_id), - )); - } - } - } - - /// Reacts to a new listen address. If it's a circuit address for one of - /// our relays, promotes that relay's state to `Reserved` and re-routes - /// known peers through the updated set of reserved relays. - fn on_new_listen_addr(&mut self, addr: &Multiaddr) { - let Some(relay_id) = Self::relay_id_from_circuit_addr(addr) else { - return; - }; - let Some(state) = self.connection_states.get(&relay_id).copied() else { - return; - }; - match state { - RelayConnectionState::Dialing => { - tracing::warn!( - relay_peer_id = %relay_id, - listen_addr = %addr, - "NewListenAddr for relay in Dialing state; ignoring" - ); - } - RelayConnectionState::Reserved => { - // Second circuit address from the same relay — already routed. 
- } - RelayConnectionState::Established => { - tracing::info!( - relay_peer_id = %relay_id, - listen_addr = %addr, - "Relay reservation confirmed; routing known peers via this relay" - ); - self.set_relay_state(relay_id, RelayConnectionState::Reserved); - self.events - .push_back(ToSwarm::GenerateEvent(RelayManagerEvent::RelayReserved( - relay_id, - ))); - self.route_known_peers(); - } - } - } - - /// Reacts to a circuit listen address expiring. If the relay was in - /// `Reserved`, demote it to `Established` so we stop routing peers through - /// it. libp2p's circuit-client will normally refresh the reservation and - /// emit `NewListenAddr` again, which promotes us back. If the transport - /// connection also drops, `on_connection_closed` will handle the redial. - fn on_expired_listen_addr(&mut self, addr: &Multiaddr) { - let Some(relay_id) = Self::relay_id_from_circuit_addr(addr) else { - return; - }; - let Some(state) = self.connection_states.get(&relay_id).copied() else { - return; - }; - if matches!(state, RelayConnectionState::Reserved) { - tracing::info!( - relay_peer_id = %relay_id, - listen_addr = %addr, - "Relay circuit listener expired; demoting to Established" - ); - self.set_relay_state(relay_id, RelayConnectionState::Established); - self.events.push_back(ToSwarm::GenerateEvent( - RelayManagerEvent::RelayReservationLost(relay_id), - )); - // The reserved-relay set just shrank: drop or refresh any peer - // dial campaigns routed through this relay so they don't keep - // self-rearming through dead circuits. - self.refresh_peer_dials(); - } - } - - /// Reacts to the last connection to `peer_id` closing. Either it's one of - /// our relays (queue a fresh re-dial cycle) or a known cluster peer - /// (arm a fresh routing dial through the current reserved relays). - /// Anything else is ignored. 
- /// - /// If the relay was previously in `Reserved`, `RelayReservationLost` is - /// emitted before `RelayDisconnected` so subscribers see the reservation - /// tear down explicitly, and the peer routing campaigns through this - /// relay are refreshed to drop now-dead circuits. - fn on_connection_closed(&mut self, peer_id: PeerId) { - if let Some(prev_state) = self.connection_states.get(&peer_id).copied() { - let was_reserved = matches!(prev_state, RelayConnectionState::Reserved); - if was_reserved { - self.events.push_back(ToSwarm::GenerateEvent( - RelayManagerEvent::RelayReservationLost(peer_id), - )); - } - self.events.push_back(ToSwarm::GenerateEvent( - RelayManagerEvent::RelayDisconnected(peer_id), - )); - self.redial_relay(peer_id); - if was_reserved { - self.refresh_peer_dials(); - } - } else if self.p2p_context.is_known_peer(&peer_id) { - self.reroute_peer(peer_id); - } - } - - /// Reacts to a dial failure by logging and emitting a `DialFailed` event. - /// The underlying [`RelayDialState`] self-rearms with exponential backoff - /// on the next swarm poll, so by default no state change is needed here. - /// - /// One special case: `DialError::DialPeerConditionFalse` means libp2p - /// refused the dial because we're already connected to (or dialing) the - /// target. Behaviour depends on the dial type: - /// - /// - [`RelayDialType::ClusterPeer`]: libp2p owns the existing direct - /// connection. Drop the dial state and rely on - /// [`Self::on_connection_closed`] → [`Self::reroute_peer`] to re-arm the - /// dial once the existing connection actually closes. - /// - [`RelayDialType::RelayServer`]: dropping the dial state here would - /// wedge `connection_states` in `Dialing` forever — no - /// `on_connection_closed` will fire if libp2p already has the transport - /// connection, and `queue_relay_update` short-circuits while - /// `connection_states` has an entry. 
Instead leave the campaign armed; - /// backoff retries are cheap (libp2p re-rejects with the same error) and - /// `on_connection_established` will tear the dial state down once libp2p - /// surfaces the connection. - fn on_dial_failure(&mut self, peer_id: Option, error: &DialError) { - let Some(peer_id) = peer_id else { return }; - let Some(state) = self.dial_states.get(&peer_id) else { - return; - }; - let target = state.ty; - let retry_count = state.retry_count; - let skipped = matches!(error, DialError::DialPeerConditionFalse(_)); - - if skipped { - match target { - RelayDialType::ClusterPeer => { - tracing::debug!( - peer_id = %peer_id, - dial_type = ?target, - retry_count, - %error, - "Dial skipped (already connected or dialing); dropping dial state" - ); - self.dial_states.remove(&peer_id); - } - RelayDialType::RelayServer => { - tracing::debug!( - peer_id = %peer_id, - dial_type = ?target, - retry_count, - %error, - "Dial skipped for relay; keeping campaign armed for backoff retry" - ); - } - } - } else { - tracing::debug!( - peer_id = %peer_id, - dial_type = ?target, - retry_count, - %error, - "Dial failed, will retry with backoff" - ); - } - - self.events - .push_back(ToSwarm::GenerateEvent(RelayManagerEvent::DialFailed { - peer_id, - target, - retry_count, - error: RelayDialError::from(error), - })); - } - - /// Schedules a re-dial for a relay whose last connection just dropped. 
- fn redial_relay(&mut self, relay_id: PeerId) { - let Some(addrs) = self.relay_addrs.get(&relay_id).cloned() else { - tracing::warn!( - relay_peer_id = %relay_id, - "Relay closed but addresses no longer tracked; cannot redial" - ); - self.connection_states.remove(&relay_id); - return; - }; - tracing::debug!( - relay_peer_id = %relay_id, - "Relay connection closed, queuing re-dial with backoff" - ); - self.dial_states.insert( - relay_id, - RelayDialState::new(RelayDialType::RelayServer, relay_id, addrs), - ); - self.set_relay_state(relay_id, RelayConnectionState::Dialing); - } - - /// Arms a dial campaign for a known cluster peer whose last connection - /// just dropped, routing through all currently reserved relays. Delegates - /// to [`Self::upsert_peer_dial`] so that an existing dial state with the - /// same circuit addrs survives — its backoff schedule is preserved across - /// rapid disconnect/reconnect cycles when the route hasn't changed. No-op - /// if no relay is currently reserved. 
- fn reroute_peer(&mut self, peer_id: PeerId) { - tracing::debug!( - peer_id = %peer_id, - "Peer connection closed, re-routing via reserved relays" - ); - self.upsert_peer_dial(peer_id); - } -} - -impl NetworkBehaviour for RelayManager { - type ConnectionHandler = dummy::ConnectionHandler; - type ToSwarm = RelayManagerEvent; - - fn handle_established_inbound_connection( - &mut self, - _connection_id: ConnectionId, - _peer: PeerId, - _local_addr: &Multiaddr, - _remote_addr: &Multiaddr, - ) -> Result, ConnectionDenied> { - Ok(dummy::ConnectionHandler) - } - - fn handle_established_outbound_connection( - &mut self, - _connection_id: ConnectionId, - _peer: PeerId, - _addr: &Multiaddr, - _role_override: Endpoint, - _port_use: PortUse, - ) -> Result, ConnectionDenied> { - Ok(dummy::ConnectionHandler) - } - - fn on_swarm_event(&mut self, event: FromSwarm) { - match event { - FromSwarm::ConnectionEstablished(conn) => { - self.on_connection_established(conn.peer_id); - } - FromSwarm::NewListenAddr(ev) => { - self.on_new_listen_addr(ev.addr); - } - FromSwarm::ExpiredListenAddr(ev) => { - self.on_expired_listen_addr(ev.addr); - } - FromSwarm::ConnectionClosed(conn) if conn.remaining_established == 0 => { - self.on_connection_closed(conn.peer_id); - } - FromSwarm::DialFailure(ev) => { - self.on_dial_failure(ev.peer_id, ev.error); - } - _ => {} - } - } - - fn on_connection_handler_event( - &mut self, - _peer_id: libp2p::PeerId, - _connection_id: libp2p::swarm::ConnectionId, - _event: libp2p::swarm::THandlerOutEvent, - ) { - // No special handling needed for connection handler events - } - - fn poll( - &mut self, - cx: &mut Context<'_>, - ) -> std::task::Poll>> { - let mut updates: Vec = Vec::new(); - for stream in &mut self.relay_subs { - while let Poll::Ready(Some(Some(peer))) = stream.poll_next_unpin(cx) { - updates.push(peer); - } - } - for peer in updates { - self.queue_relay_update(peer); - } - - self.process_relay_dials(cx); - self.process_established_watchdog(cx); - - if 
let Some(event) = self.events.pop_front() { - return Poll::Ready(event); - } - - Poll::Pending - } -} - -#[cfg(test)] -mod tests { - use std::str::FromStr; - - use super::*; - - fn addr(s: &str) -> Multiaddr { - Multiaddr::from_str(s).expect("valid multiaddr") - } - - fn manager() -> RelayManager { - RelayManager::new(Vec::new(), P2PContext::new(Vec::::new())) - } - - // ---- circuit_addrs ------------------------------------------------- - - #[test] - fn circuit_addrs_strips_existing_p2p_and_appends_relay_suffix() { - let relay = PeerId::random(); - let transport = addr(&format!("/ip4/127.0.0.1/tcp/9000/p2p/{relay}")); - - let out = RelayManager::circuit_addrs(relay, &[transport]); - - let expected = addr(&format!("/ip4/127.0.0.1/tcp/9000/p2p/{relay}/p2p-circuit")); - assert_eq!(out, vec![expected]); - } - - #[test] - fn circuit_addrs_handles_addr_without_existing_p2p_component() { - let relay = PeerId::random(); - let transport = addr("/ip4/10.0.0.1/udp/9000/quic-v1"); - - let out = RelayManager::circuit_addrs(relay, &[transport]); - - let expected = addr(&format!( - "/ip4/10.0.0.1/udp/9000/quic-v1/p2p/{relay}/p2p-circuit" - )); - assert_eq!(out, vec![expected]); - } - - #[test] - fn circuit_addrs_preserves_input_order_for_multiple_addrs() { - let relay = PeerId::random(); - let other = PeerId::random(); - let inputs = vec![ - addr(&format!("/ip4/127.0.0.1/tcp/9000/p2p/{other}")), - addr("/ip4/10.0.0.1/udp/9000/quic-v1"), - ]; - - let out = RelayManager::circuit_addrs(relay, &inputs); - - assert_eq!( - out, - vec![ - addr(&format!("/ip4/127.0.0.1/tcp/9000/p2p/{relay}/p2p-circuit")), - addr(&format!( - "/ip4/10.0.0.1/udp/9000/quic-v1/p2p/{relay}/p2p-circuit" - )), - ] - ); - } - - #[test] - fn circuit_addrs_empty_input_yields_empty_output() { - let relay = PeerId::random(); - let out = RelayManager::circuit_addrs(relay, &[]); - assert!(out.is_empty()); - } - - // ---- relay_id_from_circuit_addr ----------------------------------- - - #[test] - fn 
relay_id_from_circuit_addr_extracts_last_p2p_before_circuit() { - let relay = PeerId::random(); - let circuit = addr(&format!("/ip4/127.0.0.1/tcp/9000/p2p/{relay}/p2p-circuit")); - - assert_eq!( - RelayManager::relay_id_from_circuit_addr(&circuit), - Some(relay) - ); - } - - #[test] - fn relay_id_from_circuit_addr_ignores_target_p2p_after_circuit() { - // Full circuit-dial form `/.../p2p//p2p-circuit/p2p/` - // must return the relay id (before `/p2p-circuit`), not the target. - let relay = PeerId::random(); - let target = PeerId::random(); - let circuit = addr(&format!( - "/ip4/127.0.0.1/tcp/9000/p2p/{relay}/p2p-circuit/p2p/{target}" - )); - - assert_eq!( - RelayManager::relay_id_from_circuit_addr(&circuit), - Some(relay) - ); - } - - #[test] - fn relay_id_from_circuit_addr_returns_none_when_no_circuit_component() { - let peer = PeerId::random(); - let plain = addr(&format!("/ip4/127.0.0.1/tcp/9000/p2p/{peer}")); - - assert_eq!(RelayManager::relay_id_from_circuit_addr(&plain), None); - } - - #[test] - fn relay_id_from_circuit_addr_returns_none_when_circuit_has_no_preceding_p2p() { - let bare = addr("/ip4/127.0.0.1/tcp/9000/p2p-circuit"); - assert_eq!(RelayManager::relay_id_from_circuit_addr(&bare), None); - } - - // ---- peer_circuit_addrs ------------------------------------------- - - #[test] - fn peer_circuit_addrs_returns_empty_when_no_relays_reserved() { - let mgr = manager(); - let target = PeerId::random(); - assert!(mgr.peer_circuit_addrs(&target).is_empty()); - } - - #[test] - fn peer_circuit_addrs_ignores_relays_in_dialing_or_established() { - let mut mgr = manager(); - let target = PeerId::random(); - let dialing = PeerId::random(); - let established = PeerId::random(); - - mgr.connection_states - .insert(dialing, RelayConnectionState::Dialing); - mgr.relay_addrs - .insert(dialing, vec![addr("/ip4/10.0.0.1/tcp/9000")]); - mgr.connection_states - .insert(established, RelayConnectionState::Established); - mgr.relay_addrs - .insert(established, 
vec![addr("/ip4/10.0.0.2/tcp/9000")]); - - assert!(mgr.peer_circuit_addrs(&target).is_empty()); - } - - #[test] - fn peer_circuit_addrs_skips_reserved_relay_without_tracked_addrs() { - let mut mgr = manager(); - let target = PeerId::random(); - let relay = PeerId::random(); - - mgr.connection_states - .insert(relay, RelayConnectionState::Reserved); - // No entry in relay_addrs: the relay is reserved but we have no - // transport addrs to build a circuit through it. - - assert!(mgr.peer_circuit_addrs(&target).is_empty()); - } - - #[test] - fn peer_circuit_addrs_builds_one_circuit_per_reserved_relay_addr() { - let mut mgr = manager(); - let target = PeerId::random(); - let relay = PeerId::random(); - - let relay_addrs = vec![ - // With and without trailing /p2p/ — both should produce the - // same canonical circuit form. - addr(&format!("/ip4/10.0.0.1/tcp/9000/p2p/{relay}")), - addr("/ip4/10.0.0.1/udp/9000/quic-v1"), - ]; - mgr.connection_states - .insert(relay, RelayConnectionState::Reserved); - mgr.relay_addrs.insert(relay, relay_addrs); - - let out = mgr.peer_circuit_addrs(&target); - - let expected = vec![ - addr(&format!( - "/ip4/10.0.0.1/tcp/9000/p2p/{relay}/p2p-circuit/p2p/{target}" - )), - addr(&format!( - "/ip4/10.0.0.1/udp/9000/quic-v1/p2p/{relay}/p2p-circuit/p2p/{target}" - )), - ]; - assert_eq!(out, expected); - } - - #[test] - fn peer_circuit_addrs_aggregates_across_multiple_reserved_relays() { - let mut mgr = manager(); - let target = PeerId::random(); - let relay_a = PeerId::random(); - let relay_b = PeerId::random(); - - mgr.connection_states - .insert(relay_a, RelayConnectionState::Reserved); - mgr.relay_addrs - .insert(relay_a, vec![addr("/ip4/10.0.0.1/tcp/9000")]); - mgr.connection_states - .insert(relay_b, RelayConnectionState::Reserved); - mgr.relay_addrs - .insert(relay_b, vec![addr("/ip4/10.0.0.2/tcp/9000")]); - - let out: HashSet = mgr.peer_circuit_addrs(&target).into_iter().collect(); - - let expected: HashSet = [ - addr(&format!( - 
"/ip4/10.0.0.1/tcp/9000/p2p/{relay_a}/p2p-circuit/p2p/{target}" - )), - addr(&format!( - "/ip4/10.0.0.2/tcp/9000/p2p/{relay_b}/p2p-circuit/p2p/{target}" - )), - ] - .into_iter() - .collect(); - assert_eq!(out, expected); - } - - // ---- backoff_delay ------------------------------------------------ - - #[test] - fn backoff_delay_retry_zero_returns_base_exactly() { - // Charon's early-return path: retry == 0 returns base with no jitter. - assert_eq!(backoff_delay(0), RELAY_BACKOFF_BASE); - } - - #[test] - fn backoff_delay_caps_at_max_with_jitter_bound() { - // 1.6^n grows past max well before retry == 50; we should be capped at - // max ± 20% jitter and never wander outside that envelope. - let max = RELAY_BACKOFF_MAX.as_secs_f64(); - let lower = max * (1.0 - RELAY_BACKOFF_JITTER); - let upper = max * (1.0 + RELAY_BACKOFF_JITTER); - for _ in 0..32 { - let d = backoff_delay(50).as_secs_f64(); - assert!( - d >= lower && d <= upper, - "delay {d}s outside jitter envelope [{lower}, {upper}]" - ); - } - } - - #[test] - fn backoff_delay_grows_then_plateaus() { - // Averaging out jitter, retry=1 should be larger than base and - // retry=10 should already be at the cap. 
- let mut sum_1 = 0.0; - let mut sum_10 = 0.0; - let samples = 64; - for _ in 0..samples { - sum_1 += backoff_delay(1).as_secs_f64(); - sum_10 += backoff_delay(10).as_secs_f64(); - } - let avg_1 = sum_1 / f64::from(samples); - let avg_10 = sum_10 / f64::from(samples); - assert!(avg_1 > RELAY_BACKOFF_BASE.as_secs_f64()); - assert!(avg_10 >= RELAY_BACKOFF_MAX.as_secs_f64() * (1.0 - RELAY_BACKOFF_JITTER)); - } - - // ---- queue_relay_update ------------------------------------------- - - fn relay_peer(id: PeerId, addrs: Vec) -> Peer { - Peer { - id, - addresses: addrs, - index: 0, - name: crate::name::peer_name(&id), - } - } - - #[tokio::test] - async fn queue_relay_update_first_seen_starts_dial_campaign() { - let mut mgr = manager(); - let relay_id = PeerId::random(); - let addrs = vec![addr("/ip4/10.0.0.1/tcp/9000")]; - - mgr.queue_relay_update(relay_peer(relay_id, addrs.clone())); - - assert!(mgr.dial_states.contains_key(&relay_id)); - assert_eq!( - mgr.connection_states.get(&relay_id), - Some(&RelayConnectionState::Dialing) - ); - assert_eq!(mgr.relay_addrs.get(&relay_id), Some(&addrs)); - } - - #[tokio::test] - async fn queue_relay_update_refreshes_inflight_addrs_without_resetting_backoff() { - let mut mgr = manager(); - let relay_id = PeerId::random(); - - mgr.queue_relay_update(relay_peer(relay_id, vec![addr("/ip4/10.0.0.1/tcp/9000")])); - // Pretend the dial state has already retried a few times. 
- mgr.dial_states.get_mut(&relay_id).unwrap().retry_count = 7; - - let new_addrs = vec![ - addr("/ip4/10.0.0.1/tcp/9000"), - addr("/ip4/10.0.0.2/tcp/9000"), - ]; - mgr.queue_relay_update(relay_peer(relay_id, new_addrs.clone())); - - let state = mgr.dial_states.get(&relay_id).unwrap(); - assert_eq!(state.addrs, new_addrs); - assert_eq!( - state.retry_count, 7, - "backoff schedule must survive refresh" - ); - assert_eq!(mgr.relay_addrs.get(&relay_id), Some(&new_addrs)); - } - - #[tokio::test] - async fn queue_relay_update_no_op_when_relay_already_connected() { - let mut mgr = manager(); - let relay_id = PeerId::random(); - mgr.connection_states - .insert(relay_id, RelayConnectionState::Reserved); - - let new_addrs = vec![addr("/ip4/10.0.0.99/tcp/9000")]; - mgr.queue_relay_update(relay_peer(relay_id, new_addrs.clone())); - - assert!( - !mgr.dial_states.contains_key(&relay_id), - "no dial campaign while connected" - ); - // Connection state untouched. - assert_eq!( - mgr.connection_states.get(&relay_id), - Some(&RelayConnectionState::Reserved) - ); - // relay_addrs still gets refreshed so we have the latest list ready - // for redial after a disconnect. - assert_eq!(mgr.relay_addrs.get(&relay_id), Some(&new_addrs)); - } - - // ---- state machine: on_connection_established ---------------------- - - #[tokio::test] - async fn on_connection_established_relay_promotes_to_established_and_queues_listen() { - let mut mgr = manager(); - let relay_id = PeerId::random(); - let relay_addrs = vec![addr("/ip4/10.0.0.1/tcp/9000")]; - - mgr.queue_relay_update(relay_peer(relay_id, relay_addrs.clone())); - mgr.events.clear(); - mgr.on_connection_established(relay_id); - - assert!(!mgr.dial_states.contains_key(&relay_id)); - assert_eq!( - mgr.connection_states.get(&relay_id), - Some(&RelayConnectionState::Established) - ); - let listen_count = mgr - .events - .iter() - .filter(|e| matches!(e, ToSwarm::ListenOn { .. 
})) - .count(); - assert_eq!(listen_count, relay_addrs.len()); - let relay_connected = mgr.events.iter().any(|e| { - matches!( - e, - ToSwarm::GenerateEvent(RelayManagerEvent::RelayConnected(id)) if *id == relay_id - ) - }); - assert!(relay_connected, "RelayConnected event must be emitted"); - } - - #[tokio::test] - async fn on_connection_established_cluster_peer_drops_dial_state() { - let mut mgr = manager(); - let target = PeerId::random(); - // Seed a peer-routing dial state (skipping upsert which requires - // reserved relays). - mgr.dial_states.insert( - target, - RelayDialState::new( - RelayDialType::ClusterPeer, - target, - vec![addr("/ip4/10.0.0.1/tcp/9000/p2p-circuit")], - ), - ); - - mgr.on_connection_established(target); - - assert!(!mgr.dial_states.contains_key(&target)); - let routed = mgr.events.iter().any(|e| { - matches!( - e, - ToSwarm::GenerateEvent(RelayManagerEvent::PeerRoutedConnected(id)) if *id == target - ) - }); - assert!(routed, "PeerRoutedConnected event must be emitted"); - } - - // ---- state machine: on_new_listen_addr ----------------------------- - - #[tokio::test] - async fn on_new_listen_addr_promotes_established_to_reserved() { - let mut mgr = manager(); - let relay_id = PeerId::random(); - mgr.connection_states - .insert(relay_id, RelayConnectionState::Established); - mgr.relay_addrs - .insert(relay_id, vec![addr("/ip4/10.0.0.1/tcp/9000")]); - - let circuit = addr(&format!( - "/ip4/10.0.0.1/tcp/9000/p2p/{relay_id}/p2p-circuit" - )); - mgr.on_new_listen_addr(&circuit); - - assert_eq!( - mgr.connection_states.get(&relay_id), - Some(&RelayConnectionState::Reserved) - ); - let reserved = mgr.events.iter().any(|e| { - matches!( - e, - ToSwarm::GenerateEvent(RelayManagerEvent::RelayReserved(id)) if *id == relay_id - ) - }); - assert!(reserved); - } - - // ---- state machine: on_expired_listen_addr ------------------------- - - #[tokio::test] - async fn on_expired_listen_addr_demotes_reserved_and_emits_reservation_lost() { - let mut mgr 
= manager(); - let relay_id = PeerId::random(); - mgr.connection_states - .insert(relay_id, RelayConnectionState::Reserved); - mgr.relay_addrs - .insert(relay_id, vec![addr("/ip4/10.0.0.1/tcp/9000")]); - - let circuit = addr(&format!( - "/ip4/10.0.0.1/tcp/9000/p2p/{relay_id}/p2p-circuit" - )); - mgr.on_expired_listen_addr(&circuit); - - assert_eq!( - mgr.connection_states.get(&relay_id), - Some(&RelayConnectionState::Established) - ); - let lost = mgr.events.iter().any(|e| { - matches!( - e, - ToSwarm::GenerateEvent(RelayManagerEvent::RelayReservationLost(id)) - if *id == relay_id - ) - }); - assert!(lost, "RelayReservationLost must be emitted on demote"); - } - - #[tokio::test] - async fn on_expired_listen_addr_drops_peer_dials_with_no_route_left() { - let mut mgr = manager(); - let relay_id = PeerId::random(); - let target = PeerId::random(); - - // Single reserved relay supporting a peer-routing dial. - mgr.connection_states - .insert(relay_id, RelayConnectionState::Reserved); - mgr.relay_addrs - .insert(relay_id, vec![addr("/ip4/10.0.0.1/tcp/9000")]); - mgr.dial_states.insert( - target, - RelayDialState::new( - RelayDialType::ClusterPeer, - target, - vec![addr(&format!( - "/ip4/10.0.0.1/tcp/9000/p2p/{relay_id}/p2p-circuit/p2p/{target}" - ))], - ), - ); - - let circuit = addr(&format!( - "/ip4/10.0.0.1/tcp/9000/p2p/{relay_id}/p2p-circuit" - )); - mgr.on_expired_listen_addr(&circuit); - - assert!( - !mgr.dial_states.contains_key(&target), - "peer dial state must be dropped once no reserved relay can route to it" - ); - } - - // ---- state machine: on_connection_closed --------------------------- - - #[tokio::test] - async fn on_connection_closed_reserved_relay_emits_lost_before_disconnected() { - let mut mgr = manager(); - let relay_id = PeerId::random(); - mgr.connection_states - .insert(relay_id, RelayConnectionState::Reserved); - mgr.relay_addrs - .insert(relay_id, vec![addr("/ip4/10.0.0.1/tcp/9000")]); - - mgr.on_connection_closed(relay_id); - - let lost_idx 
= mgr.events.iter().position(|e| { - matches!( - e, - ToSwarm::GenerateEvent(RelayManagerEvent::RelayReservationLost(id)) - if *id == relay_id - ) - }); - let disc_idx = mgr.events.iter().position(|e| { - matches!( - e, - ToSwarm::GenerateEvent(RelayManagerEvent::RelayDisconnected(id)) if *id == relay_id - ) - }); - let lost = lost_idx.expect("RelayReservationLost must fire when prev state was Reserved"); - let disc = disc_idx.expect("RelayDisconnected must fire on relay close"); - assert!(lost < disc, "ReservationLost must precede Disconnected"); - assert_eq!( - mgr.connection_states.get(&relay_id), - Some(&RelayConnectionState::Dialing), - "redial campaign must arm" - ); - assert!(mgr.dial_states.contains_key(&relay_id)); - } - - #[tokio::test] - async fn on_connection_closed_established_relay_skips_reservation_lost() { - let mut mgr = manager(); - let relay_id = PeerId::random(); - mgr.connection_states - .insert(relay_id, RelayConnectionState::Established); - mgr.relay_addrs - .insert(relay_id, vec![addr("/ip4/10.0.0.1/tcp/9000")]); - - mgr.on_connection_closed(relay_id); - - let lost = mgr.events.iter().any(|e| { - matches!( - e, - ToSwarm::GenerateEvent(RelayManagerEvent::RelayReservationLost(_)) - ) - }); - assert!( - !lost, - "no ReservationLost event when prev state wasn't Reserved" - ); - } - - // ---- on_dial_failure: Skipped path -------------------------------- - - fn skipped_dial_error() -> DialError { - DialError::DialPeerConditionFalse( - libp2p::swarm::dial_opts::PeerCondition::DisconnectedAndNotDialing, - ) - } - - #[tokio::test] - async fn on_dial_failure_skipped_cluster_peer_drops_dial_state() { - let mut mgr = manager(); - let target = PeerId::random(); - mgr.dial_states.insert( - target, - RelayDialState::new( - RelayDialType::ClusterPeer, - target, - vec![addr("/ip4/10.0.0.1/tcp/9000")], - ), - ); - - mgr.on_dial_failure(Some(target), &skipped_dial_error()); - - assert!( - !mgr.dial_states.contains_key(&target), - "cluster-peer dial state 
must be dropped on Skipped" - ); - } - - #[tokio::test] - async fn on_dial_failure_skipped_relay_keeps_dial_state() { - // Regression for the wedge bug: keep the campaign armed so backoff - // continues to retry until libp2p surfaces the connection state. - let mut mgr = manager(); - let relay_id = PeerId::random(); - mgr.connection_states - .insert(relay_id, RelayConnectionState::Dialing); - mgr.dial_states.insert( - relay_id, - RelayDialState::new( - RelayDialType::RelayServer, - relay_id, - vec![addr("/ip4/10.0.0.1/tcp/9000")], - ), - ); - - mgr.on_dial_failure(Some(relay_id), &skipped_dial_error()); - - assert!( - mgr.dial_states.contains_key(&relay_id), - "relay dial state must survive Skipped so backoff can retry" - ); - assert_eq!( - mgr.connection_states.get(&relay_id), - Some(&RelayConnectionState::Dialing), - "connection state must still be Dialing" - ); - } - - // ---- upsert_peer_dial --------------------------------------------- - - #[tokio::test] - async fn upsert_peer_dial_preserves_backoff_when_addrs_unchanged() { - let mut mgr = manager(); - let target = PeerId::random(); - let relay = PeerId::random(); - mgr.connection_states - .insert(relay, RelayConnectionState::Reserved); - mgr.relay_addrs - .insert(relay, vec![addr("/ip4/10.0.0.1/tcp/9000")]); - - mgr.upsert_peer_dial(target); - let inserted_count = mgr.dial_states.get(&target).map(|s| s.retry_count); - // Pretend the dial has retried. 
- if let Some(s) = mgr.dial_states.get_mut(&target) { - s.retry_count = 5; - } - mgr.upsert_peer_dial(target); - let after = mgr.dial_states.get(&target).map(|s| s.retry_count); - assert_eq!(inserted_count, Some(0)); - assert_eq!( - after, - Some(5), - "addr-set unchanged: existing dial state (and its backoff) must be preserved" - ); - } - - #[tokio::test] - async fn upsert_peer_dial_resets_backoff_when_addrs_change() { - let mut mgr = manager(); - let target = PeerId::random(); - let relay_a = PeerId::random(); - let relay_b = PeerId::random(); - mgr.connection_states - .insert(relay_a, RelayConnectionState::Reserved); - mgr.relay_addrs - .insert(relay_a, vec![addr("/ip4/10.0.0.1/tcp/9000")]); - - mgr.upsert_peer_dial(target); - if let Some(s) = mgr.dial_states.get_mut(&target) { - s.retry_count = 5; - } - - // Reserve a second relay → new circuit addr → addr-set changes. - mgr.connection_states - .insert(relay_b, RelayConnectionState::Reserved); - mgr.relay_addrs - .insert(relay_b, vec![addr("/ip4/10.0.0.2/tcp/9000")]); - mgr.upsert_peer_dial(target); - - assert_eq!( - mgr.dial_states.get(&target).map(|s| s.retry_count), - Some(0), - "addr-set changed: dial state (and backoff) must be replaced" - ); - } - - #[tokio::test] - async fn upsert_peer_dial_drops_stale_state_when_no_route_left() { - let mut mgr = manager(); - let target = PeerId::random(); - let relay = PeerId::random(); - mgr.connection_states - .insert(relay, RelayConnectionState::Reserved); - mgr.relay_addrs - .insert(relay, vec![addr("/ip4/10.0.0.1/tcp/9000")]); - - mgr.upsert_peer_dial(target); - assert!(mgr.dial_states.contains_key(&target)); - - // Demote the only reserved relay → no circuit addrs left. 
- mgr.connection_states - .insert(relay, RelayConnectionState::Established); - mgr.upsert_peer_dial(target); - - assert!( - !mgr.dial_states.contains_key(&target), - "no reserved relay can reach target: stale dial state must be dropped" - ); - } -} diff --git a/crates/p2p/src/relay/dial.rs b/crates/p2p/src/relay/dial.rs new file mode 100644 index 00000000..155d5f5d --- /dev/null +++ b/crates/p2p/src/relay/dial.rs @@ -0,0 +1,169 @@ +//! Dial-campaign machinery: backoff scheduling and the per-target dial state. +//! +//! A [`RelayDialState`] is a [`Stream`] that yields a `ToSwarm::Dial` each time +//! its exponential backoff elapses, so the swarm re-dials a relay (or a routed +//! cluster peer) until it connects. + +use std::{ + collections::HashSet, + convert::Infallible, + pin::Pin, + task::{Context, Poll}, + time::Duration, +}; + +use futures::Stream; +use libp2p::{ + Multiaddr, PeerId, + swarm::{ToSwarm, dial_opts::DialOpts}, +}; +use tokio::time::{Instant, Sleep, sleep_until}; + +use super::event::{RelayDialType, RelayManagerEvent}; + +/// Initial backoff delay before the first reconnect attempt. Matches Charon's +/// `DefaultConfig.BaseDelay`. +const RELAY_BACKOFF_BASE: Duration = Duration::from_secs(1); +/// Maximum backoff delay between reconnect attempts. Matches Charon's +/// `DefaultConfig.MaxDelay`. +const RELAY_BACKOFF_MAX: Duration = Duration::from_secs(120); +/// Jitter factor applied to backoff delays. Matches Charon's +/// `DefaultConfig.Jitter`. +const RELAY_BACKOFF_JITTER: f64 = 0.2; + +/// State of an in-flight dial campaign, polled to produce a `ToSwarm::Dial` +/// event each time its backoff elapses. +pub(super) struct RelayDialState { + /// Kind of target this campaign is dialing. + pub(super) ty: RelayDialType, + /// Target peer id for the dial. + pub(super) peer_id: PeerId, + /// Transport (for `RelayServer`) or circuit (for `ClusterPeer`) addresses + /// to try. 
+ pub(super) addrs: Vec, + /// Number of dial attempts so far, used to compute the next backoff. + pub(super) retry_count: u32, + /// Sleeps until the next dial is due. Boxed-and-pinned so the struct stays + /// `Unpin` and can be stored in a `HashMap`; the inner `Sleep` is `!Unpin`. + sleep: Pin>, +} + +impl RelayDialState { + /// Creates a fresh dial state armed to fire its first dial immediately. + pub(super) fn new(ty: RelayDialType, peer_id: PeerId, addrs: Vec) -> Self { + Self { + ty, + peer_id, + addrs, + retry_count: 0, + sleep: Box::pin(sleep_until(Instant::now())), + } + } +} + +impl Stream for RelayDialState { + type Item = ToSwarm; + + /// Drives the dial schedule. Yields a `Dial` event when the next attempt + /// is due, then self-rearms with an exponential backoff so subsequent + /// `poll_next` calls produce later retries. The stream never terminates. + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + std::task::ready!(self.sleep.as_mut().poll(cx)); + + let next_delay = backoff_delay(self.retry_count); + self.retry_count = self.retry_count.saturating_add(1); + let next_deadline = Instant::now() + .checked_add(next_delay) + .unwrap_or_else(Instant::now); + self.sleep.as_mut().reset(next_deadline); + + let opts = DialOpts::peer_id(self.peer_id) + .condition(libp2p::swarm::dial_opts::PeerCondition::DisconnectedAndNotDialing) + .addresses(self.addrs.clone()) + .build(); + + Poll::Ready(Some(ToSwarm::Dial { opts })) + } +} + +/// Returns true if both slices contain the same multiaddrs (order-independent). +/// Used to decide whether a routing refresh actually changed the available +/// circuit paths to a peer — if it did, the dial state's backoff is reset. +pub(super) fn addr_sets_equal(a: &[Multiaddr], b: &[Multiaddr]) -> bool { + if a.len() != b.len() { + return false; + } + let a_set: HashSet<&Multiaddr> = a.iter().collect(); + b.iter().all(|x| a_set.contains(x)) +} + +/// Exponential backoff delay for a given retry count.
+/// +/// Mirrors Charon's `expbackoff.DefaultConfig`: base=1s, multiplier=1.6, +/// jitter=0.2, max=120s. `retry_count == 0` returns the base delay with no +/// jitter, matching Go's early-return path. For `retry_count > 0`, ±20% +/// jitter is applied after capping so nodes don't retry in lockstep. +fn backoff_delay(retry_count: u32) -> Duration { + if retry_count == 0 { + return RELAY_BACKOFF_BASE; + } + let mut delay = RELAY_BACKOFF_BASE.as_secs_f64(); + let max = RELAY_BACKOFF_MAX.as_secs_f64(); + for _ in 0..retry_count { + delay *= 1.6; + if delay >= max { + delay = max; + break; + } + } + let rand_val = rand::random::(); + delay *= 1.0 + RELAY_BACKOFF_JITTER * (rand_val * 2.0 - 1.0); + if delay < 0.0 { + return Duration::ZERO; + } + Duration::from_secs_f64(delay) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn backoff_delay_retry_zero_returns_base_exactly() { + // Charon's early-return path: retry == 0 returns base with no jitter. + assert_eq!(backoff_delay(0), RELAY_BACKOFF_BASE); + } + + #[test] + fn backoff_delay_caps_at_max_with_jitter_bound() { + // 1.6^n grows past max well before retry == 50; we should be capped at + // max ± 20% jitter and never wander outside that envelope. + let max = RELAY_BACKOFF_MAX.as_secs_f64(); + let lower = max * (1.0 - RELAY_BACKOFF_JITTER); + let upper = max * (1.0 + RELAY_BACKOFF_JITTER); + for _ in 0..32 { + let d = backoff_delay(50).as_secs_f64(); + assert!( + d >= lower && d <= upper, + "delay {d}s outside jitter envelope [{lower}, {upper}]" + ); + } + } + + #[test] + fn backoff_delay_grows_then_plateaus() { + // Averaging out jitter, retry=1 should be larger than base and + // retry=10 should already be at the cap. 
+ let mut sum_1 = 0.0; + let mut sum_10 = 0.0; + let samples = 64; + for _ in 0..samples { + sum_1 += backoff_delay(1).as_secs_f64(); + sum_10 += backoff_delay(10).as_secs_f64(); + } + let avg_1 = sum_1 / f64::from(samples); + let avg_10 = sum_10 / f64::from(samples); + assert!(avg_1 > RELAY_BACKOFF_BASE.as_secs_f64()); + assert!(avg_10 >= RELAY_BACKOFF_MAX.as_secs_f64() * (1.0 - RELAY_BACKOFF_JITTER)); + } +} diff --git a/crates/p2p/src/relay/event.rs b/crates/p2p/src/relay/event.rs new file mode 100644 index 00000000..e5648ce8 --- /dev/null +++ b/crates/p2p/src/relay/event.rs @@ -0,0 +1,110 @@ +//! Public event and error types emitted by [`RelayManager`]. +//! +//! [`RelayManager`]: super::RelayManager + +use libp2p::{PeerId, swarm::DialError}; + +/// Events emitted by [`RelayManager`] to the swarm. +/// +/// Mirrors the relay lifecycle (`Dialing → Established → Reserved`) plus the +/// outcomes of routing known cluster peers through reserved circuits. Consumers +/// can observe the full progression of a reservation, or pick out just the +/// events they care about (e.g. `RelayReserved` for "circuits are usable now"). +/// +/// [`RelayManager`]: super::RelayManager +#[derive(Debug)] +pub enum RelayManagerEvent { + /// Transport connection to a relay is up. A circuit listener has been + /// requested but the reservation is not yet confirmed. + RelayConnected(PeerId), + /// Relay accepted the reservation; circuits through this relay are now + /// usable for routing cluster peers. + RelayReserved(PeerId), + /// Circuit listener for this relay expired; the relay has been demoted to + /// `Established`. libp2p's circuit client typically refreshes the + /// reservation shortly, which will re-emit `RelayReserved`. + RelayReservationLost(PeerId), + /// Last transport connection to the relay closed. A re-dial campaign with + /// exponential backoff has been queued. 
+ RelayDisconnected(PeerId), + /// A cluster peer has been reached through one of the reserved relay + /// circuits. From here libp2p owns the connection; this event exists for + /// telemetry only. + PeerRoutedConnected(PeerId), + /// A dial attempt failed. The underlying `RelayDialState` self-rearms + /// with exponential backoff, so consumers don't need to take any action. + DialFailed { + /// Target peer id (a relay server, or a routed cluster peer). + peer_id: PeerId, + /// Whether this dial was targeting a relay or a routed peer. + target: RelayDialType, + /// Number of attempts so far (including this one). + retry_count: u32, + /// Categorised dial error. + error: RelayDialError, + }, +} + +/// Categorised dial error surfaced via [`RelayManagerEvent::DialFailed`]. +/// +/// Translated from libp2p's [`DialError`] so consumers can match on variants +/// without depending on libp2p's swarm types directly. Free-form details are +/// preserved as strings on the variants where they carry diagnostic value. +#[derive(Debug, Clone, thiserror::Error)] +pub enum RelayDialError { + /// Attempted to dial our own peer id. + #[error("local peer id")] + LocalPeerId, + /// No transport addresses were available for the target. + #[error("no addresses")] + NoAddresses, + /// Dial was skipped because of a peer condition (already + /// connected/dialing). + #[error("dial skipped: peer condition not met")] + Skipped, + /// Pending connection attempt was aborted (e.g. swarm shutdown, or a newer + /// dial superseded it). + #[error("aborted")] + Aborted, + /// Connected, but the remote reported a peer id different from the + /// expected one. + #[error("wrong peer id")] + WrongPeerId, + /// Connection was denied by a behaviour or upgrade step. + #[error("denied: {0}")] + Denied(String), + /// All transport attempts failed; details preserved as `addr: err`, + /// joined by `; `. 
+ #[error("transport: {0}")] + Transport(String), +} + +impl From<&DialError> for RelayDialError { + fn from(err: &DialError) -> Self { + match err { + DialError::LocalPeerId { .. } => Self::LocalPeerId, + DialError::NoAddresses => Self::NoAddresses, + DialError::DialPeerConditionFalse(_) => Self::Skipped, + DialError::Aborted => Self::Aborted, + DialError::WrongPeerId { .. } => Self::WrongPeerId, + DialError::Denied { cause } => Self::Denied(cause.to_string()), + DialError::Transport(errors) => Self::Transport( + errors + .iter() + .map(|(addr, e)| format!("{addr}: {e}")) + .collect::>() + .join("; "), + ), + } + } +} + +/// Whether a `RelayDialState` is targeting a relay server or a cluster peer +/// reached through reserved relay circuits. +#[derive(Debug, Clone, Copy)] +pub enum RelayDialType { + /// Dial a known cluster peer via reserved relay circuits. + ClusterPeer, + /// Dial a relay server directly. + RelayServer, +} diff --git a/crates/p2p/src/relay/manager.rs b/crates/p2p/src/relay/manager.rs new file mode 100644 index 00000000..8011e0b4 --- /dev/null +++ b/crates/p2p/src/relay/manager.rs @@ -0,0 +1,702 @@ +//! The [`RelayManager`] behaviour: reservation lifecycle and peer routing. 
+ +use std::{ + collections::{HashMap, VecDeque}, + convert::Infallible, + pin::Pin, + task::{Context, Poll}, + time::Duration, +}; + +use futures::stream::StreamExt; +use libp2p::{ + Multiaddr, PeerId, + core::{Endpoint, transport::PortUse}, + multiaddr::Protocol as MaProtocol, + swarm::{ + ConnectionDenied, ConnectionId, DialError, FromSwarm, NetworkBehaviour, THandler, + THandlerInEvent, ToSwarm, dummy, + }, +}; +use tokio::time::{Instant, Sleep, sleep_until}; +use tokio_stream::wrappers::WatchStream; + +use super::{ + dial::{RelayDialState, addr_sets_equal}, + event::{RelayDialError, RelayDialType, RelayManagerEvent}, +}; +use crate::{ + p2p_context::P2PContext, + peer::{MutablePeer, Peer}, +}; + +#[cfg(test)] +mod tests; + +/// How long a relay may stay in `Established` (transport connected, no +/// reservation yet) before the watchdog force-closes the transport so a fresh +/// dial campaign can recover. Mirrors Charon's "no relay connection, +/// reconnecting" path (`charon/p2p/relay.go:73-92`). +const ESTABLISHED_STUCK_THRESHOLD: Duration = Duration::from_secs(60); +/// How often the watchdog re-evaluates stuck-in-Established relays. +const ESTABLISHED_WATCHDOG_TICK: Duration = Duration::from_secs(15); + +/// Lifecycle of a relay reservation. +/// +/// - `Dialing`: a `RelayDialState` is in flight; no transport connection to the +/// relay yet. +/// - `Established`: transport connection to the relay is up; the swarm has been +/// asked to listen on the circuit address(es) but no reservation has been +/// confirmed yet. +/// - `Reserved`: the swarm has emitted `NewListenAddr` for the circuit address, +/// meaning the relay accepted our reservation and we can route peers through +/// it. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RelayConnectionState { + /// Dial campaign in flight; no transport connection to the relay yet. + Dialing, + /// Transport connection up; reservation not yet confirmed. 
+ Established, + /// Reservation confirmed; circuits through this relay are usable. + Reserved, +} + +/// Libp2p [`NetworkBehaviour`] that reserves circuits on a configured set of +/// relays and routes known cluster peers through them. See the module-level +/// docs for the full responsibility breakdown. +pub struct RelayManager { + /// Events to emit to the swarm + events: VecDeque>, + + /// Streams of relay peer updates. Each stream yields the current value on + /// first poll, so initial peers are picked up automatically without a + /// separate bootstrap pass. + relay_subs: Vec>>, + + /// Dial states for relays and routed cluster peers, keyed by target peer. + dial_states: HashMap, + + /// Connection states for each relay. + connection_states: HashMap, + + /// Latest known transport addresses for each relay. Persists across the + /// connection lifecycle so we can redial after `ConnectionClosed` without + /// waiting for another `MutablePeer` update. + relay_addrs: HashMap>, + + /// Tracks when each relay last entered `Established` without having since + /// reached `Reserved`. The watchdog uses this to identify relays whose + /// reservation never confirmed (or whose refresh was denied so libp2p's + /// relay client silently gave up) and force-close them so we redial fresh. + established_at: HashMap, + + /// Watchdog tick. Fires every `ESTABLISHED_WATCHDOG_TICK`; on fire we walk + /// `established_at` and emit `ToSwarm::CloseConnection` for any relay + /// stuck beyond `ESTABLISHED_STUCK_THRESHOLD`. Lazily initialised on the + /// first `poll` so `RelayManager::new` can be called outside a Tokio + /// runtime (e.g. in unit tests that exercise pure helpers). + watchdog: Option>>, + + /// Shared P2P context used to enumerate known cluster peers when routing + /// them through reserved relays. + p2p_context: P2PContext, +} + +impl RelayManager { + /// Creates a new relay manager: reserves circuits on the supplied relays + /// and routes known cluster peers through them.
+ pub fn new(mutable_peers: Vec, p2p_context: P2PContext) -> Self { + let relay_subs = mutable_peers + .iter() + .map(|mp| WatchStream::new(mp.subscribe())) + .collect(); + + Self { + events: VecDeque::new(), + relay_subs, + dial_states: HashMap::new(), + connection_states: HashMap::new(), + relay_addrs: HashMap::new(), + established_at: HashMap::new(), + watchdog: None, + p2p_context, + } + } + + /// Builds circuit listen addresses for a relay from its transport + /// addresses: `/ip4/.../tcp/.../p2p//p2p-circuit`. + fn circuit_addrs(relay_id: PeerId, addrs: &[Multiaddr]) -> Vec { + addrs + .iter() + .map(|addr| { + let mut circuit: Multiaddr = addr + .iter() + .filter(|p| !matches!(p, MaProtocol::P2p(_))) + .collect(); + circuit.push(MaProtocol::P2p(relay_id)); + circuit.push(MaProtocol::P2pCircuit); + circuit + }) + .collect() + } + + /// Extracts the relay peer id from a circuit listen address of the form + /// `/.../p2p//p2p-circuit`. Returns `None` if the address is not + /// a relay circuit address. + fn relay_id_from_circuit_addr(addr: &Multiaddr) -> Option { + let mut last_p2p: Option = None; + for proto in addr.iter() { + match proto { + MaProtocol::P2p(id) => last_p2p = Some(id), + MaProtocol::P2pCircuit => return last_p2p, + _ => {} + } + } + None + } + + /// Applies a relay address update from a [`MutablePeer`]: refreshes + /// tracked addresses and, if this is the first time we've seen this + /// relay, kicks off a new dial campaign. + fn queue_relay_update(&mut self, relay: Peer) { + self.relay_addrs.insert(relay.id, relay.addresses.clone()); + + // In-flight dial campaign: refresh its address list without resetting + // the backoff schedule. + if let Some(dial_state) = self.dial_states.get_mut(&relay.id) { + dial_state.addrs = relay.addresses; + return; + } + + // Already connected (Established or Reserved): nothing to do now; + // `relay_addrs` is updated and the next disconnect will pick it up. 
+ if self.connection_states.contains_key(&relay.id) { + return; + } + + // First time we see this relay: start the dial campaign. + self.dial_states.insert( + relay.id, + RelayDialState::new(RelayDialType::RelayServer, relay.id, relay.addresses), + ); + self.set_relay_state(relay.id, RelayConnectionState::Dialing); + } + + /// Updates the connection state for a relay, logging the transition and + /// maintaining the `established_at` watchdog timestamp. + fn set_relay_state(&mut self, relay_id: PeerId, next: RelayConnectionState) { + let prev = self.connection_states.insert(relay_id, next); + if prev != Some(next) { + tracing::debug!( + relay_peer_id = %relay_id, + ?prev, + ?next, + "Relay connection state transition" + ); + } + match next { + // Entering or refreshing the no-reservation-yet state: start (or + // restart, on demote from Reserved) the stuck-Established timer. + RelayConnectionState::Established => { + if prev != Some(RelayConnectionState::Established) { + self.established_at.insert(relay_id, Instant::now()); + } + } + // Promoted to Reserved or back to Dialing: the relay isn't stuck + // in Established anymore, so clear its watchdog timestamp. + RelayConnectionState::Reserved | RelayConnectionState::Dialing => { + self.established_at.remove(&relay_id); + } + } + } + + /// Polls every active dial state once, queuing a `ToSwarm::Dial` event for + /// any whose backoff has elapsed. Wakers for the remaining (pending) ones + /// are registered via the underlying `Sleep` futures. + fn process_relay_dials(&mut self, cx: &mut Context<'_>) { + for state in self.dial_states.values_mut() { + if let Poll::Ready(Some(event)) = state.poll_next_unpin(cx) { + self.events.push_back(event); + } + } + } + + /// Watchdog for relays stuck in `Established`. 
+ /// + /// Libp2p's relay client owns reservation refresh; if a relay denies a + /// refresh (overloaded, quota exhausted, version mismatch), the client + /// typically gives up silently — no further `NewListenAddr` is emitted + /// and the transport stays up, so `on_connection_closed` never fires. + /// Without intervention the relay would stay in `Established` forever. + /// + /// On each tick, any relay that has been `Established` for longer than + /// [`ESTABLISHED_STUCK_THRESHOLD`] gets a `ToSwarm::CloseConnection`; the + /// resulting `FromSwarm::ConnectionClosed` drives `on_connection_closed` + /// → `redial_relay`, mirroring Charon's "no relay connection, + /// reconnecting" recovery path (`charon/p2p/relay.go:73-92`). + fn process_established_watchdog(&mut self, cx: &mut Context<'_>) { + let watchdog = self.watchdog.get_or_insert_with(|| { + let deadline = Instant::now() + .checked_add(ESTABLISHED_WATCHDOG_TICK) + .unwrap_or_else(Instant::now); + Box::pin(sleep_until(deadline)) + }); + if watchdog.as_mut().poll(cx).is_pending() { + return; + } + + let now = Instant::now(); + let stuck: Vec = self + .established_at + .iter() + .filter(|(id, since)| { + now.saturating_duration_since(**since) >= ESTABLISHED_STUCK_THRESHOLD + && matches!( + self.connection_states.get(id), + Some(RelayConnectionState::Established) + ) + }) + .map(|(id, _)| *id) + .collect(); + + for relay_id in stuck { + tracing::warn!( + relay_peer_id = %relay_id, + threshold = ?ESTABLISHED_STUCK_THRESHOLD, + "Relay stuck in Established without reservation; force-closing for redial" + ); + // Clear the timestamp so we don't re-fire CloseConnection on the + // next tick while ConnectionClosed is in flight; on_connection_closed + // will eventually transition us back to Dialing. 
+ self.established_at.remove(&relay_id); + self.events.push_back(ToSwarm::CloseConnection { + peer_id: relay_id, + connection: libp2p::swarm::CloseConnection::All, + }); + } + + let next_deadline = now + .checked_add(ESTABLISHED_WATCHDOG_TICK) + .unwrap_or_else(Instant::now); + // Watchdog is Some by construction inside this function — we just + // initialised or polled it above. + if let Some(watchdog) = self.watchdog.as_mut() { + watchdog.as_mut().reset(next_deadline); + } + } + + /// Returns the peer ids of relays whose circuit reservation has been + /// confirmed (i.e. swarm has issued `NewListenAddr` for the circuit). + fn reserved_relay_ids(&self) -> Vec { + self.connection_states + .iter() + .filter(|(_, s)| matches!(s, RelayConnectionState::Reserved)) + .map(|(id, _)| *id) + .collect() + } + + /// Builds circuit dial addresses for reaching `target` through every + /// currently reserved relay: + /// `/.../p2p//p2p-circuit/p2p/`. + fn peer_circuit_addrs(&self, target: &PeerId) -> Vec { + let mut addrs = Vec::new(); + for relay_id in self.reserved_relay_ids() { + let Some(relay_addrs) = self.relay_addrs.get(&relay_id) else { + continue; + }; + for relay_addr in relay_addrs { + let mut circuit: Multiaddr = relay_addr + .iter() + .filter(|p| !matches!(p, MaProtocol::P2p(_))) + .collect(); + circuit.push(MaProtocol::P2p(relay_id)); + circuit.push(MaProtocol::P2pCircuit); + circuit.push(MaProtocol::P2p(*target)); + addrs.push(circuit); + } + } + addrs + } + + /// Ensures every known cluster peer (≠ self) has a dial state armed to + /// reach it through the current set of reserved relays. + fn route_known_peers(&mut self) { + let local = self.p2p_context.local_peer_id(); + let targets: Vec = self + .p2p_context + .known_peers() + .iter() + .copied() + .filter(|id| Some(*id) != local) + .collect(); + + for target in targets { + self.upsert_peer_dial(target); + } + } + + /// Inserts or refreshes a dial state for `target` using the current circuit + /// addrs. 
+ /// + /// If the address set changed (or there was no dial state yet) the backoff + /// schedule is reset so the new route is tried immediately. If the address + /// set is unchanged, the existing dial state is left alone — its backoff + /// schedule survives so we don't hammer peers that have been unreachable + /// just because re-routing was re-evaluated. If no reserved relay can + /// currently reach `target`, any pre-existing dial state is removed so we + /// don't keep firing `Dial` events at circuits through unreserved relays. + fn upsert_peer_dial(&mut self, target: PeerId) { + let addrs = self.peer_circuit_addrs(&target); + if addrs.is_empty() { + self.dial_states.remove(&target); + return; + } + + if let Some(existing) = self.dial_states.get(&target) + && addr_sets_equal(&existing.addrs, &addrs) + { + return; + } + + self.dial_states.insert( + target, + RelayDialState::new(RelayDialType::ClusterPeer, target, addrs), + ); + } + + /// Re-evaluates every active cluster-peer dial state against the current + /// set of reserved relays. Called when a relay leaves `Reserved` so that + /// peer dial campaigns stop self-rearming through circuits that no longer + /// exist. + fn refresh_peer_dials(&mut self) { + let peer_targets: Vec = self + .dial_states + .iter() + .filter(|(_, s)| matches!(s.ty, RelayDialType::ClusterPeer)) + .map(|(id, _)| *id) + .collect(); + for target in peer_targets { + self.upsert_peer_dial(target); + } + } + + /// Reacts to a new transport connection on a peer we previously dialed. + /// Relay dials transition into `Established` and queue circuit listeners; + /// peer routing dials just drop their dial state — libp2p takes it from + /// here. 
+ fn on_connection_established(&mut self, peer_id: PeerId) { + let Some(dial_state) = self.dial_states.remove(&peer_id) else { + return; + }; + + match dial_state.ty { + RelayDialType::RelayServer => { + self.events + .push_back(ToSwarm::GenerateEvent(RelayManagerEvent::RelayConnected( + peer_id, + ))); + self.set_relay_state(peer_id, RelayConnectionState::Established); + + for circuit_addr in Self::circuit_addrs(peer_id, &dial_state.addrs) { + tracing::debug!( + relay_peer_id = %peer_id, + %circuit_addr, + "Requesting circuit listener on relay" + ); + self.events.push_back(ToSwarm::ListenOn { + opts: libp2p::swarm::ListenOpts::new(circuit_addr), + }); + } + } + RelayDialType::ClusterPeer => { + tracing::debug!( + peer_id = %peer_id, + "Routed peer connection established" + ); + self.events.push_back(ToSwarm::GenerateEvent( + RelayManagerEvent::PeerRoutedConnected(peer_id), + )); + } + } + } + + /// Reacts to a new listen address. If it's a circuit address for one of + /// our relays, promotes that relay's state to `Reserved` and re-routes + /// known peers through the updated set of reserved relays. + fn on_new_listen_addr(&mut self, addr: &Multiaddr) { + let Some(relay_id) = Self::relay_id_from_circuit_addr(addr) else { + return; + }; + let Some(state) = self.connection_states.get(&relay_id).copied() else { + return; + }; + match state { + RelayConnectionState::Dialing => { + tracing::warn!( + relay_peer_id = %relay_id, + listen_addr = %addr, + "NewListenAddr for relay in Dialing state; ignoring" + ); + } + RelayConnectionState::Reserved => { + // Second circuit address from the same relay — already routed. 
+ } + RelayConnectionState::Established => { + tracing::info!( + relay_peer_id = %relay_id, + listen_addr = %addr, + "Relay reservation confirmed; routing known peers via this relay" + ); + self.set_relay_state(relay_id, RelayConnectionState::Reserved); + self.events + .push_back(ToSwarm::GenerateEvent(RelayManagerEvent::RelayReserved( + relay_id, + ))); + self.route_known_peers(); + } + } + } + + /// Reacts to a circuit listen address expiring. If the relay was in + /// `Reserved`, demote it to `Established` so we stop routing peers through + /// it. libp2p's circuit-client will normally refresh the reservation and + /// emit `NewListenAddr` again, which promotes us back. If the transport + /// connection also drops, `on_connection_closed` will handle the redial. + fn on_expired_listen_addr(&mut self, addr: &Multiaddr) { + let Some(relay_id) = Self::relay_id_from_circuit_addr(addr) else { + return; + }; + let Some(state) = self.connection_states.get(&relay_id).copied() else { + return; + }; + if matches!(state, RelayConnectionState::Reserved) { + tracing::info!( + relay_peer_id = %relay_id, + listen_addr = %addr, + "Relay circuit listener expired; demoting to Established" + ); + self.set_relay_state(relay_id, RelayConnectionState::Established); + self.events.push_back(ToSwarm::GenerateEvent( + RelayManagerEvent::RelayReservationLost(relay_id), + )); + // The reserved-relay set just shrank: drop or refresh any peer + // dial campaigns routed through this relay so they don't keep + // self-rearming through dead circuits. + self.refresh_peer_dials(); + } + } + + /// Reacts to the last connection to `peer_id` closing. Either it's one of + /// our relays (queue a fresh re-dial cycle) or a known cluster peer + /// (arm a fresh routing dial through the current reserved relays). + /// Anything else is ignored. 
+ /// + /// If the relay was previously in `Reserved`, `RelayReservationLost` is + /// emitted before `RelayDisconnected` so subscribers see the reservation + /// tear down explicitly, and the peer routing campaigns through this + /// relay are refreshed to drop now-dead circuits. + fn on_connection_closed(&mut self, peer_id: PeerId) { + if let Some(prev_state) = self.connection_states.get(&peer_id).copied() { + let was_reserved = matches!(prev_state, RelayConnectionState::Reserved); + if was_reserved { + self.events.push_back(ToSwarm::GenerateEvent( + RelayManagerEvent::RelayReservationLost(peer_id), + )); + } + self.events.push_back(ToSwarm::GenerateEvent( + RelayManagerEvent::RelayDisconnected(peer_id), + )); + self.redial_relay(peer_id); + if was_reserved { + self.refresh_peer_dials(); + } + } else if self.p2p_context.is_known_peer(&peer_id) { + self.reroute_peer(peer_id); + } + } + + /// Reacts to a dial failure by logging and emitting a `DialFailed` event. + /// The underlying `RelayDialState` self-rearms with exponential backoff + /// on the next swarm poll, so by default no state change is needed here. + /// + /// One special case: `DialError::DialPeerConditionFalse` means libp2p + /// refused the dial because we're already connected to (or dialing) the + /// target. Behaviour depends on the dial type: + /// + /// - [`RelayDialType::ClusterPeer`]: libp2p owns the existing direct + /// connection. Drop the dial state and rely on + /// [`Self::on_connection_closed`] → [`Self::reroute_peer`] to re-arm the + /// dial once the existing connection actually closes. + /// - [`RelayDialType::RelayServer`]: dropping the dial state here would + /// wedge `connection_states` in `Dialing` forever — no + /// `on_connection_closed` will fire if libp2p already has the transport + /// connection, and `queue_relay_update` short-circuits while + /// `connection_states` has an entry. 
Instead leave the campaign armed; + /// backoff retries are cheap (libp2p re-rejects with the same error) and + /// `on_connection_established` will tear the dial state down once libp2p + /// surfaces the connection. + fn on_dial_failure(&mut self, peer_id: Option, error: &DialError) { + let Some(peer_id) = peer_id else { return }; + let Some(state) = self.dial_states.get(&peer_id) else { + return; + }; + let target = state.ty; + let retry_count = state.retry_count; + let skipped = matches!(error, DialError::DialPeerConditionFalse(_)); + + if skipped { + match target { + RelayDialType::ClusterPeer => { + tracing::debug!( + peer_id = %peer_id, + dial_type = ?target, + retry_count, + %error, + "Dial skipped (already connected or dialing); dropping dial state" + ); + self.dial_states.remove(&peer_id); + } + RelayDialType::RelayServer => { + tracing::debug!( + peer_id = %peer_id, + dial_type = ?target, + retry_count, + %error, + "Dial skipped for relay; keeping campaign armed for backoff retry" + ); + } + } + } else { + tracing::debug!( + peer_id = %peer_id, + dial_type = ?target, + retry_count, + %error, + "Dial failed, will retry with backoff" + ); + } + + self.events + .push_back(ToSwarm::GenerateEvent(RelayManagerEvent::DialFailed { + peer_id, + target, + retry_count, + error: RelayDialError::from(error), + })); + } + + /// Schedules a re-dial for a relay whose last connection just dropped. 
+ fn redial_relay(&mut self, relay_id: PeerId) { + let Some(addrs) = self.relay_addrs.get(&relay_id).cloned() else { + tracing::warn!( + relay_peer_id = %relay_id, + "Relay closed but addresses no longer tracked; cannot redial" + ); + self.connection_states.remove(&relay_id); + return; + }; + tracing::debug!( + relay_peer_id = %relay_id, + "Relay connection closed, queuing re-dial with backoff" + ); + self.dial_states.insert( + relay_id, + RelayDialState::new(RelayDialType::RelayServer, relay_id, addrs), + ); + self.set_relay_state(relay_id, RelayConnectionState::Dialing); + } + + /// Arms a dial campaign for a known cluster peer whose last connection + /// just dropped, routing through all currently reserved relays. Delegates + /// to [`Self::upsert_peer_dial`] so that an existing dial state with the + /// same circuit addrs survives — its backoff schedule is preserved across + /// rapid disconnect/reconnect cycles when the route hasn't changed. No-op + /// if no relay is currently reserved. 
+ fn reroute_peer(&mut self, peer_id: PeerId) { + tracing::debug!( + peer_id = %peer_id, + "Peer connection closed, re-routing via reserved relays" + ); + self.upsert_peer_dial(peer_id); + } +} + +impl NetworkBehaviour for RelayManager { + type ConnectionHandler = dummy::ConnectionHandler; + type ToSwarm = RelayManagerEvent; + + fn handle_established_inbound_connection( + &mut self, + _connection_id: ConnectionId, + _peer: PeerId, + _local_addr: &Multiaddr, + _remote_addr: &Multiaddr, + ) -> Result, ConnectionDenied> { + Ok(dummy::ConnectionHandler) + } + + fn handle_established_outbound_connection( + &mut self, + _connection_id: ConnectionId, + _peer: PeerId, + _addr: &Multiaddr, + _role_override: Endpoint, + _port_use: PortUse, + ) -> Result, ConnectionDenied> { + Ok(dummy::ConnectionHandler) + } + + fn on_swarm_event(&mut self, event: FromSwarm) { + match event { + FromSwarm::ConnectionEstablished(conn) => { + self.on_connection_established(conn.peer_id); + } + FromSwarm::NewListenAddr(ev) => { + self.on_new_listen_addr(ev.addr); + } + FromSwarm::ExpiredListenAddr(ev) => { + self.on_expired_listen_addr(ev.addr); + } + FromSwarm::ConnectionClosed(conn) if conn.remaining_established == 0 => { + self.on_connection_closed(conn.peer_id); + } + FromSwarm::DialFailure(ev) => { + self.on_dial_failure(ev.peer_id, ev.error); + } + _ => {} + } + } + + fn on_connection_handler_event( + &mut self, + _peer_id: libp2p::PeerId, + _connection_id: libp2p::swarm::ConnectionId, + _event: libp2p::swarm::THandlerOutEvent, + ) { + // No special handling needed for connection handler events + } + + fn poll( + &mut self, + cx: &mut Context<'_>, + ) -> std::task::Poll>> { + let mut updates: Vec = Vec::new(); + for stream in &mut self.relay_subs { + while let Poll::Ready(Some(Some(peer))) = stream.poll_next_unpin(cx) { + updates.push(peer); + } + } + for peer in updates { + self.queue_relay_update(peer); + } + + self.process_relay_dials(cx); + self.process_established_watchdog(cx); + + if 
let Some(event) = self.events.pop_front() { + return Poll::Ready(event); + } + + Poll::Pending + } +} diff --git a/crates/p2p/src/relay/manager/tests.rs b/crates/p2p/src/relay/manager/tests.rs new file mode 100644 index 00000000..1c942773 --- /dev/null +++ b/crates/p2p/src/relay/manager/tests.rs @@ -0,0 +1,639 @@ +use std::{collections::HashSet, str::FromStr}; + +use super::*; +use crate::relay::dial::RelayDialState; + +fn addr(s: &str) -> Multiaddr { + Multiaddr::from_str(s).expect("valid multiaddr") +} + +fn manager() -> RelayManager { + RelayManager::new(Vec::new(), P2PContext::new(Vec::::new())) +} + +// ---- circuit_addrs ------------------------------------------------- + +#[test] +fn circuit_addrs_strips_existing_p2p_and_appends_relay_suffix() { + let relay = PeerId::random(); + let transport = addr(&format!("/ip4/127.0.0.1/tcp/9000/p2p/{relay}")); + + let out = RelayManager::circuit_addrs(relay, &[transport]); + + let expected = addr(&format!("/ip4/127.0.0.1/tcp/9000/p2p/{relay}/p2p-circuit")); + assert_eq!(out, vec![expected]); +} + +#[test] +fn circuit_addrs_handles_addr_without_existing_p2p_component() { + let relay = PeerId::random(); + let transport = addr("/ip4/10.0.0.1/udp/9000/quic-v1"); + + let out = RelayManager::circuit_addrs(relay, &[transport]); + + let expected = addr(&format!( + "/ip4/10.0.0.1/udp/9000/quic-v1/p2p/{relay}/p2p-circuit" + )); + assert_eq!(out, vec![expected]); +} + +#[test] +fn circuit_addrs_preserves_input_order_for_multiple_addrs() { + let relay = PeerId::random(); + let other = PeerId::random(); + let inputs = vec![ + addr(&format!("/ip4/127.0.0.1/tcp/9000/p2p/{other}")), + addr("/ip4/10.0.0.1/udp/9000/quic-v1"), + ]; + + let out = RelayManager::circuit_addrs(relay, &inputs); + + assert_eq!( + out, + vec![ + addr(&format!("/ip4/127.0.0.1/tcp/9000/p2p/{relay}/p2p-circuit")), + addr(&format!( + "/ip4/10.0.0.1/udp/9000/quic-v1/p2p/{relay}/p2p-circuit" + )), + ] + ); +} + +#[test] +fn 
circuit_addrs_empty_input_yields_empty_output() { + let relay = PeerId::random(); + let out = RelayManager::circuit_addrs(relay, &[]); + assert!(out.is_empty()); +} + +// ---- relay_id_from_circuit_addr ----------------------------------- + +#[test] +fn relay_id_from_circuit_addr_extracts_last_p2p_before_circuit() { + let relay = PeerId::random(); + let circuit = addr(&format!("/ip4/127.0.0.1/tcp/9000/p2p/{relay}/p2p-circuit")); + + assert_eq!( + RelayManager::relay_id_from_circuit_addr(&circuit), + Some(relay) + ); +} + +#[test] +fn relay_id_from_circuit_addr_ignores_target_p2p_after_circuit() { + // Full circuit-dial form `/.../p2p//p2p-circuit/p2p/` + // must return the relay id (before `/p2p-circuit`), not the target. + let relay = PeerId::random(); + let target = PeerId::random(); + let circuit = addr(&format!( + "/ip4/127.0.0.1/tcp/9000/p2p/{relay}/p2p-circuit/p2p/{target}" + )); + + assert_eq!( + RelayManager::relay_id_from_circuit_addr(&circuit), + Some(relay) + ); +} + +#[test] +fn relay_id_from_circuit_addr_returns_none_when_no_circuit_component() { + let peer = PeerId::random(); + let plain = addr(&format!("/ip4/127.0.0.1/tcp/9000/p2p/{peer}")); + + assert_eq!(RelayManager::relay_id_from_circuit_addr(&plain), None); +} + +#[test] +fn relay_id_from_circuit_addr_returns_none_when_circuit_has_no_preceding_p2p() { + let bare = addr("/ip4/127.0.0.1/tcp/9000/p2p-circuit"); + assert_eq!(RelayManager::relay_id_from_circuit_addr(&bare), None); +} + +// ---- peer_circuit_addrs ------------------------------------------- + +#[test] +fn peer_circuit_addrs_returns_empty_when_no_relays_reserved() { + let mgr = manager(); + let target = PeerId::random(); + assert!(mgr.peer_circuit_addrs(&target).is_empty()); +} + +#[test] +fn peer_circuit_addrs_ignores_relays_in_dialing_or_established() { + let mut mgr = manager(); + let target = PeerId::random(); + let dialing = PeerId::random(); + let established = PeerId::random(); + + mgr.connection_states + .insert(dialing, 
RelayConnectionState::Dialing); + mgr.relay_addrs + .insert(dialing, vec![addr("/ip4/10.0.0.1/tcp/9000")]); + mgr.connection_states + .insert(established, RelayConnectionState::Established); + mgr.relay_addrs + .insert(established, vec![addr("/ip4/10.0.0.2/tcp/9000")]); + + assert!(mgr.peer_circuit_addrs(&target).is_empty()); +} + +#[test] +fn peer_circuit_addrs_skips_reserved_relay_without_tracked_addrs() { + let mut mgr = manager(); + let target = PeerId::random(); + let relay = PeerId::random(); + + mgr.connection_states + .insert(relay, RelayConnectionState::Reserved); + // No entry in relay_addrs: the relay is reserved but we have no + // transport addrs to build a circuit through it. + + assert!(mgr.peer_circuit_addrs(&target).is_empty()); +} + +#[test] +fn peer_circuit_addrs_builds_one_circuit_per_reserved_relay_addr() { + let mut mgr = manager(); + let target = PeerId::random(); + let relay = PeerId::random(); + + let relay_addrs = vec![ + // With and without trailing /p2p/ — both should produce the + // same canonical circuit form. 
+ addr(&format!("/ip4/10.0.0.1/tcp/9000/p2p/{relay}")), + addr("/ip4/10.0.0.1/udp/9000/quic-v1"), + ]; + mgr.connection_states + .insert(relay, RelayConnectionState::Reserved); + mgr.relay_addrs.insert(relay, relay_addrs); + + let out = mgr.peer_circuit_addrs(&target); + + let expected = vec![ + addr(&format!( + "/ip4/10.0.0.1/tcp/9000/p2p/{relay}/p2p-circuit/p2p/{target}" + )), + addr(&format!( + "/ip4/10.0.0.1/udp/9000/quic-v1/p2p/{relay}/p2p-circuit/p2p/{target}" + )), + ]; + assert_eq!(out, expected); +} + +#[test] +fn peer_circuit_addrs_aggregates_across_multiple_reserved_relays() { + let mut mgr = manager(); + let target = PeerId::random(); + let relay_a = PeerId::random(); + let relay_b = PeerId::random(); + + mgr.connection_states + .insert(relay_a, RelayConnectionState::Reserved); + mgr.relay_addrs + .insert(relay_a, vec![addr("/ip4/10.0.0.1/tcp/9000")]); + mgr.connection_states + .insert(relay_b, RelayConnectionState::Reserved); + mgr.relay_addrs + .insert(relay_b, vec![addr("/ip4/10.0.0.2/tcp/9000")]); + + let out: HashSet = mgr.peer_circuit_addrs(&target).into_iter().collect(); + + let expected: HashSet = [ + addr(&format!( + "/ip4/10.0.0.1/tcp/9000/p2p/{relay_a}/p2p-circuit/p2p/{target}" + )), + addr(&format!( + "/ip4/10.0.0.2/tcp/9000/p2p/{relay_b}/p2p-circuit/p2p/{target}" + )), + ] + .into_iter() + .collect(); + assert_eq!(out, expected); +} + +// ---- queue_relay_update ------------------------------------------- + +fn relay_peer(id: PeerId, addrs: Vec) -> Peer { + Peer { + id, + addresses: addrs, + index: 0, + name: crate::name::peer_name(&id), + } +} + +#[tokio::test] +async fn queue_relay_update_first_seen_starts_dial_campaign() { + let mut mgr = manager(); + let relay_id = PeerId::random(); + let addrs = vec![addr("/ip4/10.0.0.1/tcp/9000")]; + + mgr.queue_relay_update(relay_peer(relay_id, addrs.clone())); + + assert!(mgr.dial_states.contains_key(&relay_id)); + assert_eq!( + mgr.connection_states.get(&relay_id), + 
Some(&RelayConnectionState::Dialing) + ); + assert_eq!(mgr.relay_addrs.get(&relay_id), Some(&addrs)); +} + +#[tokio::test] +async fn queue_relay_update_refreshes_inflight_addrs_without_resetting_backoff() { + let mut mgr = manager(); + let relay_id = PeerId::random(); + + mgr.queue_relay_update(relay_peer(relay_id, vec![addr("/ip4/10.0.0.1/tcp/9000")])); + // Pretend the dial state has already retried a few times. + mgr.dial_states.get_mut(&relay_id).unwrap().retry_count = 7; + + let new_addrs = vec![ + addr("/ip4/10.0.0.1/tcp/9000"), + addr("/ip4/10.0.0.2/tcp/9000"), + ]; + mgr.queue_relay_update(relay_peer(relay_id, new_addrs.clone())); + + let state = mgr.dial_states.get(&relay_id).unwrap(); + assert_eq!(state.addrs, new_addrs); + assert_eq!( + state.retry_count, 7, + "backoff schedule must survive refresh" + ); + assert_eq!(mgr.relay_addrs.get(&relay_id), Some(&new_addrs)); +} + +#[tokio::test] +async fn queue_relay_update_no_op_when_relay_already_connected() { + let mut mgr = manager(); + let relay_id = PeerId::random(); + mgr.connection_states + .insert(relay_id, RelayConnectionState::Reserved); + + let new_addrs = vec![addr("/ip4/10.0.0.99/tcp/9000")]; + mgr.queue_relay_update(relay_peer(relay_id, new_addrs.clone())); + + assert!( + !mgr.dial_states.contains_key(&relay_id), + "no dial campaign while connected" + ); + // Connection state untouched. + assert_eq!( + mgr.connection_states.get(&relay_id), + Some(&RelayConnectionState::Reserved) + ); + // relay_addrs still gets refreshed so we have the latest list ready + // for redial after a disconnect. 
+ assert_eq!(mgr.relay_addrs.get(&relay_id), Some(&new_addrs)); +} + +// ---- state machine: on_connection_established ---------------------- + +#[tokio::test] +async fn on_connection_established_relay_promotes_to_established_and_queues_listen() { + let mut mgr = manager(); + let relay_id = PeerId::random(); + let relay_addrs = vec![addr("/ip4/10.0.0.1/tcp/9000")]; + + mgr.queue_relay_update(relay_peer(relay_id, relay_addrs.clone())); + mgr.events.clear(); + mgr.on_connection_established(relay_id); + + assert!(!mgr.dial_states.contains_key(&relay_id)); + assert_eq!( + mgr.connection_states.get(&relay_id), + Some(&RelayConnectionState::Established) + ); + let listen_count = mgr + .events + .iter() + .filter(|e| matches!(e, ToSwarm::ListenOn { .. })) + .count(); + assert_eq!(listen_count, relay_addrs.len()); + let relay_connected = mgr.events.iter().any(|e| { + matches!( + e, + ToSwarm::GenerateEvent(RelayManagerEvent::RelayConnected(id)) if *id == relay_id + ) + }); + assert!(relay_connected, "RelayConnected event must be emitted"); +} + +#[tokio::test] +async fn on_connection_established_cluster_peer_drops_dial_state() { + let mut mgr = manager(); + let target = PeerId::random(); + // Seed a peer-routing dial state (skipping upsert which requires + // reserved relays). 
+ mgr.dial_states.insert( + target, + RelayDialState::new( + RelayDialType::ClusterPeer, + target, + vec![addr("/ip4/10.0.0.1/tcp/9000/p2p-circuit")], + ), + ); + + mgr.on_connection_established(target); + + assert!(!mgr.dial_states.contains_key(&target)); + let routed = mgr.events.iter().any(|e| { + matches!( + e, + ToSwarm::GenerateEvent(RelayManagerEvent::PeerRoutedConnected(id)) if *id == target + ) + }); + assert!(routed, "PeerRoutedConnected event must be emitted"); +} + +// ---- state machine: on_new_listen_addr ----------------------------- + +#[tokio::test] +async fn on_new_listen_addr_promotes_established_to_reserved() { + let mut mgr = manager(); + let relay_id = PeerId::random(); + mgr.connection_states + .insert(relay_id, RelayConnectionState::Established); + mgr.relay_addrs + .insert(relay_id, vec![addr("/ip4/10.0.0.1/tcp/9000")]); + + let circuit = addr(&format!( + "/ip4/10.0.0.1/tcp/9000/p2p/{relay_id}/p2p-circuit" + )); + mgr.on_new_listen_addr(&circuit); + + assert_eq!( + mgr.connection_states.get(&relay_id), + Some(&RelayConnectionState::Reserved) + ); + let reserved = mgr.events.iter().any(|e| { + matches!( + e, + ToSwarm::GenerateEvent(RelayManagerEvent::RelayReserved(id)) if *id == relay_id + ) + }); + assert!(reserved); +} + +// ---- state machine: on_expired_listen_addr ------------------------- + +#[tokio::test] +async fn on_expired_listen_addr_demotes_reserved_and_emits_reservation_lost() { + let mut mgr = manager(); + let relay_id = PeerId::random(); + mgr.connection_states + .insert(relay_id, RelayConnectionState::Reserved); + mgr.relay_addrs + .insert(relay_id, vec![addr("/ip4/10.0.0.1/tcp/9000")]); + + let circuit = addr(&format!( + "/ip4/10.0.0.1/tcp/9000/p2p/{relay_id}/p2p-circuit" + )); + mgr.on_expired_listen_addr(&circuit); + + assert_eq!( + mgr.connection_states.get(&relay_id), + Some(&RelayConnectionState::Established) + ); + let lost = mgr.events.iter().any(|e| { + matches!( + e, + 
ToSwarm::GenerateEvent(RelayManagerEvent::RelayReservationLost(id)) + if *id == relay_id + ) + }); + assert!(lost, "RelayReservationLost must be emitted on demote"); +} + +#[tokio::test] +async fn on_expired_listen_addr_drops_peer_dials_with_no_route_left() { + let mut mgr = manager(); + let relay_id = PeerId::random(); + let target = PeerId::random(); + + // Single reserved relay supporting a peer-routing dial. + mgr.connection_states + .insert(relay_id, RelayConnectionState::Reserved); + mgr.relay_addrs + .insert(relay_id, vec![addr("/ip4/10.0.0.1/tcp/9000")]); + mgr.dial_states.insert( + target, + RelayDialState::new( + RelayDialType::ClusterPeer, + target, + vec![addr(&format!( + "/ip4/10.0.0.1/tcp/9000/p2p/{relay_id}/p2p-circuit/p2p/{target}" + ))], + ), + ); + + let circuit = addr(&format!( + "/ip4/10.0.0.1/tcp/9000/p2p/{relay_id}/p2p-circuit" + )); + mgr.on_expired_listen_addr(&circuit); + + assert!( + !mgr.dial_states.contains_key(&target), + "peer dial state must be dropped once no reserved relay can route to it" + ); +} + +// ---- state machine: on_connection_closed --------------------------- + +#[tokio::test] +async fn on_connection_closed_reserved_relay_emits_lost_before_disconnected() { + let mut mgr = manager(); + let relay_id = PeerId::random(); + mgr.connection_states + .insert(relay_id, RelayConnectionState::Reserved); + mgr.relay_addrs + .insert(relay_id, vec![addr("/ip4/10.0.0.1/tcp/9000")]); + + mgr.on_connection_closed(relay_id); + + let lost_idx = mgr.events.iter().position(|e| { + matches!( + e, + ToSwarm::GenerateEvent(RelayManagerEvent::RelayReservationLost(id)) + if *id == relay_id + ) + }); + let disc_idx = mgr.events.iter().position(|e| { + matches!( + e, + ToSwarm::GenerateEvent(RelayManagerEvent::RelayDisconnected(id)) if *id == relay_id + ) + }); + let lost = lost_idx.expect("RelayReservationLost must fire when prev state was Reserved"); + let disc = disc_idx.expect("RelayDisconnected must fire on relay close"); + assert!(lost < 
disc, "ReservationLost must precede Disconnected"); + assert_eq!( + mgr.connection_states.get(&relay_id), + Some(&RelayConnectionState::Dialing), + "redial campaign must arm" + ); + assert!(mgr.dial_states.contains_key(&relay_id)); +} + +#[tokio::test] +async fn on_connection_closed_established_relay_skips_reservation_lost() { + let mut mgr = manager(); + let relay_id = PeerId::random(); + mgr.connection_states + .insert(relay_id, RelayConnectionState::Established); + mgr.relay_addrs + .insert(relay_id, vec![addr("/ip4/10.0.0.1/tcp/9000")]); + + mgr.on_connection_closed(relay_id); + + let lost = mgr.events.iter().any(|e| { + matches!( + e, + ToSwarm::GenerateEvent(RelayManagerEvent::RelayReservationLost(_)) + ) + }); + assert!( + !lost, + "no ReservationLost event when prev state wasn't Reserved" + ); +} + +// ---- on_dial_failure: Skipped path -------------------------------- + +fn skipped_dial_error() -> DialError { + DialError::DialPeerConditionFalse( + libp2p::swarm::dial_opts::PeerCondition::DisconnectedAndNotDialing, + ) +} + +#[tokio::test] +async fn on_dial_failure_skipped_cluster_peer_drops_dial_state() { + let mut mgr = manager(); + let target = PeerId::random(); + mgr.dial_states.insert( + target, + RelayDialState::new( + RelayDialType::ClusterPeer, + target, + vec![addr("/ip4/10.0.0.1/tcp/9000")], + ), + ); + + mgr.on_dial_failure(Some(target), &skipped_dial_error()); + + assert!( + !mgr.dial_states.contains_key(&target), + "cluster-peer dial state must be dropped on Skipped" + ); +} + +#[tokio::test] +async fn on_dial_failure_skipped_relay_keeps_dial_state() { + // Regression for the wedge bug: keep the campaign armed so backoff + // continues to retry until libp2p surfaces the connection state. 
+ let mut mgr = manager(); + let relay_id = PeerId::random(); + mgr.connection_states + .insert(relay_id, RelayConnectionState::Dialing); + mgr.dial_states.insert( + relay_id, + RelayDialState::new( + RelayDialType::RelayServer, + relay_id, + vec![addr("/ip4/10.0.0.1/tcp/9000")], + ), + ); + + mgr.on_dial_failure(Some(relay_id), &skipped_dial_error()); + + assert!( + mgr.dial_states.contains_key(&relay_id), + "relay dial state must survive Skipped so backoff can retry" + ); + assert_eq!( + mgr.connection_states.get(&relay_id), + Some(&RelayConnectionState::Dialing), + "connection state must still be Dialing" + ); +} + +// ---- upsert_peer_dial --------------------------------------------- + +#[tokio::test] +async fn upsert_peer_dial_preserves_backoff_when_addrs_unchanged() { + let mut mgr = manager(); + let target = PeerId::random(); + let relay = PeerId::random(); + mgr.connection_states + .insert(relay, RelayConnectionState::Reserved); + mgr.relay_addrs + .insert(relay, vec![addr("/ip4/10.0.0.1/tcp/9000")]); + + mgr.upsert_peer_dial(target); + let inserted_count = mgr.dial_states.get(&target).map(|s| s.retry_count); + // Pretend the dial has retried. 
+ if let Some(s) = mgr.dial_states.get_mut(&target) { + s.retry_count = 5; + } + mgr.upsert_peer_dial(target); + let after = mgr.dial_states.get(&target).map(|s| s.retry_count); + assert_eq!(inserted_count, Some(0)); + assert_eq!( + after, + Some(5), + "addr-set unchanged: existing dial state (and its backoff) must be preserved" + ); +} + +#[tokio::test] +async fn upsert_peer_dial_resets_backoff_when_addrs_change() { + let mut mgr = manager(); + let target = PeerId::random(); + let relay_a = PeerId::random(); + let relay_b = PeerId::random(); + mgr.connection_states + .insert(relay_a, RelayConnectionState::Reserved); + mgr.relay_addrs + .insert(relay_a, vec![addr("/ip4/10.0.0.1/tcp/9000")]); + + mgr.upsert_peer_dial(target); + if let Some(s) = mgr.dial_states.get_mut(&target) { + s.retry_count = 5; + } + + // Reserve a second relay → new circuit addr → addr-set changes. + mgr.connection_states + .insert(relay_b, RelayConnectionState::Reserved); + mgr.relay_addrs + .insert(relay_b, vec![addr("/ip4/10.0.0.2/tcp/9000")]); + mgr.upsert_peer_dial(target); + + assert_eq!( + mgr.dial_states.get(&target).map(|s| s.retry_count), + Some(0), + "addr-set changed: dial state (and backoff) must be replaced" + ); +} + +#[tokio::test] +async fn upsert_peer_dial_drops_stale_state_when_no_route_left() { + let mut mgr = manager(); + let target = PeerId::random(); + let relay = PeerId::random(); + mgr.connection_states + .insert(relay, RelayConnectionState::Reserved); + mgr.relay_addrs + .insert(relay, vec![addr("/ip4/10.0.0.1/tcp/9000")]); + + mgr.upsert_peer_dial(target); + assert!(mgr.dial_states.contains_key(&target)); + + // Demote the only reserved relay → no circuit addrs left. 
+ mgr.connection_states + .insert(relay, RelayConnectionState::Established); + mgr.upsert_peer_dial(target); + + assert!( + !mgr.dial_states.contains_key(&target), + "no reserved relay can reach target: stale dial state must be dropped" + ); +} diff --git a/crates/p2p/src/relay/mod.rs b/crates/p2p/src/relay/mod.rs new file mode 100644 index 00000000..89485c08 --- /dev/null +++ b/crates/p2p/src/relay/mod.rs @@ -0,0 +1,31 @@ +//! Relay reservation and cluster-peer routing. +//! +//! [`RelayManager`] is a libp2p [`NetworkBehaviour`] with three +//! responsibilities: +//! +//! 1. Subscribe to [`MutablePeer`] watch channels to receive relay address +//! updates as they're discovered. +//! 2. Manage each relay's reservation lifecycle (`Dialing → Established → +//! Reserved`) and redial with exponential backoff when transport connections +//! drop. +//! 3. Route known cluster peers through reserved relay circuits so peer-to-peer +//! traffic can traverse NATs that would otherwise block direct dials. +//! +//! The implementation is split into focused submodules: +//! +//! - [`event`] — public event/error types ([`RelayManagerEvent`], +//! [`RelayDialError`], [`RelayDialType`]). +//! - [`dial`] — dial-campaign machinery: exponential backoff and the per-target +//! dial state stream. +//! - [`manager`] — the [`RelayManager`] behaviour, its [`RelayConnectionState`] +//! lifecycle, and the [`NetworkBehaviour`] implementation. +//! +//! [`NetworkBehaviour`]: libp2p::swarm::NetworkBehaviour +//! 
[`MutablePeer`]: crate::peer::MutablePeer + +mod dial; +mod event; +mod manager; + +pub use event::{RelayDialError, RelayDialType, RelayManagerEvent}; +pub use manager::{RelayConnectionState, RelayManager}; From 1514d7159997a29e21986e95fa149ae2c303e8b5 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Thu, 11 Jun 2026 19:14:33 -0300 Subject: [PATCH 42/48] Fix merge conflicts - Remove generics from `DutyDefinitionSet` --- crates/core/src/scheduler.rs | 3 +- crates/core/src/validatorapi/component.rs | 45 ++++++++++++++--------- crates/eth2api/src/valcache.rs | 8 ---- 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index b3917a36..21c4bc28 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -8,7 +8,8 @@ use pluto_eth2api::{EthBeaconNodeApiClientError, client}; use tokio::sync; use tokio_util::{future::FutureExt, sync::CancellationToken}; -use crate::{scheduler::metrics::SCHEDULER_METRICS, types, valcache}; +use crate::{scheduler::metrics::SCHEDULER_METRICS, types}; +use pluto_eth2api::valcache; mod metrics; diff --git a/crates/core/src/validatorapi/component.rs b/crates/core/src/validatorapi/component.rs index f9f7729e..e02cea37 100644 --- a/crates/core/src/validatorapi/component.rs +++ b/crates/core/src/validatorapi/component.rs @@ -95,7 +95,7 @@ pub type AwaitAggSigDbFn = Arc< /// Looks up the duty-definition set for a given [`Duty`]. The return type /// is an untyped interface map keyed by pubkey, kept as a type-erased /// `Box` so callers can downcast to the concrete -/// `DutyDefinitionSet` they need. +/// `DutyDefinitionSet` they need. 
pub type DutyDefFn = Arc< dyn Fn(Duty) -> BoxFuture<'static, Result, CallbackError>> + Send @@ -342,16 +342,14 @@ impl Component { .with_boxed_source(err) })?; - let def_set = boxed - .downcast::>() - .map_err(|_| { - ApiError::new( - StatusCode::INTERNAL_SERVER_ERROR, - "duty definition lookup returned unexpected type", - ) - })?; + let def_set = boxed.downcast::().map_err(|_| { + ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "duty definition lookup returned unexpected type", + ) + })?; - if def_set.inner().len() != 1 { + if def_set.len() != 1 { return Err(ApiError::new( StatusCode::INTERNAL_SERVER_ERROR, "unexpected amount of proposer duties", @@ -1318,7 +1316,7 @@ mod tests { SignedRandao, SyncContribution, VersionedAggregatedAttestation, }, testutils::random_core_pub_key, - types::{Duty, DutyDefinition, DutyType, PubKey, SlotNumber}, + types::{Duty, DutyDefinition, DutyType, ProposerDutyDefinition, PubKey, SlotNumber}, validatorapi::types::AttestationDataOpts, }; @@ -2167,13 +2165,18 @@ mod tests { (component, mock) } - /// Build a single-entry `DutyDefinitionSet` keyed by + /// Build a single-entry `DutyDefinitionSet` keyed by /// `pubkey`. The inner `ProposerDuty` value is a default placeholder /// — `lookup_proposer_pubkey` only reads the map keys, so the /// value's contents are immaterial to these tests. 
- fn proposer_def_set(pubkey: PubKey) -> DutyDefinitionSet { + fn proposer_def_set(pubkey: PubKey) -> DutyDefinitionSet { + let definition = ProposerDutyDefinition { + pubkey, + v_idx: 0, + slot: 0.into(), + }; let mut set = DutyDefinitionSet::new(); - set.insert(pubkey, DutyDefinition::new(ProposerDuty::default())); + set.insert(pubkey, DutyDefinition::Proposer(definition)); set } @@ -2472,14 +2475,22 @@ mod tests { let (mut component, _mock) = make_proposal_component().await; component.register_get_duty_definition(|_duty| async move { - let mut set: DutyDefinitionSet = DutyDefinitionSet::new(); + let mut set: DutyDefinitionSet = DutyDefinitionSet::new(); set.insert( core_pubkey(0xAA), - DutyDefinition::new(ProposerDuty::default()), + DutyDefinition::Proposer(ProposerDutyDefinition { + pubkey: core_pubkey(0xAA), + v_idx: 0, + slot: 0.into(), + }), ); set.insert( core_pubkey(0xBB), - DutyDefinition::new(ProposerDuty::default()), + DutyDefinition::Proposer(ProposerDutyDefinition { + pubkey: core_pubkey(0xBB), + v_idx: 0, + slot: 0.into(), + }), ); Ok(Box::new(set) as Box) }); diff --git a/crates/eth2api/src/valcache.rs b/crates/eth2api/src/valcache.rs index 47c9cc90..ea52ef6d 100644 --- a/crates/eth2api/src/valcache.rs +++ b/crates/eth2api/src/valcache.rs @@ -1,12 +1,4 @@ -<<<<<<<< HEAD:crates/core/src/valcache.rs -use crate::types::PubKey; -use pluto_eth2api::{ -|||||||| ed684e31:crates/app/src/eth2wrap/valcache.rs -use pluto_core::types::PubKey; -use pluto_eth2api::{ -======== use crate::{ ->>>>>>>> origin/main:crates/eth2api/src/valcache.rs EthBeaconNodeApiClient, EthBeaconNodeApiClientError, GetStateValidatorsResponseResponse, GetStateValidatorsResponseResponseDatum, PostStateValidatorsRequest, PostStateValidatorsRequestPath, PostStateValidatorsResponse, ValidatorRequestBody, From c6ec545b6e8e23736ec8cbae14eb4da185581412 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 12 Jun 2026 16:17:09 -0300 Subject: [PATCH 43/48] Cancellable inter-slot sleep --- 
crates/core/src/scheduler.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 21c4bc28..6e60fc74 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -633,7 +633,15 @@ async fn new_slot_ticker( .signed_duration_since(chrono::Utc::now()) .to_std() .unwrap_or_default(); - tokio::time::sleep(wait).await; + + if tokio::time::sleep(wait) + .with_cancellation_token(&ct) + .await + .is_none() + { + // Cancelled early + return; + }; // Avoid "thundering herd" problem by skipping slots if missed due // to pause-the-world events (i.e. resources are already constrained). From f46816126355123fa067e80308b3132e757eeb4e Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 12 Jun 2026 16:17:35 -0300 Subject: [PATCH 44/48] Remove timeout from `get_duty_definition` --- crates/core/src/scheduler.rs | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 6e60fc74..45ad3aca 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -74,10 +74,6 @@ pub enum SchedulerError { /// Duty attempted to be accessed duty: types::Duty, }, - /// Timed out while waiting for the scheduler to respond with a duty - /// definition. - #[error("Timed out while waiting for a duty definition")] - TimeoutError, /// The underlying scheduler actor has been terminated. #[error("Scheduler actor has been terminated")] @@ -253,8 +249,6 @@ pub struct SchedulerHandle { impl SchedulerHandle { /// Returns the definition for a duty if a definition exists for a resolved /// epoch. - /// - /// NOTE: this operation has a default timeout of 100 ms. 
pub async fn get_duty_definition(&self, duty: types::Duty) -> Result { let (tx, rx) = sync::oneshot::channel(); let msg = SchedulerMessage::GetDutyDefinition { duty, resp: tx }; @@ -264,12 +258,7 @@ impl SchedulerHandle { .await .map_err(|_| SchedulerError::Terminated)?; - // This has to be very rare event, when the requested epoch is being resolved. - // We wait for the epoch to be resolved before returning the duty definition. - tokio::time::timeout(Duration::from_millis(100), rx) - .await - .map_err(|_| SchedulerError::TimeoutError)? - .map_err(|_| SchedulerError::Terminated)? + rx.await.map_err(|_| SchedulerError::Terminated)? } } From 5fc9431e67d91d55178f417665b4202556429164 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 12 Jun 2026 16:18:31 -0300 Subject: [PATCH 45/48] Reduce visibility of `handle_chain_reorg` --- crates/core/src/scheduler.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 45ad3aca..a69e0e81 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -317,7 +317,7 @@ impl SchedulerActor { /// In case of a reorg of an already resolved epoch trim all duties. /// /// Duties will be resolved again in the nex slot. 
- pub async fn handle_chain_reorg(&mut self, epoch: u64) { + async fn handle_chain_reorg(&mut self, epoch: u64) { let resolved_epoch = self.resolved_epoch; if epoch < resolved_epoch { self.trim_duties(resolved_epoch); From b58d3e72fb3d514991489105decc24e4f5631b3d Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 12 Jun 2026 16:23:55 -0300 Subject: [PATCH 46/48] Log `duty` on `EpochNotResolved` error --- crates/core/src/scheduler.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index a69e0e81..432e2ed2 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -63,6 +63,9 @@ pub enum SchedulerError { EpochNotResolved { /// The unresolved epoch. epoch: u64, + + /// Duty attempted to be accessed + duty: types::Duty, }, /// Duty definition not found for a resolved epoch. @@ -201,7 +204,7 @@ impl SchedulerBuilder { .await .ok_or(SchedulerError::Terminated)??; - let slot_rx = new_slot_ticker(&client.clone(), ct.clone()).await?; + let slot_rx = new_slot_ticker(&client, ct.clone()).await?; let actor = SchedulerActor { client: client.clone(), @@ -346,7 +349,7 @@ impl SchedulerActor { .expect("non-zero"); if !self.is_epoch_resolved(epoch) { - return Err(SchedulerError::EpochNotResolved { epoch }); + return Err(SchedulerError::EpochNotResolved { epoch, duty }); } if self.is_epoch_trimmed(epoch) { @@ -1283,7 +1286,7 @@ mod tests { let att = types::Duty::new_attester_duty(slot0); assert!(matches!( actor.get_duty_definition(att.clone()).await, - Err(SchedulerError::EpochNotResolved { epoch: 0 }) + Err(SchedulerError::EpochNotResolved { epoch: 0, .. }) )); // Resolved but no duty stored. 
From abc0637d7e3075e54171f1bcf7cb8268a4039f56 Mon Sep 17 00:00:00 2001 From: Lautaro Emanuel Date: Fri, 12 Jun 2026 17:20:46 -0300 Subject: [PATCH 47/48] Nits --- crates/core/src/scheduler.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index 432e2ed2..d033af47 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -233,6 +233,7 @@ impl Default for SchedulerBuilder { Self::new() } } + enum SchedulerMessage { GetDutyDefinition { duty: types::Duty, @@ -341,6 +342,7 @@ impl SchedulerActor { return Err(SchedulerError::DeprecatedDutyBuilderProposer); } + // TODO: `client.fetch_slots_config` should be cached. let (_, slots_per_epoch) = self.client.fetch_slots_config().await?; let epoch = duty .slot From 6a25109aaa6341f551754d0a02728dd75d8917eb Mon Sep 17 00:00:00 2001 From: "emlautarom1-agent[bot]" <292495798+emlautarom1-agent[bot]@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:21:14 -0300 Subject: [PATCH 48/48] Test cancellation during slot offset wait --- crates/core/src/scheduler.rs | 53 ++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/crates/core/src/scheduler.rs b/crates/core/src/scheduler.rs index d033af47..b7491285 100644 --- a/crates/core/src/scheduler.rs +++ b/crates/core/src/scheduler.rs @@ -1135,6 +1135,20 @@ mod tests { } } + /// A [`types::Slot`] dated at `now` with a short slot duration, so + /// `delay_slot_offset` still has a live wait pending when the duty task is + /// spawned (unlike [`test_past_slot`], whose deadline has already elapsed). + /// The sync-committee contribution offset is 2/3 of the slot duration + /// (~600ms here), leaving a window to cancel before the broadcast fires. 
+ fn test_future_slot(slot: u64, slots_per_epoch: u64) -> types::Slot { + types::Slot { + slot: types::SlotNumber::new(slot), + time: chrono::Utc::now(), + slot_duration: chrono::Duration::milliseconds(900), + slots_per_epoch, + } + } + /// Builds an attester duty definition for tests. fn test_attester_def(pubkey: types::PubKey, v_idx: u64, slot: u64) -> types::DutyDefinition { let datum = pluto_eth2api::types::GetAttesterDutiesResponseResponseDatum { @@ -1525,4 +1539,43 @@ mod tests { h.ct.cancel(); } + + #[tokio::test] + async fn cancellation_during_slot_offset_suppresses_duty_broadcast() { + let mock = duties_mock(16).await; + mount_head_validators(&mock, validator_set_a_datums()).await; + let mut h = spawn_actor(&mock); + + // A mid-epoch slot triggers only the sync-committee contribution duty, + // whose broadcast is delayed by 2/3 of the slot duration (~600ms here). + // Dated at `now`, the offset deadline is still in the future when the + // duty task is spawned, so it parks on the live `delay_slot_offset` wait + // inside `with_cancellation_token_owned`. + h.slot_tx + .send(test_future_slot(5, 16)) + .await + .expect("send slot"); + + // The slot itself broadcasts immediately, before the offset wait. + let slot = tokio::time::timeout(Duration::from_secs(2), h.slot_sub.recv()) + .await + .expect("slot broadcast within timeout") + .expect("slot value"); + assert_eq!(slot.slot.inner(), 5); + + // Cancel while the duty task is still waiting on the offset. + h.ct.cancel(); + + // No duty value must ever arrive: the offset wait is cancelled before + // its deadline. Wait past the ~600ms deadline to catch a regression + // where cancellation is not wired into `delay_slot_offset`. A timeout or + // a closed channel (the actor shut down and dropped its sender) both + // mean no broadcast fired; only a received duty (`Ok(Ok(_))`) is a + // failure. 
+ let next = tokio::time::timeout(Duration::from_secs(1), h.duty_sub.recv()).await; + assert!( + !matches!(next, Ok(Ok(_))), + "expected no duty broadcast after cancellation, got {next:?}" + ); + } }