Skip to content

Commit 7d947c6

Browse files
committed
feat: add back path congestion metrics
1 parent d44cfc2 commit 7d947c6

File tree

3 files changed

+229
-60
lines changed

3 files changed

+229
-60
lines changed

iroh/src/magicsock/metrics.rs

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use iroh_metrics::{Counter, MetricsGroup};
1+
use iroh_metrics::{Counter, Histogram, MetricsGroup};
22
use serde::{Deserialize, Serialize};
33

44
/// Enum of metrics for the module
@@ -14,8 +14,11 @@ pub struct Metrics {
1414
pub send_ipv4: Counter,
1515
pub send_ipv6: Counter,
1616
pub send_relay: Counter,
17+
pub send_relay_error: Counter,
1718

1819
// Data packets (non-disco)
20+
pub send_data: Counter,
21+
pub send_data_network_down: Counter,
1922
pub recv_data_relay: Counter,
2023
pub recv_data_ipv4: Counter,
2124
pub recv_data_ipv6: Counter,
@@ -69,25 +72,27 @@ pub struct Metrics {
6972
pub actor_tick_direct_addr_heartbeat: Counter,
7073
pub actor_link_change: Counter,
7174
pub actor_tick_other: Counter,
72-
// /// Histogram of connection latency in milliseconds across all endpoint connections.
73-
// #[default(Histogram::new(vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, f64::INFINITY]))]
74-
// pub connection_latency_ms: Histogram,
75-
// /*
76-
// * Path Congestion Metrics
77-
// */
75+
76+
/// Histogram of connection latency in milliseconds across all endpoint connections.
77+
#[default(Histogram::new(vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, f64::INFINITY]))]
78+
pub connection_latency_ms: Histogram,
79+
80+
/*
81+
* Path Congestion Metrics
82+
*/
7883
// /// Number of times a path was marked as outdated due to consecutive ping failures.
7984
// pub path_marked_outdated: Counter,
8085
// /// Number of ping failures recorded across all paths.
8186
// pub path_ping_failures: Counter,
8287
// /// Number of consecutive failure resets (path recovered).
8388
// pub path_failure_resets: Counter,
84-
// /// Histogram of packet loss rates (0.0-1.0) observed on UDP paths.
85-
// #[default(Histogram::new(vec![0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0]))]
86-
// pub path_packet_loss_rate: Histogram,
87-
// /// Histogram of RTT variance (in milliseconds) as a congestion indicator.
88-
// #[default(Histogram::new(vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0]))]
89-
// pub path_rtt_variance_ms: Histogram,
90-
// /// Histogram of path quality scores (0.0-1.0).
91-
// #[default(Histogram::new(vec![0.0, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0]))]
92-
// pub path_quality_score: Histogram,
89+
/// Histogram of packet loss rates (0.0-1.0) observed on UDP paths.
90+
#[default(Histogram::new(vec![0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0]))]
91+
pub path_packet_loss_rate: Histogram,
92+
/// Histogram of RTT variance (in milliseconds) as a congestion indicator.
93+
#[default(Histogram::new(vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0]))]
94+
pub path_rtt_variance_ms: Histogram,
95+
/// Histogram of path quality scores (0.0-1.0).
96+
#[default(Histogram::new(vec![0.0, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0]))]
97+
pub path_quality_score: Histogram,
9398
}

iroh/src/magicsock/remote_map/remote_state.rs

Lines changed: 50 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ use crate::{
4545
mod guarded_channel;
4646
mod path_state;
4747

48+
#[cfg(feature = "metrics")]
49+
mod metrics;
50+
4851
// TODO: use this
4952
// /// Number of addresses that are not active that we keep around per endpoint.
5053
// ///
@@ -83,6 +86,9 @@ const APPLICATION_ABANDON_PATH: u8 = 30;
8386
/// in a high frequency, and to keep data about previous path around for subsequent connections.
8487
const ACTOR_MAX_IDLE_TIMEOUT: Duration = Duration::from_secs(60);
8588

89+
/// Interval in which connection and path metrics are emitted.
90+
const METRICS_INTERVAL: Duration = Duration::from_secs(10);
91+
8692
/// A stream of events from all paths for all connections.
8793
///
8894
/// The connection is identified using [`ConnId`]. The event `Err` variant happens when the
@@ -240,8 +246,12 @@ impl RemoteStateActor {
240246
/// discipline is needed to not turn pending for a long time.
241247
async fn run(mut self, mut inbox: GuardedReceiver<RemoteStateMessage>) {
242248
trace!("actor started");
249+
243250
let idle_timeout = time::sleep(ACTOR_MAX_IDLE_TIMEOUT);
244251
n0_future::pin!(idle_timeout);
252+
253+
let mut metrics_interval = time::interval(METRICS_INTERVAL);
254+
245255
loop {
246256
let scheduled_path_open = match self.scheduled_open_path {
247257
Some(when) => MaybeFuture::Some(time::sleep_until(when)),
@@ -296,6 +306,9 @@ impl RemoteStateActor {
296306
item = self.discovery_stream.next() => {
297307
self.handle_discovery_item(item);
298308
}
309+
_ = metrics_interval.tick() => {
310+
self.record_metrics();
311+
}
299312
_ = &mut idle_timeout => {
300313
if self.connections.is_empty() && inbox.close_if_idle() {
301314
trace!("idle timeout expired and still idle: terminate actor");
@@ -430,7 +443,8 @@ impl RemoteStateActor {
430443
paths: Default::default(),
431444
open_paths: Default::default(),
432445
path_ids: Default::default(),
433-
transport_summary: TransportSummary::default(),
446+
#[cfg(feature = "metrics")]
447+
metrics: Default::default(),
434448
})
435449
.into_mut();
436450

@@ -589,8 +603,10 @@ impl RemoteStateActor {
589603

590604
fn handle_connection_close(&mut self, conn_id: ConnId) {
591605
if let Some(state) = self.connections.remove(&conn_id) {
592-
self.metrics.num_conns_closed.inc();
593-
state.transport_summary.record(&self.metrics);
606+
#[cfg(feature = "metrics")]
607+
state.metrics.record_closed(&self.metrics);
608+
#[cfg(not(feature = "metrics"))]
609+
let _ = state;
594610
}
595611
if self.connections.is_empty() {
596612
trace!("last connection closed - clearing selected_path");
@@ -1066,6 +1082,13 @@ impl RemoteStateActor {
10661082
}
10671083
}
10681084
}
1085+
1086+
fn record_metrics(&mut self) {
1087+
#[cfg(feature = "metrics")]
1088+
for state in self.connections.values_mut() {
1089+
state.record_metrics_periodic(&self.metrics, self.selected_path.get());
1090+
}
1091+
}
10691092
}
10701093

10711094
/// Messages to send to the [`RemoteStateActor`].
@@ -1174,8 +1197,11 @@ struct ConnectionState {
11741197
open_paths: FxHashMap<PathId, transports::Addr>,
11751198
/// Reverse map of [`Self::paths].
11761199
path_ids: FxHashMap<transports::Addr, PathId>,
1177-
/// Summary over transports used in this connection, for metrics tracking.
1178-
transport_summary: TransportSummary,
1200+
/// Tracker for stateful metrics for this connection and its paths
1201+
///
1202+
/// Feature-gated on the `metrics` feature because it increases memory use.
1203+
#[cfg(feature = "metrics")]
1204+
metrics: self::metrics::MetricsTracker,
11791205
}
11801206

11811207
impl ConnectionState {
@@ -1187,7 +1213,8 @@ impl ConnectionState {
11871213

11881214
/// Tracks an open path for the connection.
11891215
fn add_open_path(&mut self, remote: transports::Addr, path_id: PathId) {
1190-
self.transport_summary.add_path(&remote);
1216+
#[cfg(feature = "metrics")]
1217+
self.metrics.add_path(path_id, &remote);
11911218
self.paths.insert(path_id, remote.clone());
11921219
self.open_paths.insert(path_id, remote.clone());
11931220
self.path_ids.insert(remote, path_id);
@@ -1200,11 +1227,15 @@ impl ConnectionState {
12001227
self.path_ids.remove(&addr);
12011228
}
12021229
self.open_paths.remove(path_id);
1230+
#[cfg(feature = "metrics")]
1231+
self.metrics.remove_path(path_id);
12031232
}
12041233

12051234
/// Removes the path from the open paths.
12061235
fn remove_open_path(&mut self, path_id: &PathId) {
12071236
self.open_paths.remove(path_id);
1237+
#[cfg(feature = "metrics")]
1238+
self.metrics.remove_path(path_id);
12081239

12091240
self.update_pub_path_info();
12101241
}
@@ -1224,6 +1255,19 @@ impl ConnectionState {
12241255

12251256
self.pub_open_paths.set(new).ok();
12261257
}
1258+
1259+
#[cfg(feature = "metrics")]
1260+
fn record_metrics_periodic(
1261+
&mut self,
1262+
metrics: &MagicsockMetrics,
1263+
selected_path: Option<transports::Addr>,
1264+
) {
1265+
let Some(conn) = self.handle.upgrade() else {
1266+
return;
1267+
};
1268+
self.metrics
1269+
.record_periodic(metrics, &conn, &self.open_paths, selected_path);
1270+
}
12271271
}
12281272

12291273
/// Watcher for the open paths and selected transmission path in a connection.
@@ -1435,44 +1479,6 @@ impl Future for OnClosed {
14351479
}
14361480
}
14371481

1438-
/// Used for metrics tracking.
1439-
#[derive(Debug, Clone, Copy, Default)]
1440-
enum TransportSummary {
1441-
#[default]
1442-
None,
1443-
IpOnly,
1444-
RelayOnly,
1445-
IpAndRelay,
1446-
}
1447-
1448-
impl TransportSummary {
1449-
fn add_path(&mut self, addr: &transports::Addr) {
1450-
use transports::Addr;
1451-
*self = match (*self, addr) {
1452-
(TransportSummary::None | TransportSummary::IpOnly, Addr::Ip(_)) => Self::IpOnly,
1453-
(TransportSummary::None | TransportSummary::RelayOnly, Addr::Relay(_, _)) => {
1454-
Self::RelayOnly
1455-
}
1456-
_ => Self::IpAndRelay,
1457-
}
1458-
}
1459-
1460-
fn record(&self, metrics: &MagicsockMetrics) {
1461-
match self {
1462-
TransportSummary::IpOnly => {
1463-
metrics.num_conns_transport_ip_only.inc();
1464-
}
1465-
TransportSummary::RelayOnly => {
1466-
metrics.num_conns_transport_relay_only.inc();
1467-
}
1468-
TransportSummary::IpAndRelay => {
1469-
metrics.num_conns_transport_ip_and_relay.inc();
1470-
}
1471-
TransportSummary::None => {}
1472-
}
1473-
}
1474-
}
1475-
14761482
/// Converts an iterator of [`TransportAddr'] into an iterator of [`transports::Addr`].
14771483
fn to_transports_addr(
14781484
endpoint_id: EndpointId,

0 commit comments

Comments
 (0)