From 1a7886c0ce7c05ff63ef7e052a9457285211aba2 Mon Sep 17 00:00:00 2001 From: chaodu-agent Date: Mon, 11 May 2026 11:26:56 +0000 Subject: [PATCH 1/2] fix: add reconnect loop for Discord gateway MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When serenity's client.start() returns (either Ok or transient error), the Discord adapter now automatically reconnects with exponential backoff instead of silently dying. - Wrap client build + start in a retry loop - Fatal errors (bad token, bad intents) still exit immediately - Transient errors use exponential backoff (1s → 60s max) - Successful sessions reset backoff to 1s - Graceful shutdown via shutdown_rx breaks the loop - Log reconnect attempts at WARN level for observability Fixes #790 --- src/main.rs | 137 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 85 insertions(+), 52 deletions(-) diff --git a/src/main.rs b/src/main.rs index 706079b6..22253e6a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -360,6 +360,14 @@ async fn main() -> anyhow::Result<()> { None }; + // Spawn shutdown signal listener that notifies all adapters via shutdown_tx. + let shutdown_tx_signal = shutdown_tx.clone(); + tokio::spawn(async move { + shutdown_signal().await; + info!("shutdown signal received"); + let _ = shutdown_tx_signal.send(true); + }); + // Run Discord adapter (foreground, blocking) or wait for ctrl_c if let Some(discord_cfg) = cfg.discord { let allow_all_channels = config::resolve_allow_all( @@ -403,71 +411,96 @@ async fn main() -> anyhow::Result<()> { )); dispatchers.lock().unwrap().push(discord_dispatcher.clone()); - let handler = discord::Handler { - router, - allow_all_channels, - allow_all_users, - allowed_channels, - allowed_users, - stt_config: cfg.stt.clone(), - adapter: std::sync::OnceLock::new(), - allow_bot_messages: discord_cfg.allow_bot_messages, - trusted_bot_ids, - allow_user_messages: discord_cfg.allow_user_messages, - allowed_role_ids, - participated_threads: tokio::sync::Mutex::new(std::collections::HashMap::new()), - multibot_threads: tokio::sync::Mutex::new(std::collections::HashMap::new()), - session_ttl: std::time::Duration::from_secs(ttl_secs), - max_bot_turns: discord_cfg.max_bot_turns, - bot_turns: tokio::sync::Mutex::new(bot_turns::BotTurnTracker::new( - discord_cfg.max_bot_turns, - )), - allow_dm: discord_cfg.allow_dm, - dispatcher: discord_dispatcher, - }; - let intents = GatewayIntents::GUILD_MESSAGES | GatewayIntents::MESSAGE_CONTENT | GatewayIntents::GUILDS | GatewayIntents::DIRECT_MESSAGES; - let mut client = Client::builder(&discord_cfg.bot_token, intents) - .event_handler(handler) - .await?; + let mut reconnect_delay = std::time::Duration::from_secs(1); + const MAX_RECONNECT_DELAY: std::time::Duration = std::time::Duration::from_secs(60); + let mut shutdown_rx_discord = shutdown_rx.clone(); - // Graceful Discord shutdown on ctrl_c - let shard_manager = client.shard_manager.clone(); - tokio::spawn(async move { - shutdown_signal().await; - info!("shutdown signal received"); - shard_manager.shutdown_all().await; - }); + loop { + let handler = discord::Handler { + router: router.clone(), + allow_all_channels, + allow_all_users, + allowed_channels: allowed_channels.clone(), + allowed_users: allowed_users.clone(), + stt_config: cfg.stt.clone(), + adapter: std::sync::OnceLock::new(), + allow_bot_messages: discord_cfg.allow_bot_messages, + trusted_bot_ids: trusted_bot_ids.clone(), + allow_user_messages: discord_cfg.allow_user_messages, + allowed_role_ids: allowed_role_ids.clone(), + participated_threads: tokio::sync::Mutex::new(std::collections::HashMap::new()), + multibot_threads: tokio::sync::Mutex::new(std::collections::HashMap::new()), + session_ttl: std::time::Duration::from_secs(ttl_secs), + max_bot_turns: discord_cfg.max_bot_turns, + bot_turns: tokio::sync::Mutex::new(bot_turns::BotTurnTracker::new( + discord_cfg.max_bot_turns, + )), + allow_dm: discord_cfg.allow_dm, + dispatcher: discord_dispatcher.clone(), + }; + + let mut client = Client::builder(&discord_cfg.bot_token, intents) + .event_handler(handler) + .await?; + + let shard_manager = client.shard_manager.clone(); + let mut shutdown_rx_inner = shutdown_rx.clone(); + tokio::spawn(async move { + let _ = shutdown_rx_inner.changed().await; + shard_manager.shutdown_all().await; + }); + + info!("discord bot running"); + let result = client.start().await; + + // Check if we're shutting down — if so, don't reconnect. + if *shutdown_rx_discord.borrow() { + break; + } - info!("discord bot running"); - match client.start().await { - Err(serenity::Error::Gateway(GatewayError::DisallowedGatewayIntents)) => { - error!( - "Discord rejected privileged intents. \ - Enable MESSAGE CONTENT INTENT at: \ - https://discord.com/developers/applications → Bot → Privileged Gateway Intents" - ); - std::process::exit(1); + match result { + Err(serenity::Error::Gateway(GatewayError::DisallowedGatewayIntents)) => { + error!( + "Discord rejected privileged intents. \ + Enable MESSAGE CONTENT INTENT at: \ + https://discord.com/developers/applications → Bot → Privileged Gateway Intents" + ); + std::process::exit(1); + } + Err(serenity::Error::Gateway(GatewayError::InvalidAuthentication)) => { + error!( + "Discord rejected bot token. \ + Verify your bot_token in config.toml is correct and has not been reset." + ); + std::process::exit(1); + } + Err(e) => { + warn!(error = %e, delay_secs = reconnect_delay.as_secs(), "discord gateway error, reconnecting"); + } + Ok(_) => { + // Gateway ran successfully then disconnected — reset backoff. + reconnect_delay = std::time::Duration::from_secs(1); + warn!("discord gateway exited, reconnecting in 1s"); + } } - Err(serenity::Error::Gateway(GatewayError::InvalidAuthentication)) => { - error!( - "Discord rejected bot token. \ - Verify your bot_token in config.toml is correct and has not been reset." - ); - std::process::exit(1); + + tokio::select! { + _ = tokio::time::sleep(reconnect_delay) => {} + _ = shutdown_rx_discord.changed() => { break; } } - Err(e) => return Err(e.into()), - Ok(_) => {} + // Escalate delay only for errors (Ok resets above). + reconnect_delay = (reconnect_delay * 2).min(MAX_RECONNECT_DELAY); } } else { // No Discord — wait for SIGINT or SIGTERM info!("running without discord, press ctrl+c to stop"); - shutdown_signal().await; - info!("shutdown signal received"); + let mut shutdown_rx_wait = shutdown_rx.clone(); + let _ = shutdown_rx_wait.changed().await; } // Cleanup From b338de2482df94711fe55eff07e18e98434e6852 Mon Sep 17 00:00:00 2001 From: chaodu-agent Date: Mon, 11 May 2026 12:35:50 +0000 Subject: [PATCH 2/2] fix: address review findings (F1 builder error handling, F2 task accumulation, F3 backoff logic) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - F1 (🔴): Wrap Client::builder().await in match to retry on transient build failures instead of crashing main with ? - F2 (🟡): Abort shutdown listener task after client.start() returns to prevent task accumulation across reconnect iterations - F3 (🟡): Move backoff escalation into Err arm only; Ok path resets to 1s and does not escalate --- src/main.rs | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/src/main.rs b/src/main.rs index 22253e6a..4fbdf642 100644 --- a/src/main.rs +++ b/src/main.rs @@ -444,13 +444,27 @@ async fn main() -> anyhow::Result<()> { dispatcher: discord_dispatcher.clone(), }; - let mut client = Client::builder(&discord_cfg.bot_token, intents) + // F1 fix: handle builder errors within the loop instead of propagating with ? + let mut client = match Client::builder(&discord_cfg.bot_token, intents) .event_handler(handler) - .await?; + .await + { + Ok(c) => c, + Err(e) => { + warn!(error = %e, delay_secs = reconnect_delay.as_secs(), "failed to build discord client, retrying"); + tokio::select! { + _ = tokio::time::sleep(reconnect_delay) => {} + _ = shutdown_rx_discord.changed() => { break; } + } + reconnect_delay = (reconnect_delay * 2).min(MAX_RECONNECT_DELAY); + continue; + } + }; + // F2 fix: use an abort handle so the shutdown listener is cleaned up each iteration let shard_manager = client.shard_manager.clone(); let mut shutdown_rx_inner = shutdown_rx.clone(); - tokio::spawn(async move { + let shutdown_task = tokio::spawn(async move { let _ = shutdown_rx_inner.changed().await; shard_manager.shutdown_all().await; }); @@ -458,6 +472,9 @@ async fn main() -> anyhow::Result<()> { info!("discord bot running"); let result = client.start().await; + // Abort the shutdown listener for this iteration to avoid accumulation. + shutdown_task.abort(); + // Check if we're shutting down — if so, don't reconnect. if *shutdown_rx_discord.borrow() { break; @@ -481,20 +498,23 @@ async fn main() -> anyhow::Result<()> { } Err(e) => { warn!(error = %e, delay_secs = reconnect_delay.as_secs(), "discord gateway error, reconnecting"); + // F3 fix: escalate backoff only on errors + tokio::select! { + _ = tokio::time::sleep(reconnect_delay) => {} + _ = shutdown_rx_discord.changed() => { break; } + } + reconnect_delay = (reconnect_delay * 2).min(MAX_RECONNECT_DELAY); } Ok(_) => { - // Gateway ran successfully then disconnected — reset backoff. + // Gateway ran successfully then disconnected — reset backoff and retry quickly. reconnect_delay = std::time::Duration::from_secs(1); warn!("discord gateway exited, reconnecting in 1s"); + tokio::select! { + _ = tokio::time::sleep(reconnect_delay) => {} + _ = shutdown_rx_discord.changed() => { break; } + } } } - - tokio::select! { - _ = tokio::time::sleep(reconnect_delay) => {} - _ = shutdown_rx_discord.changed() => { break; } - } - // Escalate delay only for errors (Ok resets above). - reconnect_delay = (reconnect_delay * 2).min(MAX_RECONNECT_DELAY); } } else { // No Discord — wait for SIGINT or SIGTERM