diff --git a/README.md b/README.md index 7f248a6..6fe7934 100644 --- a/README.md +++ b/README.md @@ -170,6 +170,33 @@ OTEL_TRACES_SAMPLER=traceidratio OTEL_TRACES_SAMPLER_ARG=1.0 ``` +### Cron Monitoring + +If you run this tool on a schedule, you'll often want to be alerted when a backup +run fails to start or complete. To support this, GitHub Backup can report the +state of each scheduled run to an HTTP-based cron monitoring service such as +[Sentry Cron Monitors](https://docs.sentry.io/product/crons/) or +[healthchecks.io](https://healthchecks.io/). + +Monitoring is configured under the top-level `ping` key, where you can provide +a separate URL for each state you care about. Each URL is fetched with a simple +HTTP `GET` request when the corresponding state is reached, and any state you +omit is simply not reported. + +```yaml +ping: + # Fetched when a backup run starts. + start: https://sentry.io/api/0/organizations/your-org/monitors/github-backup/checkins/?status=in_progress + # Fetched when a backup run completes successfully. + success: https://sentry.io/api/0/organizations/your-org/monitors/github-backup/checkins/?status=ok + # Fetched when a backup run completes with one or more errors. + failure: https://sentry.io/api/0/organizations/your-org/monitors/github-backup/checkins/?status=error +``` + +A run is reported as a `failure` if any policy reports one or more errors, and as +a `success` otherwise. Reporting is best-effort: if the monitoring service can't +be reached, a warning is logged but the backup run itself is unaffected. 
+ ## Filters This tool allows you to configure filters to control which GitHub repositories are backed up and diff --git a/docs/.vuepress/config.ts b/docs/.vuepress/config.ts index 84ef8ab..8dca884 100644 --- a/docs/.vuepress/config.ts +++ b/docs/.vuepress/config.ts @@ -79,7 +79,8 @@ export default defineUserConfig({ children: [ '/guide/README.md', '/guide/enterprise.md', - '/guide/telemetry.md' + '/guide/telemetry.md', + '/guide/monitors.md' ] }, { diff --git a/docs/guide/monitors.md b/docs/guide/monitors.md new file mode 100644 index 0000000..1cb8d2a --- /dev/null +++ b/docs/guide/monitors.md @@ -0,0 +1,69 @@ +# Cron Monitoring +GitHub Backup is designed to run unattended on a schedule, which makes it +important to know when a backup run fails to start or complete. To support this, +GitHub Backup can report the state of each scheduled run to an HTTP-based cron +monitoring service such as [Sentry Cron Monitors](https://docs.sentry.io/product/crons/) +or [healthchecks.io](https://healthchecks.io/). + +Whenever a backup run starts or completes, GitHub Backup will make a simple HTTP +`GET` request to the URL you've configured for that state, allowing your +monitoring service to track whether your backups are running as expected and to +alert you if they stop. + +## Configuration +Monitoring is configured under the top-level `ping` key in your configuration +file. You may provide a separate URL for each of the `start`, `success`, and +`failure` states, and any state you leave out is simply not reported. + +```yaml +schedule: "0 * * * *" + +ping: + # Fetched when a backup run starts. + start: https://example.com/monitor/start + # Fetched when a backup run completes successfully. + success: https://example.com/monitor/success + # Fetched when a backup run completes with one or more errors. 
+ failure: https://example.com/monitor/failure + +backups: + - kind: github/repo + from: user + to: /backup/github + credentials: !Token your_access_token +``` + +A run is reported as a `failure` if any backup policy reports one or more errors, +and as a `success` otherwise. + +::: tip +Reporting is best-effort. If the monitoring service can't be reached, a warning is +logged but the backup run itself is never affected, ensuring that a flaky monitor +can't cause an otherwise healthy backup to be reported as failed. +::: + +## Examples + +### Sentry +[Sentry's Cron Monitors](https://docs.sentry.io/product/crons/getting-started/http/) +expose a check-in URL which accepts a `status` query parameter. You can point each +state at the same URL while varying the `status` value to report the lifecycle of +your backups. + +```yaml +ping: + start: https://sentry.io/api/0/organizations/your-org/monitors/github-backup/checkins/?status=in_progress + success: https://sentry.io/api/0/organizations/your-org/monitors/github-backup/checkins/?status=ok + failure: https://sentry.io/api/0/organizations/your-org/monitors/github-backup/checkins/?status=error +``` + +### healthchecks.io +[healthchecks.io](https://healthchecks.io/) provides a base ping URL, with +`/start` and `/fail` suffixes used to signal the start and failure of a run. + +```yaml +ping: + start: https://hc-ping.com/your-uuid/start + success: https://hc-ping.com/your-uuid + failure: https://hc-ping.com/your-uuid/fail +``` diff --git a/examples/config.yaml b/examples/config.yaml index e067adc..39b362d 100644 --- a/examples/config.yaml +++ b/examples/config.yaml @@ -1,5 +1,14 @@ schedule: "0 * * * *" +# Optionally report the status of each backup run to an HTTP-based cron +# monitoring service (such as Sentry Crons or healthchecks.io). Each URL is +# fetched with a simple HTTP GET request when the corresponding state is +# reached, and any state you leave out is simply not reported. 
+ping: + start: https://sentry.io/api/0/organizations/your-org/monitors/github-backup/checkins/?status=in_progress + success: https://sentry.io/api/0/organizations/your-org/monitors/github-backup/checkins/?status=ok + failure: https://sentry.io/api/0/organizations/your-org/monitors/github-backup/checkins/?status=error + backups: # Backup all the repositories that the provided credentials have access to - kind: github/repo diff --git a/src/config.rs b/src/config.rs index 9b84221..8798d32 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2,13 +2,16 @@ use human_errors::ResultExt; use serde::{Deserialize, Deserializer}; use std::str::FromStr; -use crate::{Args, policy::BackupPolicy}; +use crate::{Args, ping::PingConfig, policy::BackupPolicy}; #[derive(Deserialize)] pub struct Config { #[serde(default, deserialize_with = "deserialize_cron")] pub schedule: Option, + #[serde(default)] + pub ping: PingConfig, + #[serde(default)] pub backups: Vec, } @@ -63,6 +66,38 @@ mod tests { assert!(config.schedule.is_none()); } + #[test] + fn deserialize_ping_not_provided() { + let config: Config = serde_yaml::from_str("").unwrap(); + assert_eq!(config.ping, crate::ping::PingConfig::default()); + } + + #[test] + fn deserialize_ping() { + let config: Config = serde_yaml::from_str( + r#" + ping: + start: https://example.com/start + success: https://example.com/success + failure: https://example.com/failure + "#, + ) + .unwrap(); + + assert_eq!( + config.ping.start.as_deref(), + Some("https://example.com/start") + ); + assert_eq!( + config.ping.success.as_deref(), + Some("https://example.com/success") + ); + assert_eq!( + config.ping.failure.as_deref(), + Some("https://example.com/failure") + ); + } + #[test] #[cfg_attr(feature = "pure_tests", ignore)] fn deserialize_example_config() { diff --git a/src/main.rs b/src/main.rs index 3505675..d0ba53b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,7 +2,8 @@ use clap::Parser; use engines::BackupState; use human_errors::Error; use 
pairing::PairingHandler; -use std::sync::atomic::AtomicBool; +use ping::Pinger; +use std::sync::atomic::{AtomicBool, AtomicUsize}; use std::time::Duration; use tracing_batteries::prelude::*; use tracing_batteries::{OpenTelemetry, Session, Umami}; @@ -16,6 +17,7 @@ mod entities; mod errors; pub(crate) mod helpers; mod pairing; +mod ping; mod policy; mod sources; mod target; @@ -51,6 +53,8 @@ pub struct Args { async fn run(args: Args, session: &Session) -> Result<(), Error> { let config = config::Config::try_from(&args)?; + let pinger = Pinger::new(config.ping.clone()); + let github_repo = pairing::Pairing::new( sources::GitHubRepoSource::default(), engines::RepoEngine::new(), @@ -78,6 +82,10 @@ async fn run(args: Args, session: &Session) -> Result<(), Error> { .as_ref() .and_then(|s| s.find_next_occurrence(&chrono::Utc::now(), false).ok()); + let handler = LoggingPairingHandler::default(); + + pinger.on_start().await; + { let _span = info_span!("backup.all").entered(); @@ -88,21 +96,15 @@ async fn run(args: Args, session: &Session) -> Result<(), Error> { match policy.kind.as_str() { k if k == GitHubArtifactKind::Repo.as_str() => { info!("Backing up repositories for {}", &policy); - github_repo - .run(policy, &LoggingPairingHandler, &CANCEL) - .await; + github_repo.run(policy, &handler, &CANCEL).await; } k if k == GitHubArtifactKind::Release.as_str() => { info!("Backing up release artifacts for {}", &policy); - github_release - .run(policy, &LoggingPairingHandler, &CANCEL) - .await; + github_release.run(policy, &handler, &CANCEL).await; } k if k == GitHubArtifactKind::Gist.as_str() => { info!("Backing up gist artifacts for {}", &policy); - github_gist - .run(policy, &LoggingPairingHandler, &CANCEL) - .await; + github_gist.run(policy, &handler, &CANCEL).await; } _ => { error!("Unknown policy kind: {}", policy.kind); @@ -112,9 +114,17 @@ async fn run(args: Args, session: &Session) -> Result<(), Error> { } if CANCEL.load(std::sync::atomic::Ordering::Relaxed) { + // The 
run was interrupted (e.g. by SIGINT), so we deliberately avoid + // reporting either success or failure to the cron monitor. break; } + if handler.errors() > 0 { + pinger.on_failure().await; + } else { + pinger.on_success().await; + } + if let Some(next_run) = next_run { info!("Next backup scheduled for: {}", next_run); @@ -131,7 +141,19 @@ async fn run(args: Args, session: &Session) -> Result<(), Error> { Ok(()) } -pub struct LoggingPairingHandler; +#[derive(Default)] +pub struct LoggingPairingHandler { + errors: AtomicUsize, +} + +impl LoggingPairingHandler { + /// The total number of errors observed across every policy reported to this + /// handler, used to decide whether a backup run should be reported as a + /// success or a failure to the cron monitor. + fn errors(&self) -> usize { + self.errors.load(std::sync::atomic::Ordering::Relaxed) + } +} impl PairingHandler for LoggingPairingHandler { fn on_complete(&self, entity: E, state: BackupState) { @@ -144,6 +166,8 @@ impl PairingHandler for LoggingPairingHandler { } fn on_error(&self, error: Error) { + self.errors + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); warn!("Error: {}", error); } @@ -186,3 +210,26 @@ async fn main() { session.shutdown(); } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::entities::GitRepo; + + #[test] + fn logging_handler_counts_errors() { + let handler = LoggingPairingHandler::default(); + assert_eq!(handler.errors(), 0); + + // Each reported error should be accumulated so that a run with any + // failures can be reported to the cron monitor as a failure. + PairingHandler::::on_error(&handler, human_errors::user("boom", &[])); + PairingHandler::::on_error(&handler, human_errors::user("boom", &[])); + assert_eq!(handler.errors(), 2); + + // Successful completions must not affect the error count. 
+        let repo = GitRepo::new("octocat/Hello-World", "https://example.com/repo.git", None);
+        handler.on_complete(repo, BackupState::Skipped);
+        assert_eq!(handler.errors(), 2);
+    }
+}
diff --git a/src/ping.rs b/src/ping.rs
new file mode 100644
index 0000000..551dd47
--- /dev/null
+++ b/src/ping.rs
@@ -0,0 +1,188 @@
+use serde::Deserialize;
+use tracing_batteries::prelude::*;
+
+/// The maximum time to wait for a cron monitor to answer a single ping.
+///
+/// `reqwest` applies no timeout by default, so without this limit an
+/// unresponsive monitor could stall a backup run indefinitely.
+const PING_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);
+
+/// Configuration for an HTTP-based cron monitoring solution (such as
+/// [Sentry Cron Monitors](https://docs.sentry.io/product/crons/) or
+/// [healthchecks.io](https://healthchecks.io)).
+///
+/// Each field holds an optional URL which will be fetched (via an HTTP `GET`
+/// request) when the corresponding state is reached during a scheduled backup
+/// run. Any field which is left unset is simply skipped, allowing you to report
+/// only the states you care about.
+#[derive(Debug, Default, Clone, Deserialize, PartialEq, Eq)]
+pub struct PingConfig {
+    /// The URL to fetch when a backup run starts.
+    #[serde(default)]
+    pub start: Option<String>,
+
+    /// The URL to fetch when a backup run completes successfully.
+    #[serde(default)]
+    pub success: Option<String>,
+
+    /// The URL to fetch when a backup run completes with one or more errors.
+    #[serde(default)]
+    pub failure: Option<String>,
+}
+
+/// Reports the lifecycle of a backup run to an HTTP-based cron monitoring
+/// service by issuing simple `GET` requests to the URLs configured in
+/// [`PingConfig`].
+///
+/// Reporting is best-effort: failures to reach the monitoring service are
+/// logged but never propagated, ensuring that a flaky monitor can never cause
+/// an otherwise healthy backup run to be reported as failed.
+pub struct Pinger {
+    config: PingConfig,
+    client: reqwest::Client,
+}
+
+impl Pinger {
+    /// Creates a pinger which reports to the URLs in `config`.
+    pub fn new(config: PingConfig) -> Self {
+        Self {
+            config,
+            client: reqwest::Client::new(),
+        }
+    }
+
+    /// Report that a backup run has started.
+    pub async fn on_start(&self) {
+        self.ping("start", self.config.start.as_deref()).await;
+    }
+
+    /// Report that a backup run has completed successfully.
+    pub async fn on_success(&self) {
+        self.ping("success", self.config.success.as_deref()).await;
+    }
+
+    /// Report that a backup run has completed with one or more errors.
+    pub async fn on_failure(&self) {
+        self.ping("failure", self.config.failure.as_deref()).await;
+    }
+
+    /// Fetches `url` to report `state`, doing nothing when `url` is `None`.
+    ///
+    /// Transport errors, timeouts and non-success HTTP statuses are logged as
+    /// warnings and otherwise ignored.
+    #[tracing::instrument(skip(self, url), fields(ping.state = state))]
+    async fn ping(&self, state: &str, url: Option<&str>) {
+        let Some(url) = url else {
+            return;
+        };
+
+        debug!("Reporting '{state}' state to cron monitor.");
+
+        match self
+            .client
+            .get(url)
+            .header("User-Agent", "SierraSoftworks/github-backup")
+            // Bound the whole request so a hung monitor cannot block the run.
+            .timeout(PING_TIMEOUT)
+            .send()
+            .await
+        {
+            Ok(resp) if resp.status().is_success() => {
+                debug!("Successfully reported '{state}' state to cron monitor.");
+            }
+            Ok(resp) => {
+                warn!(
+                    "Cron monitor returned HTTP {} when reporting the '{state}' state.",
+                    resp.status()
+                );
+            }
+            Err(e) => {
+                warn!("Failed to report the '{state}' state to the cron monitor: {e}");
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use wiremock::matchers::{method, path};
+    use wiremock::{Mock, MockServer, ResponseTemplate};
+
+    #[test]
+    fn deserialize_empty() {
+        let config: PingConfig = serde_yaml::from_str("{}").unwrap();
+        assert_eq!(config, PingConfig::default());
+    }
+
+    #[test]
+    fn deserialize_partial() {
+        let config: PingConfig = serde_yaml::from_str("start: https://example.com/start").unwrap();
+        assert_eq!(config.start.as_deref(), Some("https://example.com/start"));
+        assert_eq!(config.success, None);
+        assert_eq!(config.failure, None);
+    }
+
+    #[tokio::test]
+    async fn reports_each_state() {
+        let server = MockServer::start().await;
+
+        for state in ["start", "success", "failure"] {
+            Mock::given(method("GET"))
+                .and(path(format!("/{state}")))
+                .respond_with(ResponseTemplate::new(200))
+                .expect(1)
+                .mount(&server)
+                .await;
+        }
+
+        let pinger = Pinger::new(PingConfig {
+            start: Some(format!("{}/start", server.uri())),
+            success: Some(format!("{}/success", server.uri())),
+            failure: Some(format!("{}/failure", server.uri())),
+        });
+
+        pinger.on_start().await;
+        pinger.on_success().await;
+        pinger.on_failure().await;
+
+        // `MockServer` verifies the `.expect(1)` expectations when dropped.
+    }
+
+    #[tokio::test]
+    async fn unconfigured_states_make_no_request() {
+        let server = MockServer::start().await;
+
+        // Any request reaching the server would fail the `expect(0)` guard.
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200))
+            .expect(0)
+            .mount(&server)
+            .await;
+
+        let pinger = Pinger::new(PingConfig::default());
+
+        pinger.on_start().await;
+        pinger.on_success().await;
+        pinger.on_failure().await;
+    }
+
+    #[tokio::test]
+    async fn error_responses_are_swallowed() {
+        let server = MockServer::start().await;
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(500))
+            .expect(1)
+            .mount(&server)
+            .await;
+
+        let pinger = Pinger::new(PingConfig {
+            success: Some(format!("{}/success", server.uri())),
+            ..Default::default()
+        });
+
+        // This must not panic even though the monitor returned an error status.
+        pinger.on_success().await;
+    }
+}