From f68d4b10ecdd515de3ebc509c5d3c7fd7730349d Mon Sep 17 00:00:00 2001 From: Alessandro Aresta Date: Tue, 18 Nov 2025 11:47:24 +0100 Subject: [PATCH] feat(security): add global tracing scrubber to prevent token exposure in logs Implements automatic sanitization of GitHub tokens and credentials across all log output using tracing-subscriber. Prevents accidental exposure of: - GitHub tokens (ghp_, gho_, ghu_, ghs_, ghr_) - Credentials in URLs (x-access-token, basic auth) - Bearer tokens Resolves critical security vulnerability where installation tokens embedded in git URLs would leak through error messages. - add tracing and tracing-subscriber to Cargo.toml --- Cargo.toml | 3 + src/app_auth.rs | 26 ++++- src/gitops.rs | 47 ++++++++- src/lib.rs | 80 ++++++++++++++ src/tracing_sanitizer.rs | 223 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 373 insertions(+), 6 deletions(-) create mode 100644 src/tracing_sanitizer.rs diff --git a/Cargo.toml b/Cargo.toml index 3793812..534131a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,9 @@ hmac = "0.12" sha2 = "0.10" glob = "0.3" thiserror = "2.0" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json"] } +regex = "1.0" [dev-dependencies] tempfile = "3.10" diff --git a/src/app_auth.rs b/src/app_auth.rs index 4ffcb70..1feffbd 100644 --- a/src/app_auth.rs +++ b/src/app_auth.rs @@ -4,6 +4,7 @@ use jsonwebtoken::{encode, Algorithm, EncodingKey, Header}; use serde::{Deserialize, Serialize}; use std::sync::Arc; use tokio::sync::Mutex; +use tracing::{debug, error, info, instrument}; #[derive(Debug, Serialize)] struct Claims { @@ -39,7 +40,10 @@ impl GitHubTokenProvider { } } + #[instrument(skip(self))] fn create_jwt(&self) -> Result { + debug!("Creating JWT for GitHub App authentication"); + let now = Utc::now(); let iat = now.timestamp(); let exp = (now + Duration::minutes(10)).timestamp(); @@ -52,10 +56,15 @@ impl GitHubTokenProvider { let key = 
EncodingKey::from_rsa_pem(self.config.private_key_pem.as_bytes())?; let token = encode(&Header::new(Algorithm::RS256), &claims, &key)?; + + debug!("JWT created successfully"); Ok(token) } + #[instrument(skip(self), fields(installation_id = %self.config.installation_id))] async fn fetch_installation_token(&self) -> Result { + info!("Fetching new installation access token"); + let jwt = self.create_jwt()?; let url = format!( @@ -63,6 +72,8 @@ impl GitHubTokenProvider { self.config.installation_id ); + debug!(url = %url, "Requesting installation token"); + let response = self .client .post(&url) @@ -76,9 +87,17 @@ impl GitHubTokenProvider { if !response.status().is_success() { let status = response.status(); let body = response.text().await?; + + // Tracing will automatically sanitize any tokens in the body + error!( + status = %status, + response_body = %body, + "Failed to get installation token" + ); + return Err(GitHubError::Other(format!( - "Failed to get installation token: {} - {}", - status, body + "Failed to get installation token: {}", + status ))); } @@ -87,6 +106,8 @@ impl GitHubTokenProvider { .map_err(|e| GitHubError::Other(format!("Failed to parse expiry time: {}", e)))? .with_timezone(&Utc); + info!(expires_at = %expires_at, "Installation token fetched successfully"); + Ok(CachedToken { token: token_response.token, expires_at, @@ -99,6 +120,7 @@ impl GitHubTokenProvider { /// 1. Quick read-only check if token is valid (fast path) /// 2. If refresh needed, acquire lock and check again /// 3. 
Only one task fetches new token, others wait and reuse it + #[instrument(skip(self))] pub async fn get_token(&self) -> Result { // Fast path: Check if we have a valid token without holding lock during HTTP { diff --git a/src/gitops.rs b/src/gitops.rs index afc150a..b2a70d8 100644 --- a/src/gitops.rs +++ b/src/gitops.rs @@ -2,6 +2,7 @@ use crate::{GitHubAppConfig, GitHubError, GitHubTokenProvider}; use serde::de::DeserializeOwned; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; +use tracing::{debug, error, info, instrument, warn}; pub struct GitHubGitOps { config: GitHubAppConfig, @@ -16,34 +17,53 @@ impl GitHubGitOps { } } + #[instrument(skip(self, args), fields(git_cmd = ?args))] fn run_git_command(&self, args: &[&str], cwd: Option<&Path>) -> Result { + debug!("Executing git command"); + let mut cmd = Command::new("git"); cmd.args(args).stdout(Stdio::piped()).stderr(Stdio::piped()); if let Some(dir) = cwd { cmd.current_dir(dir); + debug!(directory = ?dir, "Set working directory"); } let output = cmd.output()?; if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); + + // Log the error with tracing - the sanitizer will redact tokens automatically + error!( + exit_code = ?output.status.code(), + stderr = %stderr, + "Git command failed" + ); + + // Return a generic error to users (details are in logs) return Err(GitHubError::Git(format!( - "Git command failed: git {}. Error: {}", - args.join(" "), - stderr + "Git command failed. Check logs for details. 
Exit code: {:?}", + output.status.code() ))); } - Ok(String::from_utf8_lossy(&output.stdout).to_string()) + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + debug!(output_length = stdout.len(), "Git command succeeded"); + Ok(stdout) } + #[instrument(skip(self), fields(repo = %self.config.repo, branch = %self.config.branch))] pub async fn initialize(&self) -> Result<(), GitHubError> { if self.config.git_clone_path.exists() { + info!("Repository already exists, skipping clone"); return Ok(()); } + info!("Initializing repository clone"); + std::fs::create_dir_all(&self.config.git_clone_path)?; + debug!("Created clone directory"); let token = self.token_provider.get_token().await?; let clone_url = format!( @@ -51,50 +71,66 @@ impl GitHubGitOps { token, self.config.repo ); + // Tracing will automatically sanitize the clone_url in logs + debug!("Starting git clone operation"); self.run_git_command( &["clone", "--branch", &self.config.branch, &clone_url, "."], Some(&self.config.git_clone_path), )?; + info!("Repository clone completed successfully"); Ok(()) } + #[instrument(skip(self), fields(repo = %self.config.repo, branch = %self.config.branch))] pub async fn sync(&self) -> Result<(), GitHubError> { if !self.config.git_clone_path.exists() { + warn!("Repository not initialized"); return Err(GitHubError::Git( "Repository not initialized. 
Call initialize() first.".to_string(), )); } + info!("Syncing repository with remote"); + let token = self.token_provider.get_token().await?; let remote_url = format!( "https://x-access-token:{}@github.com/{}.git", token, self.config.repo ); + // Tracing will automatically sanitize the remote_url in logs + debug!("Updating remote URL"); self.run_git_command( &["remote", "set-url", "origin", &remote_url], Some(&self.config.git_clone_path), )?; + debug!("Fetching from origin"); self.run_git_command(&["fetch", "origin"], Some(&self.config.git_clone_path))?; let remote_branch = format!("origin/{}", self.config.branch); + debug!(remote_branch = %remote_branch, "Resetting to remote branch"); self.run_git_command( &["reset", "--hard", &remote_branch], Some(&self.config.git_clone_path), )?; + info!("Repository sync completed successfully"); Ok(()) } + #[instrument(skip(self), fields(repo = %self.config.repo, glob = %self.config.manifest_glob))] pub fn load_all_manifests(&self) -> Result, GitHubError> { if !self.config.git_clone_path.exists() { + warn!("Repository not initialized"); return Err(GitHubError::Git( "Repository not initialized. Call initialize() first.".to_string(), )); } + debug!("Loading manifests"); + let pattern = self .config .git_clone_path @@ -106,11 +142,14 @@ impl GitHubGitOps { for entry in glob::glob(&pattern)? 
{ let path = entry?; + debug!(file = ?path, "Loading manifest file"); + let content = std::fs::read_to_string(&path)?; let manifest: T = serde_yaml::from_str(&content).map_err(GitHubError::Yaml)?; manifests.push(manifest); } + info!(count = manifests.len(), "Loaded manifests"); Ok(manifests) } diff --git a/src/lib.rs b/src/lib.rs index 9799761..d8946bb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,10 +2,90 @@ pub mod app_auth; pub mod config; pub mod error; pub mod gitops; +pub mod tracing_sanitizer; pub mod webhook; pub use app_auth::GitHubTokenProvider; pub use config::GitHubAppConfig; pub use error::GitHubError; pub use gitops::GitHubGitOps; +pub use tracing_sanitizer::sanitize_sensitive_data; pub use webhook::{PushEvent, WebhookEvent, WebhookVerifier}; + +use tracing_subscriber::{fmt, layer::SubscriberExt, util::SubscriberInitExt, EnvFilter}; + +/// Initialize tracing with automatic sanitization of sensitive data +/// +/// This sets up structured logging with automatic redaction of: +/// - GitHub tokens (ghp_, gho_, ghu_, ghs_, ghr_) +/// - Credentials in URLs +/// - Bearer tokens +/// - x-access-token URLs +/// +/// # Environment Variables +/// +/// - `RUST_LOG`: Control log level (e.g., "debug", "info", "warn", "error") +/// - Default: "info" +/// - Example: `RUST_LOG=debug cargo run` +/// +/// # Examples +/// +/// ```no_run +/// use github_app::init_tracing; +/// +/// // Initialize once at application startup +/// init_tracing(); +/// +/// // Now all logs will have sensitive data automatically redacted +/// tracing::info!("Starting application"); +/// ``` +/// +/// # Panics +/// +/// Panics if called more than once (tracing can only be initialized once per process) +pub fn init_tracing() { + let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")); + + // Create a formatter that writes to a sanitizing writer + let fmt_layer = fmt::layer() + .with_target(true) + .with_thread_ids(false) + .with_thread_names(false) + 
.with_file(true) + .with_line_number(true) + .with_writer(tracing_sanitizer::SanitizingMakeWriter::new()); + + tracing_subscriber::registry() + .with(filter) + .with(fmt_layer) + .init(); +} + +/// Initialize tracing with JSON output for structured logging +/// +/// Useful for production environments where logs are shipped to aggregation systems +/// like DataDog, Splunk, or ELK. All output is still sanitized. +/// +/// # Examples +/// +/// ```no_run +/// use github_app::init_tracing_json; +/// +/// init_tracing_json(); +/// tracing::info!(user = "alice", "User logged in"); +/// ``` +pub fn init_tracing_json() { + let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")); + + let fmt_layer = fmt::layer() + .json() + .with_target(true) + .with_file(true) + .with_line_number(true) + .with_writer(tracing_sanitizer::SanitizingMakeWriter::new()); + + tracing_subscriber::registry() + .with(filter) + .with(fmt_layer) + .init(); +} diff --git a/src/tracing_sanitizer.rs b/src/tracing_sanitizer.rs new file mode 100644 index 0000000..f6ea61d --- /dev/null +++ b/src/tracing_sanitizer.rs @@ -0,0 +1,223 @@ +//! Tracing sanitizer for redacting sensitive data from logs +//! +//! This module provides automatic sanitization of GitHub tokens and credentials +//! from all log output, preventing accidental exposure in logs, error messages, +//! and monitoring systems. 
+
+use regex::Regex;
+use std::borrow::Cow;
+use std::sync::OnceLock;
+
+/// Redaction rules, compiled once: each regex is paired with its replacement.
+///
+/// Keeping the replacement template next to its pattern means a rule cannot be
+/// reordered or inserted without carrying the right template along with it.
+static TOKEN_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
+
+/// Return the redaction rules in the order they must be applied.
+///
+/// Whole-token rules run first; the URL rule then rewrites whatever is left
+/// between `user:` and `@`, including an earlier `[REDACTED_TOKEN]` marker.
+fn get_patterns() -> &'static [(Regex, &'static str)] {
+    const RULES: [(&str, &str); 5] = [
+        // GitHub tokens: ghp_ (classic PAT), gho_ (OAuth), ghu_ (app user),
+        // ghs_ (app installation), ghr_ (refresh); at least 36 characters.
+        (r"gh[pousr]_[A-Za-z0-9]{36,}", "[REDACTED_TOKEN]"),
+        // Fine-grained personal access tokens.
+        (r"github_pat_[A-Za-z0-9_]{22,}", "[REDACTED_TOKEN]"),
+        // x-access-token:TOKEN@ - keep the structure, drop the token.
+        (r"(x-access-token:)[A-Za-z0-9_-]+(@)", "$1[REDACTED]$2"),
+        // scheme://USER:PASS@ for any scheme (https, http, git, ...).
+        (r"(://[^:/@\s]+:)[^@\s]+(@)", "$1[REDACTED]$2"),
+        // Bearer tokens. The class covers the RFC 6750 b64token alphabet so
+        // that `+`, `/`, `~` and `=` padding cannot leave a readable tail.
+        (r"(Bearer\s+)[A-Za-z0-9._~+/=-]+", "$1[REDACTED]"),
+    ];
+
+    TOKEN_PATTERNS.get_or_init(|| {
+        RULES
+            .iter()
+            .map(|&(pattern, replacement)| {
+                let regex = Regex::new(pattern).expect("hard-coded regex must compile");
+                (regex, replacement)
+            })
+            .collect()
+    })
+}
+
+/// Sanitize sensitive data from a string.
+///
+/// Returns a copy of `input` in which the following are redacted:
+/// - GitHub tokens (ghp_, gho_, ghu_, ghs_, ghr_, github_pat_)
+/// - Credentials in URLs (`scheme://user:pass@...`)
+/// - Bearer tokens
+/// - x-access-token URLs
+///
+/// Input that contains none of these is returned unchanged.
+///
+/// # Examples
+///
+/// ```
+/// use github_app::sanitize_sensitive_data;
+///
+/// let input = "Failed to clone https://x-access-token:ghs_16C7e42F292c6912E7710c838347Ae178B4a@github.com/repo.git";
+/// let output = sanitize_sensitive_data(input);
+/// assert!(!output.contains("ghs_"));
+/// assert!(output.contains("[REDACTED]"));
+/// ```
+pub fn sanitize_sensitive_data(input: &str) -> String {
+    let mut result = input.to_owned();
+
+    for (pattern, replacement) in get_patterns() {
+        // `replace_all` only allocates when something matched; keep the current
+        // buffer otherwise instead of copying it once per rule.
+        let redacted = match pattern.replace_all(&result, *replacement) {
+            Cow::Owned(text) => Some(text),
+            Cow::Borrowed(_) => None,
+        };
+        if let Some(text) = redacted {
+            result = text;
+        }
+    }
+
+    result
+}
+
+/// `MakeWriter` that hands out stdout writers which sanitize their output.
+#[derive(Clone, Debug, Default)]
+pub struct SanitizingMakeWriter;
+
+impl SanitizingMakeWriter {
+    /// Create the writer factory; it carries no state.
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+// Implement tracing-subscriber's MakeWriter trait
+impl<'a> tracing_subscriber::fmt::MakeWriter<'a> for SanitizingMakeWriter {
+    type Writer = SanitizingWriter<std::io::Stdout>;
+
+    fn make_writer(&'a self) -> Self::Writer {
+        SanitizingWriter {
+            inner: std::io::stdout(),
+        }
+    }
+}
+
+/// Writer that sanitizes every buffer before forwarding it to `inner`.
+///
+/// Each `write` call is sanitized on its own, so a secret split across two
+/// calls would not be matched.
+// NOTE(review): assumes the fmt layer emits each event in one write - confirm.
+pub struct SanitizingWriter<W> {
+    inner: W,
+}
+
+impl<W: std::io::Write> std::io::Write for SanitizingWriter<W> {
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        // Invalid UTF-8 is replaced lossily so the regexes can run on a `str`.
+        let text = String::from_utf8_lossy(buf);
+        let sanitized = sanitize_sensitive_data(&text);
+        self.inner.write_all(sanitized.as_bytes())?;
+        // Report the caller's length: the whole input buffer was consumed even
+        // though the number of bytes actually written may differ.
+        Ok(buf.len())
+    }
+
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.inner.flush()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_sanitize_github_token_ghp() {
+        let input = "Token: ghp_1234567890ABCDEFGHIJKLMNOPQRSTUVabcd";
+        let output = sanitize_sensitive_data(input);
+        assert_eq!(output, "Token: [REDACTED_TOKEN]");
+    }
+
+    #[test]
+    fn test_sanitize_github_token_ghs() {
+        let input = "Error with ghs_ABCDEFGHIJKLMNOPQRSTUVabcdef12345678";
+        let output = sanitize_sensitive_data(input);
+        assert_eq!(output, "Error with [REDACTED_TOKEN]");
+    }
+
+    #[test]
+    fn test_sanitize_github_token_gho() {
+        let input = "OAuth: gho_ABCDEFGHIJKLMNOPQRSTUVabcdef12345678";
+        let output = sanitize_sensitive_data(input);
+        assert_eq!(output, "OAuth: [REDACTED_TOKEN]");
+    }
+
+    #[test]
+    fn test_sanitize_fine_grained_pat() {
+        let input = "Token: github_pat_11ABCDEFG0123456789abc_0123456789ABCDEFGHIJabcdefghij";
+        let output = sanitize_sensitive_data(input);
+        assert_eq!(output, "Token: [REDACTED_TOKEN]");
+    }
+
+    #[test]
+    fn test_sanitize_x_access_token_url() {
+        let input = "Clone failed: https://x-access-token:ghs_16C7e42F292c6912E7710c838347Ae178B4a@github.com/owner/repo.git";
+        let output = sanitize_sensitive_data(input);
+        assert!(output.contains("x-access-token:[REDACTED]@github.com"));
+        assert!(!output.contains("ghs_"));
+    }
+
+    #[test]
+    fn test_sanitize_url_with_credentials() {
+        let input = "Error: https://user:secretpassword@example.com/path";
+        let output = sanitize_sensitive_data(input);
+        assert!(output.contains("https://user:[REDACTED]@example.com"));
+        assert!(!output.contains("secretpassword"));
+    }
+
+    #[test]
+    fn test_sanitize_bearer_token() {
+        let input = "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9";
+        let output = sanitize_sensitive_data(input);
+        assert!(output.contains("Bearer [REDACTED]"));
+        assert!(!output.contains("eyJhbG"));
+    }
+
+    #[test]
+    fn test_sanitize_bearer_token_base64_alphabet() {
+        // `+`, `/`, `~` and `=` must be consumed as part of the token.
+        let input = "Authorization: Bearer abc+def/ghi~jkl==";
+        let output = sanitize_sensitive_data(input);
+        assert_eq!(output, "Authorization: Bearer [REDACTED]");
+    }
+
+    #[test]
+    fn test_sanitize_multiple_tokens() {
+        let input = "Failed with ghp_1234567890ABCDEFGHIJKLMNOPQRSTUVabcd at https://x-access-token:ghs_ABCDEFGHIJKLMNOPQRSTUVabcdef12345678@github.com/repo.git using Bearer abc123def456";
+        let output = sanitize_sensitive_data(input);
+
+        // Verify tokens are redacted
+        assert!(!output.contains("ghp_1234567890"));
+        assert!(!output.contains("ghs_ABCDEFGH"));
+        assert!(!output.contains("abc123def456"));
+
+        // Check for redaction markers (at least 2: gh tokens in URL, bearer token)
+        let redacted_count = output.matches("[REDACTED]").count();
+        assert!(
+            redacted_count >= 2,
+            "Should redact at least 2 tokens, found: {}. Output: {}",
+            redacted_count,
+            output
+        );
+    }
+
+    #[test]
+    fn test_no_sanitization_needed() {
+        let input = "Normal log message with no secrets";
+        let output = sanitize_sensitive_data(input);
+        assert_eq!(output, input);
+    }
+
+    #[test]
+    fn test_preserve_url_structure() {
+        let input =
+            "git clone --branch main https://x-access-token:TOKEN@github.com/owner/repo.git .";
+        let output = sanitize_sensitive_data(input);
+        assert!(output.contains("github.com/owner/repo.git"));
+        assert!(output.contains("--branch main"));
+        assert!(!output.contains("TOKEN"));
+    }
+
+    #[test]
+    fn test_empty_string() {
+        let input = "";
+        let output = sanitize_sensitive_data(input);
+        assert_eq!(output, "");
+    }
+
+    #[test]
+    fn test_git_protocol_url() {
+        let input = "git://user:pass@github.com/repo.git";
+        let output = sanitize_sensitive_data(input);
+        assert!(output.contains("git://user:[REDACTED]@github.com"));
+    }
+}