From a02065604d439dee7f89ec913360dc337e78950a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kuras?= Date: Thu, 28 May 2026 12:20:20 +0200 Subject: [PATCH 1/4] feat(auth): slide auth_sessions.expires_at on every successful auth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the only path to recover from an expired `auth_sessions` row was `tracevault login`. Tokens hard-expire at 30 days regardless of how active the user is — daily users hit the cliff just as easily as inactive ones, with no signal until a push gets blocked. Replace the SELECT-only validity check in both `AuthUser` and `OrgAuth` extractors with a single `UPDATE auth_sessions SET expires_at = NOW() + INTERVAL '30 days' WHERE token_hash = AND expires_at > NOW() RETURNING user_id`. Atomic: one round-trip, no race between the validity check and the extension. Net effect: * Active users (anything that talks to TV once per 30 days) never see a token expiry again — the window slides forward on every hit. * Inactive users still hit the 30-day expiry cliff and must re-login, which is the right security tradeoff for genuinely-dormant tokens. * No new endpoint, no client change, no breaking-change to existing callers. Three integration tests against a real Postgres pool exercise the SQL directly: extend-on-hit, no-revive-of-already-expired, no-op-on- unknown-token. --- crates/tracevault-server/src/extractors.rs | 23 +++- .../tests/sliding_session_test.rs | 116 ++++++++++++++++++ 2 files changed, 135 insertions(+), 4 deletions(-) create mode 100644 crates/tracevault-server/tests/sliding_session_test.rs diff --git a/crates/tracevault-server/src/extractors.rs b/crates/tracevault-server/src/extractors.rs index 4b924943..2776c19d 100644 --- a/crates/tracevault-server/src/extractors.rs +++ b/crates/tracevault-server/src/extractors.rs @@ -32,9 +32,19 @@ impl FromRequestParts for AuthUser { let token_hash = sha256_hex(header); - // Try auth_sessions first + // Sliding session window: on every successful auth, bump expires_at + // to NOW() + 30 days. As long as the user touches TV at least once + // per 30 days, the token never expires from their perspective. + // Inactive users still hit the 30-day cliff and must re-login. + // + // Using UPDATE..RETURNING means we atomically (a) verify the token + // is still valid (the WHERE filters expired rows out), (b) extend + // it, (c) read back the user_id — one round-trip, no race. let session_row = sqlx::query_as::<_, (Uuid,)>( - "SELECT user_id FROM auth_sessions WHERE token_hash = $1 AND expires_at > NOW()", + "UPDATE auth_sessions + SET expires_at = NOW() + INTERVAL '30 days' + WHERE token_hash = $1 AND expires_at > NOW() + RETURNING user_id", ) .bind(&token_hash) .fetch_optional(&state.pool) @@ -120,9 +130,14 @@ impl FromRequestParts for OrgAuth { let org_id = org_row.0; - // Try session auth + // Try session auth — sliding window: same pattern as AuthUser, see + // the comment there. UPDATE..RETURNING atomically bumps expires_at + // and returns the user_id only when the token is still valid. let session_row = sqlx::query_as::<_, (Uuid,)>( - "SELECT user_id FROM auth_sessions WHERE token_hash = $1 AND expires_at > NOW()", + "UPDATE auth_sessions + SET expires_at = NOW() + INTERVAL '30 days' + WHERE token_hash = $1 AND expires_at > NOW() + RETURNING user_id", ) .bind(&token_hash) .fetch_optional(&state.pool) diff --git a/crates/tracevault-server/tests/sliding_session_test.rs b/crates/tracevault-server/tests/sliding_session_test.rs new file mode 100644 index 00000000..cf2fc612 --- /dev/null +++ b/crates/tracevault-server/tests/sliding_session_test.rs @@ -0,0 +1,116 @@ +//! Verifies the sliding-session-window behavior added to the auth +//! extractors. On every successful auth, the matching `auth_sessions` row +//! has its `expires_at` bumped to NOW() + 30 days. +//! +//! The extractor is wired into HTTP handlers and not trivially callable in +//! isolation, but the SQL pattern it uses is `UPDATE auth_sessions SET +//! expires_at = NOW() + INTERVAL '30 days' WHERE token_hash = $1 AND +//! expires_at > NOW() RETURNING user_id`. These tests exercise that +//! statement directly against a real Postgres pool, which is the precise +//! source of the behavior we want to lock in. + +mod common; + +use chrono::{Duration, Utc}; +use tracevault_server::auth::sha256_hex; +use uuid::Uuid; + +async fn seed_session_with_expiry( + pool: &sqlx::PgPool, + user_id: Uuid, + token_hash: &str, + expires_at: chrono::DateTime, +) -> Uuid { + sqlx::query_scalar::<_, Uuid>( + "INSERT INTO auth_sessions (user_id, token_hash, expires_at) \ + VALUES ($1, $2, $3) RETURNING id", + ) + .bind(user_id) + .bind(token_hash) + .bind(expires_at) + .fetch_one(pool) + .await + .unwrap() +} + +/// Run the sliding-window UPDATE used by the extractor and return the +/// updated `expires_at` for the session row (or None if the row was +/// expired / not found). +async fn run_sliding_update( + pool: &sqlx::PgPool, + token_hash: &str, +) -> Option> { + sqlx::query_as::<_, (chrono::DateTime,)>( + "UPDATE auth_sessions + SET expires_at = NOW() + INTERVAL '30 days' + WHERE token_hash = $1 AND expires_at > NOW() + RETURNING expires_at", + ) + .bind(token_hash) + .fetch_optional(pool) + .await + .unwrap() + .map(|(ts,)| ts) +} + +#[sqlx::test(migrations = "./migrations")] +async fn auth_extends_expires_at_on_each_hit(pool: sqlx::PgPool) { + let user_id = common::seed_user(&pool).await; + let token = format!("tok_{}", Uuid::new_v4()); + let token_hash = sha256_hex(&token); + + // Seed an "almost expired" session: 2 minutes left. + let initial_expires = Utc::now() + Duration::minutes(2); + seed_session_with_expiry(&pool, user_id, &token_hash, initial_expires).await; + + let bumped = run_sliding_update(&pool, &token_hash) + .await + .expect("valid (not-yet-expired) session should be extended"); + + // After the bump, expires_at should be ~30 days in the future — at + // minimum, more than 29 days from now. We don't pin it exactly because + // NOW() in Postgres is evaluated server-side and clock skew between + // test machine and server is real (postgres in Docker, test in host). + let min_expected = Utc::now() + Duration::days(29); + assert!( + bumped > min_expected, + "sliding window should extend expires_at to ~30 days; got {bumped}" + ); +} + +#[sqlx::test(migrations = "./migrations")] +async fn auth_does_not_extend_already_expired_session(pool: sqlx::PgPool) { + let user_id = common::seed_user(&pool).await; + let token = format!("tok_{}", Uuid::new_v4()); + let token_hash = sha256_hex(&token); + + // Seed a session that expired an hour ago. + let expired_at = Utc::now() - Duration::hours(1); + seed_session_with_expiry(&pool, user_id, &token_hash, expired_at).await; + + let result = run_sliding_update(&pool, &token_hash).await; + assert!( + result.is_none(), + "an already-expired session must NOT be revived by a sliding-window update; got {result:?}" + ); + + // Belt-and-braces: confirm the row's expires_at is still in the past. + let row_expires_at = sqlx::query_scalar::<_, chrono::DateTime>( + "SELECT expires_at FROM auth_sessions WHERE token_hash = $1", + ) + .bind(&token_hash) + .fetch_one(&pool) + .await + .unwrap(); + assert!( + row_expires_at < Utc::now(), + "row's expires_at should remain in the past after a no-op sliding update" + ); +} + +#[sqlx::test(migrations = "./migrations")] +async fn auth_sliding_update_is_a_noop_for_unknown_tokens(pool: sqlx::PgPool) { + let unknown_hash = sha256_hex("definitely-not-a-real-token"); + let result = run_sliding_update(&pool, &unknown_hash).await; + assert!(result.is_none()); +} From effdbe48ad6a23d6fe059061976af4c2903c3970 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kuras?= Date: Thu, 28 May 2026 12:20:31 +0200 Subject: [PATCH 2/4] fix(cli): actionable error messages on tracevault check failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pre-push hook runs `tracevault check` and exits 1 on any failure. Previously that 1 came with an opaque message — typically "Stream failed (500 Internal Server Error)" or "Server returned 401: Invalid or expired token". Both an agent and a human reader were left guessing what to do. Adds `connectivity_message()` which sniffs the raw error string and appends a specific next step: * 401 / Unauthorized: "Session token may be expired. Run `tracevault login --server-url=` to refresh." * 403 / Forbidden: org-membership hint (not a re-login). * DNS / connect / timeout / connection refused: network + server_url hint. * 5xx Server Error / Internal: "team has been paged, retry shortly." * Unknown shapes: pass through the raw error, no fake advice. All connectivity-class errors still propagate as Err so the pre-push hook exits non-zero — a TV-tracked repo MUST be evaluated by TV on every push, that contract doesn't change. The fix here is purely the quality of the message the user/agent reads after the block. Six unit tests pin the message shapes for the common error patterns the api_client surfaces today. --- crates/tracevault-cli/src/commands/check.rs | 122 +++++++++++++++++--- 1 file changed, 107 insertions(+), 15 deletions(-) diff --git a/crates/tracevault-cli/src/commands/check.rs b/crates/tracevault-cli/src/commands/check.rs index 68aec742..b9b858a2 100644 --- a/crates/tracevault-cli/src/commands/check.rs +++ b/crates/tracevault-cli/src/commands/check.rs @@ -135,31 +135,38 @@ fn collect_session_data(session_dir: &Path) -> Option { pub async fn check_policies(project_root: &Path) -> Result<(), Box> { let (server_url, token) = resolve_credentials(project_root); - let server_url = match server_url { - Some(url) => url, - None => { - return Err("No server URL configured. Run 'tracevault login' first.".into()); - } - }; + let server_url = server_url + .ok_or("No server URL configured. Run `tracevault login --server-url=` to set one.")?; if token.is_none() { - return Err("Not logged in. Run 'tracevault login' to check policies.".into()); + return Err( + "Not logged in. Run `tracevault login --server-url=` to authenticate." + .into(), + ); } let org_slug = TracevaultConfig::load(project_root) .and_then(|c| c.org_slug) - .ok_or("No org_slug in config. Run 'tracevault init' first.")?; + .ok_or("No org_slug in config. Run `tracevault init` first.")?; let client = ApiClient::new(&server_url, token.as_deref()); - // Resolve repo_id by name + // Resolve repo_id by name. + // + // Connectivity errors here (auth expired, server down, network + // unreachable) propagate so the pre-push hook exits non-zero — if a + // repo is opted into TraceVault, every push must be evaluated, full + // stop. Letting pushes slip when TV is unreachable would defeat the + // point of enforcement. We attach an actionable next step to each + // error so the user (or agent) knows the recovery command without + // guessing — see `connectivity_message` below. let repo_name = git_repo_name(project_root); - let repos = client.list_repos(&org_slug).await?; + let repos = client + .list_repos(&org_slug) + .await + .map_err(|e| connectivity_message(&e.to_string()))?; let repo = repos.iter().find(|r| r.name == repo_name).ok_or_else(|| { - format!( - "Repo '{}' not found on server. Run 'tracevault sync' first.", - repo_name - ) + format!("Repo '{repo_name}' not found on server. Run `tracevault sync` first.") })?; // Collect session data from unpushed sessions @@ -200,7 +207,8 @@ pub async fn check_policies(project_root: &Path) -> Result<(), Box Result<(), Box String { + let lower = raw.to_ascii_lowercase(); + let action = if lower.contains("401") || lower.contains("unauthorized") { + Some("Session token may be expired. Run `tracevault login --server-url=` to refresh.") + } else if lower.contains("403") || lower.contains("forbidden") { + Some("Your token lacks access to this repo's policies. Check your org membership and rerun `tracevault login`.") + } else if lower.contains("dns") + || lower.contains("connect") + || lower.contains("timed out") + || lower.contains("timeout") + || lower.contains("connection refused") + { + Some("Could not reach the TraceVault server. Check network and `server_url` in .tracevault/config.toml.") + } else if lower.contains("5") && (lower.contains("server error") || lower.contains("internal")) + { + Some("TraceVault server returned an error. The team has likely been paged; retry shortly.") + } else { + None + }; + + match action { + Some(a) => format!("Policy check could not run: {raw}.\n → {a}"), + None => format!("Policy check could not run: {raw}."), + } +} + +#[cfg(test)] +mod tests { + use super::connectivity_message; + + #[test] + fn connectivity_message_suggests_login_on_401() { + let m = connectivity_message("Stream failed (401 Unauthorized): bad token"); + assert!( + m.contains("tracevault login"), + "401 errors must surface the login hint; got: {m}" + ); + } + + #[test] + fn connectivity_message_suggests_login_on_unauthorized_text() { + let m = connectivity_message("Server returned: unauthorized request"); + assert!(m.contains("tracevault login")); + } + + #[test] + fn connectivity_message_suggests_network_check_on_dns_error() { + let m = connectivity_message("error sending request for url: dns error"); + assert!( + m.to_lowercase().contains("network") || m.to_lowercase().contains("server_url"), + "DNS errors must surface a network hint; got: {m}" + ); + } + + #[test] + fn connectivity_message_suggests_network_check_on_connection_refused() { + let m = connectivity_message("connection refused"); + assert!(m.to_lowercase().contains("network")); + } + + #[test] + fn connectivity_message_falls_back_when_unrecognized() { + let m = connectivity_message("some weird new error shape we have not seen before"); + // No suggestion — but the raw error must still be in the message so + // the user can debug. + assert!(m.contains("some weird new error shape")); + assert!(!m.contains("→")); // no action arrow + } + + #[test] + fn connectivity_message_does_not_collide_on_403() { + let m = connectivity_message("403 Forbidden"); + // 403 should suggest org membership, NOT a re-login. + assert!( + m.to_lowercase().contains("membership"), + "403 should mention membership; got: {m}" + ); + } +} From 9955798f40c41159ab5bedc9bf43c9749bff7253 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kuras?= Date: Mon, 1 Jun 2026 07:51:00 +0200 Subject: [PATCH 3/4] refactor(auth): slide session expiry only when stale, dedupe the query MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sliding-session check ran an unconditional `UPDATE ... RETURNING` on every authenticated request. TraceVault streams an event on every agent hook, so the same token authenticates many times in quick succession (often concurrently) — an always-write check serializes those requests on a single row lock and amplifies WAL/IO. This is the cost the Aikido review flagged on extractors.rs. Replace it with a CTE that always reads user_id for the auth decision but only slides expires_at forward once it has drifted more than a day from the 30-day target. The common already-fresh-token path now performs no write and takes no row lock; user-facing behavior is unchanged (active tokens still never expire, expired ones are never revived). Extract the statement to `auth::SLIDING_SESSION_AUTH_SQL` so both extractors share one definition and the integration test exercises the real query instead of a hand-copied duplicate. Add a test pinning the no-rewrite-when-fresh path. Co-Authored-By: Claude Opus 4.8 --- crates/tracevault-server/src/auth.rs | 36 ++++++ crates/tracevault-server/src/extractors.rs | 45 ++----- .../tests/sliding_session_test.rs | 122 ++++++++++-------- 3 files changed, 121 insertions(+), 82 deletions(-) diff --git a/crates/tracevault-server/src/auth.rs b/crates/tracevault-server/src/auth.rs index e261ee62..69ab75a1 100644 --- a/crates/tracevault-server/src/auth.rs +++ b/crates/tracevault-server/src/auth.rs @@ -68,6 +68,42 @@ pub fn sha256_hex(input: &str) -> String { hex::encode(hasher.finalize()) } +/// Single-statement sliding-session auth check, shared by both request +/// extractors (`AuthUser`, `OrgAuth`) so the behavior is defined once. +/// +/// Binds `$1` = `token_hash` and returns the matching `user_id`, or no row +/// when the token is missing or expired (the `expires_at > NOW()` filter +/// rejects expired sessions — they are never revived). +/// +/// Sliding window: as long as a user authenticates at least once per ~30 +/// days the token never expires; genuinely-dormant tokens still hit the +/// cliff and must re-login. +/// +/// Why a CTE instead of a plain `UPDATE ... RETURNING`: an unconditional +/// UPDATE would take a row lock and write a new tuple on *every* +/// authenticated request. TraceVault streams an event on every agent hook, +/// so the same token authenticates many times in quick succession (often +/// concurrently) — an always-write check serializes those requests on one +/// row lock and amplifies WAL/IO. Here the `session` CTE always reads the +/// `user_id` for the auth decision, while the `slide` CTE only writes when +/// the expiry has drifted more than a day from the 30-day target. The +/// common "already-fresh token" path therefore performs no write and takes +/// no row lock. A data-modifying CTE always runs to completion even though +/// nothing selects from it, so the slide still happens when due. +pub const SLIDING_SESSION_AUTH_SQL: &str = "\ +WITH session AS ( + SELECT user_id FROM auth_sessions + WHERE token_hash = $1 AND expires_at > NOW() +), +slide AS ( + UPDATE auth_sessions + SET expires_at = NOW() + INTERVAL '30 days' + WHERE token_hash = $1 + AND expires_at > NOW() + AND expires_at < NOW() + INTERVAL '29 days' +) +SELECT user_id FROM session"; + #[cfg(test)] mod tests { use super::*; diff --git a/crates/tracevault-server/src/extractors.rs b/crates/tracevault-server/src/extractors.rs index 2776c19d..c960bdc1 100644 --- a/crates/tracevault-server/src/extractors.rs +++ b/crates/tracevault-server/src/extractors.rs @@ -32,24 +32,13 @@ impl FromRequestParts for AuthUser { let token_hash = sha256_hex(header); - // Sliding session window: on every successful auth, bump expires_at - // to NOW() + 30 days. As long as the user touches TV at least once - // per 30 days, the token never expires from their perspective. - // Inactive users still hit the 30-day cliff and must re-login. - // - // Using UPDATE..RETURNING means we atomically (a) verify the token - // is still valid (the WHERE filters expired rows out), (b) extend - // it, (c) read back the user_id — one round-trip, no race. - let session_row = sqlx::query_as::<_, (Uuid,)>( - "UPDATE auth_sessions - SET expires_at = NOW() + INTERVAL '30 days' - WHERE token_hash = $1 AND expires_at > NOW() - RETURNING user_id", - ) - .bind(&token_hash) - .fetch_optional(&state.pool) - .await - .map_err(|_| (StatusCode::INTERNAL_SERVER_ERROR, "Database error"))?; + // Sliding session window — validate the token and slide its expiry + // forward when due. See `SLIDING_SESSION_AUTH_SQL` for the why. + let session_row = sqlx::query_as::<_, (Uuid,)>(crate::auth::SLIDING_SESSION_AUTH_SQL) + .bind(&token_hash) + .fetch_optional(&state.pool) + .await + .map_err(|_| (StatusCode::INTERNAL_SERVER_ERROR, "Database error"))?; if let Some((user_id,)) = session_row { return Ok(AuthUser { user_id }); @@ -130,19 +119,13 @@ impl FromRequestParts for OrgAuth { let org_id = org_row.0; - // Try session auth — sliding window: same pattern as AuthUser, see - // the comment there. UPDATE..RETURNING atomically bumps expires_at - // and returns the user_id only when the token is still valid. - let session_row = sqlx::query_as::<_, (Uuid,)>( - "UPDATE auth_sessions - SET expires_at = NOW() + INTERVAL '30 days' - WHERE token_hash = $1 AND expires_at > NOW() - RETURNING user_id", - ) - .bind(&token_hash) - .fetch_optional(&state.pool) - .await - .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()))?; + // Try session auth — sliding window, same statement as `AuthUser`. + // See `SLIDING_SESSION_AUTH_SQL` for the why. + let session_row = sqlx::query_as::<_, (Uuid,)>(crate::auth::SLIDING_SESSION_AUTH_SQL) + .bind(&token_hash) + .fetch_optional(&state.pool) + .await + .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()))?; if let Some((user_id,)) = session_row { // Check membership diff --git a/crates/tracevault-server/tests/sliding_session_test.rs b/crates/tracevault-server/tests/sliding_session_test.rs index cf2fc612..90360ecd 100644 --- a/crates/tracevault-server/tests/sliding_session_test.rs +++ b/crates/tracevault-server/tests/sliding_session_test.rs @@ -1,18 +1,18 @@ -//! Verifies the sliding-session-window behavior added to the auth -//! extractors. On every successful auth, the matching `auth_sessions` row -//! has its `expires_at` bumped to NOW() + 30 days. +//! Verifies the sliding-session-window behavior used by the auth +//! extractors. On a successful auth the matching `auth_sessions` row has +//! its `expires_at` slid forward to NOW() + 30 days — but only when the +//! expiry has drifted more than a day, so a fresh token is a pure read. //! -//! The extractor is wired into HTTP handlers and not trivially callable in -//! isolation, but the SQL pattern it uses is `UPDATE auth_sessions SET -//! expires_at = NOW() + INTERVAL '30 days' WHERE token_hash = $1 AND -//! expires_at > NOW() RETURNING user_id`. These tests exercise that -//! statement directly against a real Postgres pool, which is the precise -//! source of the behavior we want to lock in. +//! The extractors are wired into HTTP handlers and not trivially callable +//! in isolation, so these tests run the *exact* statement the extractors +//! use — `tracevault_server::auth::SLIDING_SESSION_AUTH_SQL` — against a +//! real Postgres pool. Importing the shared const (rather than copying the +//! SQL here) means the test breaks if the production query ever diverges. mod common; use chrono::{Duration, Utc}; -use tracevault_server::auth::sha256_hex; +use tracevault_server::auth::{sha256_hex, SLIDING_SESSION_AUTH_SQL}; use uuid::Uuid; async fn seed_session_with_expiry( @@ -33,53 +33,82 @@ async fn seed_session_with_expiry( .unwrap() } -/// Run the sliding-window UPDATE used by the extractor and return the -/// updated `expires_at` for the session row (or None if the row was -/// expired / not found). -async fn run_sliding_update( - pool: &sqlx::PgPool, - token_hash: &str, -) -> Option> { - sqlx::query_as::<_, (chrono::DateTime,)>( - "UPDATE auth_sessions - SET expires_at = NOW() + INTERVAL '30 days' - WHERE token_hash = $1 AND expires_at > NOW() - RETURNING expires_at", +/// Run the production auth statement and return the matched `user_id` +/// (or None if the token was expired / not found). +async fn run_auth(pool: &sqlx::PgPool, token_hash: &str) -> Option { + sqlx::query_as::<_, (Uuid,)>(SLIDING_SESSION_AUTH_SQL) + .bind(token_hash) + .fetch_optional(pool) + .await + .unwrap() + .map(|(id,)| id) +} + +/// Read the current `expires_at` for a token's session row. +async fn current_expiry(pool: &sqlx::PgPool, token_hash: &str) -> chrono::DateTime { + sqlx::query_scalar::<_, chrono::DateTime>( + "SELECT expires_at FROM auth_sessions WHERE token_hash = $1", ) .bind(token_hash) - .fetch_optional(pool) + .fetch_one(pool) .await .unwrap() - .map(|(ts,)| ts) } #[sqlx::test(migrations = "./migrations")] -async fn auth_extends_expires_at_on_each_hit(pool: sqlx::PgPool) { +async fn auth_extends_expires_at_for_a_stale_session(pool: sqlx::PgPool) { let user_id = common::seed_user(&pool).await; let token = format!("tok_{}", Uuid::new_v4()); let token_hash = sha256_hex(&token); - // Seed an "almost expired" session: 2 minutes left. + // Seed an "almost expired" session: 2 minutes left. This is well past + // the 1-day slide threshold, so auth should both succeed and bump it. let initial_expires = Utc::now() + Duration::minutes(2); seed_session_with_expiry(&pool, user_id, &token_hash, initial_expires).await; - let bumped = run_sliding_update(&pool, &token_hash) + let returned = run_auth(&pool, &token_hash) .await - .expect("valid (not-yet-expired) session should be extended"); + .expect("valid (not-yet-expired) session should authenticate"); + assert_eq!(returned, user_id, "auth must return the owning user_id"); - // After the bump, expires_at should be ~30 days in the future — at - // minimum, more than 29 days from now. We don't pin it exactly because - // NOW() in Postgres is evaluated server-side and clock skew between - // test machine and server is real (postgres in Docker, test in host). + // After the slide, expires_at should be ~30 days out — at minimum more + // than 29 days from now. We don't pin it exactly because NOW() is + // evaluated server-side and clock skew between the test host and the + // Postgres container is real. let min_expected = Utc::now() + Duration::days(29); assert!( - bumped > min_expected, - "sliding window should extend expires_at to ~30 days; got {bumped}" + current_expiry(&pool, &token_hash).await > min_expected, + "sliding window should extend a stale session's expiry to ~30 days" ); } #[sqlx::test(migrations = "./migrations")] -async fn auth_does_not_extend_already_expired_session(pool: sqlx::PgPool) { +async fn auth_does_not_rewrite_a_fresh_session(pool: sqlx::PgPool) { + let user_id = common::seed_user(&pool).await; + let token = format!("tok_{}", Uuid::new_v4()); + let token_hash = sha256_hex(&token); + + // Seed a freshly-bumped session: expires in 30 days. It is within the + // 1-day slide threshold, so auth must still succeed but leave the row + // untouched (the common hot-path that avoids a write + row lock). + let fresh_expires = Utc::now() + Duration::days(30); + seed_session_with_expiry(&pool, user_id, &token_hash, fresh_expires).await; + let before = current_expiry(&pool, &token_hash).await; + + let returned = run_auth(&pool, &token_hash) + .await + .expect("a fresh session should still authenticate"); + assert_eq!(returned, user_id); + + let after = current_expiry(&pool, &token_hash).await; + assert_eq!( + before, after, + "a fresh session must not be rewritten by auth (no needless write)" + ); +} + +#[sqlx::test(migrations = "./migrations")] +async fn auth_does_not_revive_an_already_expired_session(pool: sqlx::PgPool) { let user_id = common::seed_user(&pool).await; let token = format!("tok_{}", Uuid::new_v4()); let token_hash = sha256_hex(&token); @@ -88,29 +117,20 @@ async fn auth_does_not_extend_already_expired_session(pool: sqlx::PgPool) { let expired_at = Utc::now() - Duration::hours(1); seed_session_with_expiry(&pool, user_id, &token_hash, expired_at).await; - let result = run_sliding_update(&pool, &token_hash).await; assert!( - result.is_none(), - "an already-expired session must NOT be revived by a sliding-window update; got {result:?}" + run_auth(&pool, &token_hash).await.is_none(), + "an already-expired session must NOT authenticate or be revived" ); - // Belt-and-braces: confirm the row's expires_at is still in the past. - let row_expires_at = sqlx::query_scalar::<_, chrono::DateTime>( - "SELECT expires_at FROM auth_sessions WHERE token_hash = $1", - ) - .bind(&token_hash) - .fetch_one(&pool) - .await - .unwrap(); + // Belt-and-braces: the row's expires_at must remain in the past. assert!( - row_expires_at < Utc::now(), - "row's expires_at should remain in the past after a no-op sliding update" + current_expiry(&pool, &token_hash).await < Utc::now(), + "row's expires_at should remain in the past after a no-op auth" ); } #[sqlx::test(migrations = "./migrations")] -async fn auth_sliding_update_is_a_noop_for_unknown_tokens(pool: sqlx::PgPool) { +async fn auth_is_a_noop_for_unknown_tokens(pool: sqlx::PgPool) { let unknown_hash = sha256_hex("definitely-not-a-real-token"); - let result = run_sliding_update(&pool, &unknown_hash).await; - assert!(result.is_none()); + assert!(run_auth(&pool, &unknown_hash).await.is_none()); } From be9dea1f9c39759016587731ab885f2783a533f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kuras?= Date: Mon, 1 Jun 2026 07:51:00 +0200 Subject: [PATCH 4/4] refactor(cli): flatten connectivity_message, recognize bare 5xx codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use early returns instead of an intermediate Option + match (the second Aikido note). Replace the near-meaningless `contains("5")` heuristic with explicit 5xx status-code matching so `503 Service Unavailable` and `504 Gateway Timeout` — which carry neither "internal" nor "server error" text — are classified as server faults instead of falling through with no advice. Check 5xx before the transport keywords so a gateway timeout is not misreported as a local network problem. Co-Authored-By: Claude Opus 4.8 --- crates/tracevault-cli/src/commands/check.rs | 104 ++++++++++++++++---- 1 file changed, 85 insertions(+), 19 deletions(-) diff --git a/crates/tracevault-cli/src/commands/check.rs b/crates/tracevault-cli/src/commands/check.rs index b9b858a2..5f3ea1a2 100644 --- a/crates/tracevault-cli/src/commands/check.rs +++ b/crates/tracevault-cli/src/commands/check.rs @@ -238,32 +238,66 @@ pub async fn check_policies(project_root: &Path) -> Result<(), Box String { let lower = raw.to_ascii_lowercase(); - let action = if lower.contains("401") || lower.contains("unauthorized") { - Some("Session token may be expired. Run `tracevault login --server-url=` to refresh.") - } else if lower.contains("403") || lower.contains("forbidden") { - Some("Your token lacks access to this repo's policies. Check your org membership and rerun `tracevault login`.") - } else if lower.contains("dns") + + // 401 — token rejected. An expired session is the most common cause of + // a surprise blocked push, so point straight at the refresh command. + if lower.contains("401") || lower.contains("unauthorized") { + return with_action( + raw, + "Session token may be expired. Run `tracevault login --server-url=` to refresh.", + ); + } + // 403 — authenticated but not allowed. Re-login won't help; this is an + // org-membership problem. + if lower.contains("403") || lower.contains("forbidden") { + return with_action( + raw, + "Your token lacks access to this repo's policies. Check your org membership and rerun `tracevault login`.", + ); + } + // 5xx — server-side fault. Checked before the transport keywords below + // so a `504 Gateway Timeout` reads as a server issue, not a local one. + if is_server_error(&lower) { + return with_action( + raw, + "TraceVault server returned an error. The team has likely been paged; retry shortly.", + ); + } + // Transport-level failure with no HTTP status — DNS, refused, timeout. + // ("connect" also covers "connection refused"/"connection reset".) + if lower.contains("dns") || lower.contains("connect") || lower.contains("timed out") || lower.contains("timeout") - || lower.contains("connection refused") - { - Some("Could not reach the TraceVault server. Check network and `server_url` in .tracevault/config.toml.") - } else if lower.contains("5") && (lower.contains("server error") || lower.contains("internal")) { - Some("TraceVault server returned an error. The team has likely been paged; retry shortly.") - } else { - None - }; - - match action { - Some(a) => format!("Policy check could not run: {raw}.\n → {a}"), - None => format!("Policy check could not run: {raw}."), + return with_action( + raw, + "Could not reach the TraceVault server. Check network and `server_url` in .tracevault/config.toml.", + ); } + // Unrecognized — surface the raw error verbatim, no invented advice. + format!("Policy check could not run: {raw}.") +} + +/// Format the standard "could not run" line with an actionable next step. +fn with_action(raw: &str, action: &str) -> String { + format!("Policy check could not run: {raw}.\n → {action}") +} + +/// Heuristic for a 5xx server-side failure. Matches the textual status +/// (`internal`, `server error`) and the concrete 5xx codes so a bare +/// `503 Service Unavailable` — which carries neither phrase — is still +/// recognized. +fn is_server_error(lower: &str) -> bool { + lower.contains("internal") + || lower.contains("server error") + || ["500", "501", "502", "503", "504"] + .iter() + .any(|code| lower.contains(code)) } #[cfg(test)] @@ -318,4 +352,36 @@ mod tests { "403 should mention membership; got: {m}" ); } + + #[test] + fn connectivity_message_flags_500_as_server_error() { + let m = connectivity_message("Server returned 500 Internal Server Error: oops"); + assert!( + m.to_lowercase().contains("server returned an error"), + "500 should surface the server-error hint; got: {m}" + ); + } + + #[test] + fn connectivity_message_flags_503_without_internal_text() { + // `503 Service Unavailable` carries neither "internal" nor + // "server error" — the bare code must still be recognized. + let m = connectivity_message("Server returned 503 Service Unavailable: "); + assert!( + m.to_lowercase().contains("server returned an error"), + "503 should surface the server-error hint; got: {m}" + ); + } + + #[test] + fn connectivity_message_treats_504_as_server_not_network() { + // A gateway timeout is a server-side fault even though it contains + // the word "timeout"; it must not be misreported as a local network + // problem. + let m = connectivity_message("Server returned 504 Gateway Timeout: "); + assert!( + m.to_lowercase().contains("server returned an error"), + "504 should be treated as a server error; got: {m}" + ); + } }