diff --git a/apps/web/src/app/api/models/up/route.ts b/apps/web/src/app/api/models/up/route.ts index 129791612..e0e56217b 100644 --- a/apps/web/src/app/api/models/up/route.ts +++ b/apps/web/src/app/api/models/up/route.ts @@ -12,7 +12,9 @@ type ModelHealthMetrics = { previousRequests: number; baselineRequests: number; percentChange: number; - absoluteDrop: number; // Absolute difference (negative for drops) + absoluteDrop: number; + uniqueUsersCurrent: number; + uniqueUsersBaseline: number; }; type HealthResponseMetadata = { @@ -33,9 +35,15 @@ type HealthResponseError = { const HIGH_BASELINE = 300; const LOW_BASELINE = 50; -// Models excluded from the health check but still preferred/recommended. -// Useful for preview models with inconsistent traffic that cause false alerts. -const HEALTH_CHECK_EXCLUSIONS = new Set(['google/gemini-3.1-pro-preview']); +// Only alert if the baseline window had at least this many distinct users. +// Prevents abuse actors (who operate many accounts from few IPs) from +// inflating baselines and triggering false drops when they pause. +const MIN_UNIQUE_USERS_FOR_ALERT = 20; + +// Statement timeout for the health check query. If the query takes longer, +// we fail open (report healthy) since a timeout is not evidence of a model +// being down. 
+const STATEMENT_TIMEOUT_MS = 10_000; export async function GET( request: Request @@ -47,38 +55,50 @@ export async function GET( return NextResponse.json({ healthy: false }, { status: 401 }); } - const monitoredModels = (await getMonitoredModels()).filter(m => !HEALTH_CHECK_EXCLUSIONS.has(m)); + const monitoredModels = await getMonitoredModels(); try { const queryStartTime = Date.now(); - const result = await db.execute<{ - requested_model: string; - current_requests: string; - previous_requests: string; - baseline_requests: string; - }>(sql` - WITH all_periods AS ( + const result = await db.transaction(async tx => { + await tx.execute(sql.raw(`SET LOCAL statement_timeout = '${STATEMENT_TIMEOUT_MS}'`)); + return tx.execute<{ + requested_model: string; + current_requests: string; + previous_requests: string; + baseline_requests: string; + unique_users_current: string; + unique_users_baseline: string; + }>(sql` + WITH all_periods AS ( + SELECT + requested_model, + COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '15 minutes') AS current_requests, + COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '30 minutes' + AND created_at < NOW() - INTERVAL '15 minutes') AS previous_requests, + COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '2 hours' + AND created_at < NOW() - INTERVAL '30 minutes') / 6.0 AS avg_baseline, + COUNT(DISTINCT kilo_user_id) FILTER (WHERE created_at >= NOW() - INTERVAL '15 minutes') + AS unique_users_current, + COUNT(DISTINCT kilo_user_id) FILTER (WHERE created_at >= NOW() - INTERVAL '2 hours' + AND created_at < NOW() - INTERVAL '30 minutes') + AS unique_users_baseline + FROM ${microdollar_usage} + WHERE + created_at >= NOW() - INTERVAL '2 hours' + AND has_error = false + AND requested_model IN (${sql.join(monitoredModels, sql`, `)}) + GROUP BY requested_model + ) SELECT requested_model, - COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '15 minutes') AS current_requests, - COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '30 
minutes' - AND created_at < NOW() - INTERVAL '15 minutes') AS previous_requests, - COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '2 hours' - AND created_at < NOW() - INTERVAL '30 minutes') / 6.0 AS avg_baseline - FROM ${microdollar_usage} - WHERE - created_at >= NOW() - INTERVAL '2 hours' - AND has_error = false - AND requested_model IN (${sql.join(monitoredModels, sql`, `)}) - GROUP BY requested_model - ) - SELECT - requested_model, - current_requests::text AS current_requests, - previous_requests::text AS previous_requests, - ROUND(avg_baseline)::text AS baseline_requests - FROM all_periods - `); + current_requests::text AS current_requests, + previous_requests::text AS previous_requests, + ROUND(avg_baseline)::text AS baseline_requests, + unique_users_current::text AS unique_users_current, + unique_users_baseline::text AS unique_users_baseline + FROM all_periods + `); + }); const models: Record<string, ModelHealthMetrics> = {}; let hasSignificantDrop = false; @@ -87,6 +107,8 @@ export async function GET( const currentRequests = parseInt(row.current_requests, 10); const previousRequests = parseInt(row.previous_requests, 10); const baselineRequests = parseInt(row.baseline_requests, 10); + const uniqueUsersCurrent = parseInt(row.unique_users_current, 10); + const uniqueUsersBaseline = parseInt(row.unique_users_baseline, 10); const percentChange = baselineRequests > 0 ? 
Math.round(((currentRequests - baselineRequests) / baselineRequests) * 100) @@ -99,20 +121,25 @@ export async function GET( baselineRequests, percentChange, absoluteDrop, + uniqueUsersCurrent, + uniqueUsersBaseline, }; // Alert logic: // - High traffic models (>HIGH_BASELINE): Alert on >90% drop // - Low traffic models (>LOW_BASELINE && <HIGH_BASELINE): Alert only when current and previous windows both have 0 requests - if ( - (baselineRequests > HIGH_BASELINE && percentChange < -90) || - (baselineRequests > LOW_BASELINE && - baselineRequests < HIGH_BASELINE && - currentRequests === 0 && - previousRequests === 0) - ) { - hasSignificantDrop = true; + // - Only alert if the baseline had enough distinct users to represent organic traffic + + if (uniqueUsersBaseline >= MIN_UNIQUE_USERS_FOR_ALERT) { + if ( + (baselineRequests > HIGH_BASELINE && percentChange < -90) || + (baselineRequests > LOW_BASELINE && + baselineRequests < HIGH_BASELINE && + currentRequests === 0 && + previousRequests === 0) + ) { + hasSignificantDrop = true; + } } }); @@ -125,6 +152,8 @@ export async function GET( baselineRequests: 0, percentChange: 0, absoluteDrop: 0, + uniqueUsersCurrent: 0, + uniqueUsersBaseline: 0, }; // Don't mark as unhealthy if no data - baseline is 0 anyway } @@ -150,6 +179,17 @@ export async function GET( extra: { monitoredModels }, }); - return NextResponse.json({ healthy: false }, { status: 503 }); + // Fail open: a query timeout or DB error is not evidence of a model being down. + return NextResponse.json( + { + healthy: true, + models: {} as Record<string, ModelHealthMetrics>, + metadata: { + timestamp: new Date().toISOString(), + queryExecutionTimeMs: -1, + }, + }, + { status: 200 } + ); } }