-
Notifications
You must be signed in to change notification settings - Fork 31
fix(models/up): harden health check against abuse-inflated baselines #2392
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+81
−41
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,7 +12,9 @@ type ModelHealthMetrics = { | |
| previousRequests: number; | ||
| baselineRequests: number; | ||
| percentChange: number; | ||
| absoluteDrop: number; // Absolute difference (negative for drops) | ||
| absoluteDrop: number; | ||
| uniqueUsersCurrent: number; | ||
| uniqueUsersBaseline: number; | ||
| }; | ||
|
|
||
| type HealthResponseMetadata = { | ||
|
|
@@ -33,9 +35,15 @@ type HealthResponseError = { | |
| const HIGH_BASELINE = 300; | ||
| const LOW_BASELINE = 50; | ||
|
|
||
| // Models excluded from the health check but still preferred/recommended. | ||
| // Useful for preview models with inconsistent traffic that cause false alerts. | ||
| const HEALTH_CHECK_EXCLUSIONS = new Set(['google/gemini-3.1-pro-preview']); | ||
| // Only alert if the baseline window had at least this many distinct users. | ||
| // Prevents abuse actors (who operate many accounts from few IPs) from | ||
| // inflating baselines and triggering false drops when they pause. | ||
| const MIN_UNIQUE_USERS_FOR_ALERT = 20; | ||
|
|
||
| // Statement timeout for the health check query. If the query takes longer, | ||
| // we fail open (report healthy) since a timeout is not evidence of a model | ||
| // being down. | ||
| const STATEMENT_TIMEOUT_MS = 10_000; | ||
|
|
||
| export async function GET( | ||
| request: Request | ||
|
|
@@ -47,38 +55,50 @@ export async function GET( | |
| return NextResponse.json({ healthy: false }, { status: 401 }); | ||
| } | ||
|
|
||
| const monitoredModels = (await getMonitoredModels()).filter(m => !HEALTH_CHECK_EXCLUSIONS.has(m)); | ||
| const monitoredModels = await getMonitoredModels(); | ||
|
|
||
| try { | ||
| const queryStartTime = Date.now(); | ||
| const result = await db.execute<{ | ||
| requested_model: string; | ||
| current_requests: string; | ||
| previous_requests: string; | ||
| baseline_requests: string; | ||
| }>(sql` | ||
| WITH all_periods AS ( | ||
| const result = await db.transaction(async tx => { | ||
| await tx.execute(sql.raw(`SET LOCAL statement_timeout = '${STATEMENT_TIMEOUT_MS}'`)); | ||
| return tx.execute<{ | ||
| requested_model: string; | ||
| current_requests: string; | ||
| previous_requests: string; | ||
| baseline_requests: string; | ||
| unique_users_current: string; | ||
| unique_users_baseline: string; | ||
| }>(sql` | ||
| WITH all_periods AS ( | ||
| SELECT | ||
| requested_model, | ||
| COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '15 minutes') AS current_requests, | ||
| COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '30 minutes' | ||
| AND created_at < NOW() - INTERVAL '15 minutes') AS previous_requests, | ||
| COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '2 hours' | ||
| AND created_at < NOW() - INTERVAL '30 minutes') / 6.0 AS avg_baseline, | ||
| COUNT(DISTINCT kilo_user_id) FILTER (WHERE created_at >= NOW() - INTERVAL '15 minutes') | ||
| AS unique_users_current, | ||
| COUNT(DISTINCT kilo_user_id) FILTER (WHERE created_at >= NOW() - INTERVAL '2 hours' | ||
| AND created_at < NOW() - INTERVAL '30 minutes') | ||
| AS unique_users_baseline | ||
| FROM ${microdollar_usage} | ||
| WHERE | ||
| created_at >= NOW() - INTERVAL '2 hours' | ||
| AND has_error = false | ||
| AND requested_model IN (${sql.join(monitoredModels, sql`, `)}) | ||
| GROUP BY requested_model | ||
| ) | ||
| SELECT | ||
| requested_model, | ||
| COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '15 minutes') AS current_requests, | ||
| COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '30 minutes' | ||
| AND created_at < NOW() - INTERVAL '15 minutes') AS previous_requests, | ||
| COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '2 hours' | ||
| AND created_at < NOW() - INTERVAL '30 minutes') / 6.0 AS avg_baseline | ||
| FROM ${microdollar_usage} | ||
| WHERE | ||
| created_at >= NOW() - INTERVAL '2 hours' | ||
| AND has_error = false | ||
| AND requested_model IN (${sql.join(monitoredModels, sql`, `)}) | ||
| GROUP BY requested_model | ||
| ) | ||
| SELECT | ||
| requested_model, | ||
| current_requests::text AS current_requests, | ||
| previous_requests::text AS previous_requests, | ||
| ROUND(avg_baseline)::text AS baseline_requests | ||
| FROM all_periods | ||
| `); | ||
| current_requests::text AS current_requests, | ||
| previous_requests::text AS previous_requests, | ||
| ROUND(avg_baseline)::text AS baseline_requests, | ||
| unique_users_current::text AS unique_users_current, | ||
| unique_users_baseline::text AS unique_users_baseline | ||
| FROM all_periods | ||
| `); | ||
| }); | ||
|
|
||
| const models: Record<string, ModelHealthMetrics> = {}; | ||
| let hasSignificantDrop = false; | ||
|
|
@@ -87,6 +107,8 @@ export async function GET( | |
| const currentRequests = parseInt(row.current_requests, 10); | ||
| const previousRequests = parseInt(row.previous_requests, 10); | ||
| const baselineRequests = parseInt(row.baseline_requests, 10); | ||
| const uniqueUsersCurrent = parseInt(row.unique_users_current, 10); | ||
| const uniqueUsersBaseline = parseInt(row.unique_users_baseline, 10); | ||
| const percentChange = | ||
| baselineRequests > 0 | ||
| ? Math.round(((currentRequests - baselineRequests) / baselineRequests) * 100) | ||
|
|
@@ -99,20 +121,25 @@ export async function GET( | |
| baselineRequests, | ||
| percentChange, | ||
| absoluteDrop, | ||
| uniqueUsersCurrent, | ||
| uniqueUsersBaseline, | ||
| }; | ||
|
|
||
| // Alert logic: | ||
| // - High traffic models (>HIGH_BASELINE): Alert on >90% drop | ||
| // - Low traffic models (>LOW_BASELINE && <HIGH_BASELINE): Alert on consecutive zeros (current AND previous) | ||
|
|
||
| if ( | ||
| (baselineRequests > HIGH_BASELINE && percentChange < -90) || | ||
| (baselineRequests > LOW_BASELINE && | ||
| baselineRequests < HIGH_BASELINE && | ||
| currentRequests === 0 && | ||
| previousRequests === 0) | ||
| ) { | ||
| hasSignificantDrop = true; | ||
| // - Only alert if the baseline had enough distinct users to represent organic traffic | ||
|
|
||
| if (uniqueUsersBaseline >= MIN_UNIQUE_USERS_FOR_ALERT) { | ||
| if ( | ||
| (baselineRequests > HIGH_BASELINE && percentChange < -90) || | ||
| (baselineRequests > LOW_BASELINE && | ||
| baselineRequests < HIGH_BASELINE && | ||
| currentRequests === 0 && | ||
| previousRequests === 0) | ||
| ) { | ||
| hasSignificantDrop = true; | ||
| } | ||
| } | ||
| }); | ||
|
|
||
|
|
@@ -125,6 +152,8 @@ export async function GET( | |
| baselineRequests: 0, | ||
| percentChange: 0, | ||
| absoluteDrop: 0, | ||
| uniqueUsersCurrent: 0, | ||
| uniqueUsersBaseline: 0, | ||
| }; | ||
| // Don't mark as unhealthy if no data - baseline is 0 anyway | ||
| } | ||
|
|
@@ -150,6 +179,17 @@ export async function GET( | |
| extra: { monitoredModels }, | ||
| }); | ||
|
|
||
| return NextResponse.json({ healthy: false }, { status: 503 }); | ||
| // Fail open: a query timeout or DB error is not evidence of a model being down. | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. query timeout is currently 15 times actual time it takes to load |
||
| return NextResponse.json( | ||
| { | ||
| healthy: true, | ||
| models: {} as Record<string, ModelHealthMetrics>, | ||
| metadata: { | ||
| timestamp: new Date().toISOString(), | ||
| queryExecutionTimeMs: -1, | ||
| }, | ||
| }, | ||
| { status: 200 } | ||
| ); | ||
| } | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
WARNING: Distinct-user threshold does not block the incident pattern
The incident in the PR description already came from a single actor spread across ~280 accounts. Because
unique_users_baselineis counting distinctkilo_user_ids over the whole baseline window, that actor still clears the>= 20gate and the same pause will continue to look like an organic drop. To protect against the described false positive, this check needs a signal the actor cannot inflate by creating accounts (for example IP / JA4 / user-agent concentration, or a per-identity cap).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we need to look at this specific example as well, but this is probably a good step for now
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it is a good step. As follow up, worth exploring a concentration based signal. Like you mentioned the IP join is expensive, but combining it with users would make the baseline less prone to these attacks since user-level alone wouldn't have caught this one (280 accounts).