Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 81 additions & 41 deletions apps/web/src/app/api/models/up/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ type ModelHealthMetrics = {
previousRequests: number;
baselineRequests: number;
percentChange: number;
absoluteDrop: number; // Absolute difference (negative for drops)
absoluteDrop: number;
uniqueUsersCurrent: number;
uniqueUsersBaseline: number;
};

type HealthResponseMetadata = {
Expand All @@ -33,9 +35,15 @@ type HealthResponseError = {
const HIGH_BASELINE = 300;
const LOW_BASELINE = 50;

// Models excluded from the health check but still preferred/recommended.
// Useful for preview models with inconsistent traffic that cause false alerts.
const HEALTH_CHECK_EXCLUSIONS = new Set(['google/gemini-3.1-pro-preview']);
// Only alert if the baseline window had at least this many distinct users.
// Prevents abuse actors (who operate many accounts from few IPs) from
// inflating baselines and triggering false drops when they pause.
const MIN_UNIQUE_USERS_FOR_ALERT = 20;

// Statement timeout for the health check query. If the query takes longer,
// we fail open (report healthy) since a timeout is not evidence of a model
// being down.
const STATEMENT_TIMEOUT_MS = 10_000;

export async function GET(
request: Request
Expand All @@ -47,38 +55,50 @@ export async function GET(
return NextResponse.json({ healthy: false }, { status: 401 });
}

const monitoredModels = (await getMonitoredModels()).filter(m => !HEALTH_CHECK_EXCLUSIONS.has(m));
const monitoredModels = await getMonitoredModels();

try {
const queryStartTime = Date.now();
const result = await db.execute<{
requested_model: string;
current_requests: string;
previous_requests: string;
baseline_requests: string;
}>(sql`
WITH all_periods AS (
const result = await db.transaction(async tx => {
await tx.execute(sql.raw(`SET LOCAL statement_timeout = '${STATEMENT_TIMEOUT_MS}'`));
return tx.execute<{
requested_model: string;
current_requests: string;
previous_requests: string;
baseline_requests: string;
unique_users_current: string;
unique_users_baseline: string;
}>(sql`
WITH all_periods AS (
SELECT
requested_model,
COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '15 minutes') AS current_requests,
COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '30 minutes'
AND created_at < NOW() - INTERVAL '15 minutes') AS previous_requests,
COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '2 hours'
AND created_at < NOW() - INTERVAL '30 minutes') / 6.0 AS avg_baseline,
COUNT(DISTINCT kilo_user_id) FILTER (WHERE created_at >= NOW() - INTERVAL '15 minutes')
AS unique_users_current,
COUNT(DISTINCT kilo_user_id) FILTER (WHERE created_at >= NOW() - INTERVAL '2 hours'
AND created_at < NOW() - INTERVAL '30 minutes')
AS unique_users_baseline
FROM ${microdollar_usage}
WHERE
created_at >= NOW() - INTERVAL '2 hours'
AND has_error = false
AND requested_model IN (${sql.join(monitoredModels, sql`, `)})
GROUP BY requested_model
)
SELECT
requested_model,
COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '15 minutes') AS current_requests,
COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '30 minutes'
AND created_at < NOW() - INTERVAL '15 minutes') AS previous_requests,
COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '2 hours'
AND created_at < NOW() - INTERVAL '30 minutes') / 6.0 AS avg_baseline
FROM ${microdollar_usage}
WHERE
created_at >= NOW() - INTERVAL '2 hours'
AND has_error = false
AND requested_model IN (${sql.join(monitoredModels, sql`, `)})
GROUP BY requested_model
)
SELECT
requested_model,
current_requests::text AS current_requests,
previous_requests::text AS previous_requests,
ROUND(avg_baseline)::text AS baseline_requests
FROM all_periods
`);
current_requests::text AS current_requests,
previous_requests::text AS previous_requests,
ROUND(avg_baseline)::text AS baseline_requests,
unique_users_current::text AS unique_users_current,
unique_users_baseline::text AS unique_users_baseline
FROM all_periods
`);
});

const models: Record<string, ModelHealthMetrics> = {};
let hasSignificantDrop = false;
Expand All @@ -87,6 +107,8 @@ export async function GET(
const currentRequests = parseInt(row.current_requests, 10);
const previousRequests = parseInt(row.previous_requests, 10);
const baselineRequests = parseInt(row.baseline_requests, 10);
const uniqueUsersCurrent = parseInt(row.unique_users_current, 10);
const uniqueUsersBaseline = parseInt(row.unique_users_baseline, 10);
const percentChange =
baselineRequests > 0
? Math.round(((currentRequests - baselineRequests) / baselineRequests) * 100)
Expand All @@ -99,20 +121,25 @@ export async function GET(
baselineRequests,
percentChange,
absoluteDrop,
uniqueUsersCurrent,
uniqueUsersBaseline,
};

// Alert logic:
// - High traffic models (>HIGH_BASELINE): Alert on >90% drop
// - Low traffic models (>LOW_BASELINE && <HIGH_BASELINE): Alert on consecutive zeros (current AND previous)

if (
(baselineRequests > HIGH_BASELINE && percentChange < -90) ||
(baselineRequests > LOW_BASELINE &&
baselineRequests < HIGH_BASELINE &&
currentRequests === 0 &&
previousRequests === 0)
) {
hasSignificantDrop = true;
// - Only alert if the baseline had enough distinct users to represent organic traffic

if (uniqueUsersBaseline >= MIN_UNIQUE_USERS_FOR_ALERT) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: Distinct-user threshold does not block the incident pattern

The incident described in the PR already came from a single actor spread across ~280 accounts. Because unique_users_baseline counts distinct kilo_user_ids over the whole baseline window, that actor still clears the >= 20 gate, and the same pause will continue to look like an organic drop. To protect against the described false positive, this check needs a signal the actor cannot inflate by creating accounts (for example IP / JA4 / user-agent concentration, or a per-identity cap on baseline contribution).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to look at this specific example as well, but this is probably a good step for now

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is a good step. As a follow-up, it's worth exploring a concentration-based signal. As you mentioned, the IP join is expensive, but combining it with the user count would make the baseline less prone to these attacks, since a user-level signal alone wouldn't have caught this one (280 accounts).

if (
(baselineRequests > HIGH_BASELINE && percentChange < -90) ||
(baselineRequests > LOW_BASELINE &&
baselineRequests < HIGH_BASELINE &&
currentRequests === 0 &&
previousRequests === 0)
) {
hasSignificantDrop = true;
}
}
});

Expand All @@ -125,6 +152,8 @@ export async function GET(
baselineRequests: 0,
percentChange: 0,
absoluteDrop: 0,
uniqueUsersCurrent: 0,
uniqueUsersBaseline: 0,
};
// Don't mark as unhealthy if no data - baseline is 0 anyway
}
Expand All @@ -150,6 +179,17 @@ export async function GET(
extra: { monitoredModels },
});

return NextResponse.json({ healthy: false }, { status: 503 });
// Fail open: a query timeout or DB error is not evidence of a model being down.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The query timeout is currently about 15 times the actual time the query takes, so a timeout strongly indicates a DB problem rather than a model outage.

return NextResponse.json(
{
healthy: true,
models: {} as Record<string, ModelHealthMetrics>,
metadata: {
timestamp: new Date().toISOString(),
queryExecutionTimeMs: -1,
},
},
{ status: 200 }
);
}
}
Loading