Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions packages/app/cypress/e2e/measured-power-overlay.cy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Verifies the new measured-power Y-axis options render on the unofficial-run
// overlay path against a real GitHub Actions artifact (run 26312107787 — the
// on-PR sweep for PR #1558 / qwen3.5-fp8-h200-sglang). This is the canonical
// "preview before merge" test path per CLAUDE.md's overlay requirement.

/**
 * End-to-end coverage for the measured-power Y-axis options on the
 * unofficial-run overlay path.
 *
 * Each test starts from a fresh visit of `/inference?unofficialrun=…` and waits
 * for the chart container to exist before interacting with the Y-axis selector.
 */
describe('Measured power on unofficial-run overlay', () => {
  // Selectors shared by every test in this suite.
  const CHART_DISPLAY = '[data-testid="inference-chart-display"]';
  const Y_AXIS_SELECTOR = '[data-testid="yaxis-metric-selector"]';
  const SELECT_CONTENT = '[data-slot="select-content"]';
  const SELECT_OPTION = '[role="option"]';

  beforeEach(() => {
    cy.visit('/inference?unofficialrun=26312107787', {
      onBeforeLoad(win) {
        // Seed localStorage before the app boots. NOTE(review): judging by the
        // key names, this pre-dismisses the star modal and turns on the feature
        // gate that exposes the "Measured Energy" group — confirm against the app.
        win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
        win.localStorage.setItem('inferencex-feature-gate', '1');
      },
    });
    // The overlay is fetched from a real run artifact, so allow a generous timeout.
    cy.get(CHART_DISPLAY, { timeout: 30_000 }).should('exist');
  });

  it('exposes the Measured Energy dropdown group and renders overlay points', () => {
    // Open Y-axis dropdown
    cy.get(Y_AXIS_SELECTOR).click();
    cy.get(SELECT_CONTENT).should('exist');

    // Verify the gated "Measured Energy" group + both options. The select list is a
    // scroll container (max-h-72 overflow-y-auto), and this group sits below the fold,
    // so scroll each target into view before asserting visibility.
    //
    // The group label is looked up *inside* the select content. The two-argument
    // form `cy.contains(selector, text)` yields the element matching `selector`
    // (the whole popup), so scrolling/visibility would be checked on the container
    // rather than on the label itself.
    cy.get(SELECT_CONTENT).contains('Measured Energy').scrollIntoView().should('be.visible');
    cy.contains(SELECT_OPTION, 'Measured Average Power per GPU')
      .scrollIntoView()
      .should('be.visible');
    cy.contains(SELECT_OPTION, 'Measured Joules per Output Token')
      .scrollIntoView()
      .should('be.visible');

    // Select the power option; the popup should close afterwards.
    cy.contains(SELECT_OPTION, 'Measured Average Power per GPU').click();
    cy.get(SELECT_CONTENT).should('not.exist');

    // Initial-load screenshot
    cy.screenshot('measured-power-selected', { capture: 'viewport' });

    // Only the presence of an <svg> inside the chart container is asserted here.
    // The rendered marks (overlay points typically render as triangles) are not
    // inspected — visual correctness is reviewed in the screenshot.
    cy.get(`${CHART_DISPLAY} svg`, { timeout: 10_000 }).should('exist');
  });

  it('switches to Measured Joules per Output Token without errors', () => {
    cy.get(Y_AXIS_SELECTOR).click();
    cy.contains(SELECT_OPTION, 'Measured Joules per Output Token').click();
    cy.get(SELECT_CONTENT).should('not.exist');
    cy.screenshot('measured-joules-selected', { capture: 'viewport' });
    cy.get(`${CHART_DISPLAY} svg`).should('exist');
  });
});
30 changes: 30 additions & 0 deletions packages/app/src/app/api/unofficial-run/route.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,36 @@ describe('normalizeArtifactRows', () => {
);
expect(rows.every((r) => r.date === '2026-03-11')).toBe(true);
});

it('surfaces the per-worker measured-power array on the BenchmarkRow', () => {
  // Disaggregated payload: a prefill worker that reports temperature and a
  // two-host decode worker that does not.
  const prefillWorker = {
    role: 'prefill',
    worker_idx: 0,
    hosts: ['pn0'],
    num_gpus: 4,
    avg_power_w: 612.3,
    avg_temp_c: 71.2,
  };
  const decodeWorker = {
    role: 'decode',
    worker_idx: 0,
    hosts: ['dn0', 'dn1'],
    num_gpus: 8,
    avg_power_w: 712.1,
  };
  const artifactRow = rawRow({ workers: [prefillWorker, decodeWorker] });

  const [normalized] = normalizeArtifactRows([artifactRow], '2026-03-01');
  const surfaced = normalized.workers;

  // Both entries arrive on the row with their fields intact.
  expect(surfaced).toHaveLength(2);
  expect(surfaced![0].hosts).toEqual(['pn0']);
  expect(surfaced![0].avg_temp_c).toBe(71.2);
  expect(surfaced![1].role).toBe('decode');
});

it('leaves workers undefined when the artifact omits the field', () => {
  // rawRow() is called without overrides, so no `workers` value is supplied here.
  const [onlyRow] = normalizeArtifactRows([rawRow()], '2026-03-01');
  expect(onlyRow.workers).toBeUndefined();
});
});

describe('normalizeEvalArtifactRows', () => {
Expand Down
3 changes: 3 additions & 0 deletions packages/app/src/app/api/unofficial-run/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ export function normalizeArtifactRows(
conc: params.conc,
image: params.image,
metrics: params.metrics,
// Surface the same per-worker payload the DB path emits so unofficial
// overlays carry the multinode measured-power breakdown too.
workers: params.workers,
date,
run_url: runUrl,
});
Expand Down
69 changes: 69 additions & 0 deletions packages/app/src/components/inference/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,50 @@ import type React from 'react';
import type { HardwareEntry } from '@/lib/constants';
import type { Model, Sequence } from '@/lib/data-mappings';

/**
 * The known roles a worker process can take in a multinode / disaggregated
 * deployment.
 *
 * Note that `WorkerPower.role` is declared as plain `string`, not as this
 * union: the runner emits the role at the JSONB boundary, so the value cannot
 * be statically guaranteed. Code that branches on the role should narrow with
 * a literal comparison (`if (role === 'prefill') ...`) or cast to `WorkerRole`
 * at the point of use.
 */
export type WorkerRole =
  // First half of a disaggregated serving setup.
  | 'prefill'
  // Second half of a disaggregated serving setup.
  | 'decode'
  // Aggregated (non-disagg) worker that handles both phases.
  | 'agg'
  // Router / load-balancer process (typically zero GPUs).
  | 'frontend';

/**
 * Measured power record for a single worker, as emitted by the runner's
 * aggregate_power.py for multinode and disaggregated runs. The chart layer can
 * use these entries to show a stacked breakdown of where energy is spent
 * across worker types.
 *
 * Every telemetry field besides power is optional so that callers can tell
 * "field absent from this run" apart from "field present and equal to 0".
 */
export interface WorkerPower {
  /**
   * Worker role. Typed as `string` rather than `WorkerRole` so the shape lines
   * up with the JSONB column without an unsafe cast at every boundary; chart
   * code can still narrow on the literal values it understands.
   */
  role: string;
  /** Worker index. NOTE(review): presumably scoped per role — confirm against the runner. */
  worker_idx: number;
  /** Number of GPUs attributed to this worker. */
  num_gpus: number;
  /** Measured average power for this worker, in watts per the `_w` suffix. */
  avg_power_w: number;
  /**
   * Hostnames of the nodes whose perfmon CSVs were rolled up into this entry
   * (one host for a single-node worker; four for a decode worker spanning four
   * nodes). Optional because pre-multinode versions of aggregate_power.py
   * didn't emit it.
   */
  hosts?: string[];
  // The four fields below mirror the cluster-wide telemetry scalars and are
  // only present when the perfmon CSVs include the corresponding sample columns.
  avg_temp_c?: number;
  peak_temp_c?: number;
  avg_util_pct?: number;
  avg_mem_used_mb?: number;
}

/**
* Represents an aggregated data entry, typically from a raw data source.
* This interface contains various performance metrics.
Expand Down Expand Up @@ -72,6 +116,31 @@ export interface AggDataEntry {
avg_power_w?: number;
joules_per_output_token?: number;
joules_per_total_token?: number;
// Multinode / disagg-only measured power. The aggregate_power.py runner
// emits per-role energy splits when the deployment has separate prefill
// and decode workers (single-node disagg or multinode disagg). Single-node
// aggregated configs leave these undefined.
// - prefill_avg_power_w / decode_avg_power_w: mean per-GPU draw (W) within each role
// - joules_per_input_token: prefill_energy / total_input_tokens (prefill GPUs only)
// The disagg decode-only J/output is carried by joules_per_output_token above
// (the runner overrides it to decode_energy / total_output_tokens on disagg) —
// there is no separate _decode field.
prefill_avg_power_w?: number;
decode_avg_power_w?: number;
joules_per_input_token?: number;
// Cluster-wide GPU telemetry beyond power (temperature, utilization, memory).
// Emitted by aggregate_power.py when the perfmon CSVs include the matching
// sample columns. Optional because older runs (and runs without the relevant
// perfmon samples) leave them unset — the chart layer must distinguish "no
// measurement" from "0".
avg_temp_c?: number;
peak_temp_c?: number;
avg_util_pct?: number;
avg_mem_used_mb?: number;
// Per-worker measured power breakdown. Each entry is one worker process
// (a prefill, decode, agg, or frontend role). Optional because pre-multinode
// and pre-aggregate_power.py runs don't emit it.
workers?: WorkerPower[];
disagg: boolean;
num_prefill_gpu: number;
num_decode_gpu: number;
Expand Down
11 changes: 11 additions & 0 deletions packages/app/src/lib/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
* Each function is a thin fetch wrapper returning typed data.
*/

import type { WorkerPower } from '@/components/inference/types';

import type { SubmissionsResponse } from './submissions-types';

export interface BenchmarkRow {
Expand All @@ -28,6 +30,15 @@ export interface BenchmarkRow {
conc: number;
image: string | null;
metrics: Record<string, number>;
/**
* Per-worker measured power for multinode / disagg runs. The runner emits
* this as a JSONB sibling of the scalar metrics; the API layer surfaces it
* as a separate field here so the scalar `metrics` index signature can stay
* `Record<string, number>` and existing `m.x ?? 0` call sites keep narrowing
* cleanly. Undefined for single-node runs and any run predating
* aggregate_power.py.
*/
workers?: WorkerPower[];
date: string;
run_url: string | null;
}
Expand Down
110 changes: 110 additions & 0 deletions packages/app/src/lib/benchmark-transform.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,116 @@ describe('rowToAggDataEntry', () => {
expect(entry.avg_power_w).toBeUndefined();
expect(entry.joules_per_output_token).toBeUndefined();
});

it('passes through multinode / disagg role-split power scalars when present', () => {
  const roleSplitMetrics = {
    tput_per_gpu: 100,
    prefill_avg_power_w: 612.3,
    decode_avg_power_w: 701.5,
    joules_per_input_token: 1.2,
    // disagg: joules_per_output_token IS the per-stage decode value.
    joules_per_output_token: 9.7,
  };

  const {
    prefill_avg_power_w: prefillPower,
    decode_avg_power_w: decodePower,
    joules_per_input_token: joulesIn,
    joules_per_output_token: joulesOut,
  } = rowToAggDataEntry(makeRow({ metrics: roleSplitMetrics }));

  // Each scalar must come through unchanged.
  expect(prefillPower).toBe(612.3);
  expect(decodePower).toBe(701.5);
  expect(joulesIn).toBe(1.2);
  expect(joulesOut).toBe(9.7);
});

it('passes through per-worker measured power array intact', () => {
  // Two prefill workers, one decode worker, and a zero-GPU frontend.
  const workerBreakdown = [
    { role: 'prefill' as const, worker_idx: 0, num_gpus: 4, avg_power_w: 588.4 },
    { role: 'prefill' as const, worker_idx: 1, num_gpus: 4, avg_power_w: 601.2 },
    { role: 'decode' as const, worker_idx: 0, num_gpus: 8, avg_power_w: 712.1 },
    { role: 'frontend' as const, worker_idx: 0, num_gpus: 0, avg_power_w: 0 },
  ];
  const sourceRow = makeRow({ workers: workerBreakdown });

  expect(rowToAggDataEntry(sourceRow).workers).toEqual(workerBreakdown);
});

it('defensively drops a non-array workers payload', () => {
  // The DB JSONB column is untyped at the wire boundary, so a malformed row
  // must not reach downstream consumers. The cast forces such a row past the
  // compiler.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const malformedRow = makeRow({ workers: 'oops' as any });

  const { workers: transformed } = rowToAggDataEntry(malformedRow);

  expect(transformed).toBeUndefined();
});

it('leaves multinode role-split scalars and workers undefined for legacy rows', () => {
  // Single-node configs predating the multinode runner don't emit any of the
  // role-split fields. The transform must yield undefined (not 0) so the chart
  // layer can distinguish "no measurement" from a real zero.
  const legacyEntry = rowToAggDataEntry(makeRow({ metrics: {} }));
  const roleSplitFields = [
    'prefill_avg_power_w',
    'decode_avg_power_w',
    'joules_per_input_token',
    'workers',
  ] as const;

  for (const field of roleSplitFields) {
    expect(legacyEntry[field]).toBeUndefined();
  }
});

it('passes through cluster-wide temp/util/mem scalars when present', () => {
  const telemetryMetrics = {
    tput_per_gpu: 100,
    avg_temp_c: 68.4,
    peak_temp_c: 79.2,
    avg_util_pct: 88.5,
    avg_mem_used_mb: 71234.5,
  };

  const {
    avg_temp_c: avgTemp,
    peak_temp_c: peakTemp,
    avg_util_pct: avgUtil,
    avg_mem_used_mb: avgMem,
  } = rowToAggDataEntry(makeRow({ metrics: telemetryMetrics }));

  // Each telemetry scalar must come through unchanged.
  expect(avgTemp).toBe(68.4);
  expect(peakTemp).toBe(79.2);
  expect(avgUtil).toBe(88.5);
  expect(avgMem).toBe(71234.5);
});

it('leaves cluster-wide temp/util/mem fields undefined when absent (legacy rows)', () => {
  // Same undefined-vs-zero distinction as the measured-power scalars: historic
  // rows predate the perfmon CSV scrape, so missing values must not be
  // silently coerced to 0.
  const legacyEntry = rowToAggDataEntry(makeRow({ metrics: {} }));
  const telemetryFields = [
    'avg_temp_c',
    'peak_temp_c',
    'avg_util_pct',
    'avg_mem_used_mb',
  ] as const;

  for (const field of telemetryFields) {
    expect(legacyEntry[field]).toBeUndefined();
  }
});

it('preserves new optional WorkerPower fields (hosts, telemetry) on workers entries', () => {
  // Prefill worker carrying every optional telemetry field.
  const prefillWithTelemetry = {
    role: 'prefill' as const,
    worker_idx: 0,
    hosts: ['pn0'],
    num_gpus: 4,
    avg_power_w: 612.3,
    avg_temp_c: 71.2,
    peak_temp_c: 78,
    avg_util_pct: 92.1,
    avg_mem_used_mb: 65432,
  };
  // Four-host decode worker that reports power only.
  const decodePowerOnly = {
    role: 'decode' as const,
    worker_idx: 0,
    hosts: ['dn0', 'dn1', 'dn2', 'dn3'],
    num_gpus: 16,
    avg_power_w: 712.1,
  };
  const payload = [prefillWithTelemetry, decodePowerOnly];

  const { workers: passedThrough } = rowToAggDataEntry(makeRow({ workers: payload }));

  expect(passedThrough).toEqual(payload);
  const [prefillOut, decodeOut] = passedThrough!;
  expect(prefillOut.hosts).toEqual(['pn0']);
  expect(prefillOut.avg_temp_c).toBe(71.2);
  expect(decodeOut.hosts).toEqual(['dn0', 'dn1', 'dn2', 'dn3']);
  // Optional telemetry fields stay undefined when source omits them.
  expect(decodeOut.avg_temp_c).toBeUndefined();
});
});

describe('transformBenchmarkRows', () => {
Expand Down
17 changes: 17 additions & 0 deletions packages/app/src/lib/benchmark-transform.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,23 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
avg_power_w: m.avg_power_w,
joules_per_output_token: m.joules_per_output_token,
joules_per_total_token: m.joules_per_total_token,
// Multinode / disagg-only role splits — same undefined-for-legacy pattern.
// (disagg's decode-only J/output is carried by joules_per_output_token above,
// which the runner overrides to the per-stage value — no separate _decode key.)
prefill_avg_power_w: m.prefill_avg_power_w,
decode_avg_power_w: m.decode_avg_power_w,
joules_per_input_token: m.joules_per_input_token,
// Cluster-wide GPU telemetry beyond power. Emitted when the perfmon CSVs
// include the corresponding sample columns; left undefined otherwise so
// the chart layer can distinguish "no measurement" from a real zero.
avg_temp_c: m.avg_temp_c,
peak_temp_c: m.peak_temp_c,
avg_util_pct: m.avg_util_pct,
avg_mem_used_mb: m.avg_mem_used_mb,
// Per-worker measured power. Surfaced on BenchmarkRow as a sibling of the
// scalar `metrics` dict (see api.ts). Narrow defensively so a malformed
// payload can't poison downstream consumers.
workers: Array.isArray(row.workers) ? row.workers : undefined,
disagg: row.disagg,
num_prefill_gpu: row.num_prefill_gpu,
num_decode_gpu: row.num_decode_gpu,
Expand Down
31 changes: 28 additions & 3 deletions packages/constants/src/metric-keys.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,35 @@ export const METRIC_KEYS = new Set([
'std_intvty',
// measured power / energy (emitted by runner's aggregate_power.py)
// avg_power_w: mean per-GPU draw (W) during the load window
// joules_per_output_token: avg_power_w * num_gpus * duration / total_output_tokens
// joules_per_total_token: avg_power_w * num_gpus * duration / (total_input + total_output)
// — workload-shape-fair view that doesn't treat prompt as free
// joules_per_output_token: energy / total_output_tokens. CLUSTER-WIDE on
// single-node / non-disagg (total_system_energy);
// PER-STAGE decode_energy on disagg (decode GPUs only),
// symmetric with joules_per_input_token below.
// joules_per_total_token: total_system_energy / (total_input + total_output)
// — cluster-wide; workload-shape-fair view that
// doesn't treat prompt as free.
'avg_power_w',
'joules_per_output_token',
'joules_per_total_token',
// multinode / disagg role splits (emitted only when the deployment has
// distinct prefill / decode workers)
// prefill_avg_power_w / decode_avg_power_w: mean per-GPU draw within each role
// joules_per_input_token: prefill_energy / total_input_tokens (prefill GPUs only).
// The disagg output counterpart is joules_per_output_token above (decode GPUs
// only) — there is no separate _decode key.
'prefill_avg_power_w',
'decode_avg_power_w',
'joules_per_input_token',
// cluster-wide GPU telemetry beyond power (emitted by aggregate_power.py when
// the perfmon CSVs include temperature, utilization, or memory samples).
// avg_temp_c: mean per-GPU temperature (Celsius) during load window
// peak_temp_c: max instantaneous per-GPU temperature in window
// avg_util_pct: mean per-GPU GPU-utilization percent (0-100)
// avg_mem_used_mb: mean per-GPU memory used (MiB / MB)
// Single-node and multinode runs both surface these as flat scalars; the
// per-worker breakdown carries the same fields on each entry in workers[].
'avg_temp_c',
'peak_temp_c',
'avg_util_pct',
'avg_mem_used_mb',
]);
Loading
Loading