Skip to content

Commit 54ba188

Browse files
pranaygp and claude authored
fix: compare benchmarks against PR base branch instead of main (#560)
* fix: compare benchmarks against PR base branch instead of main - Use github.event.pull_request.base.ref instead of hardcoded main - Remove search_artifacts: true to ensure most recent baseline is used - For stacked PRs, this compares against the parent PR's baseline 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * fix: group failed e2e tests by category and app in summary Instead of listing each failed test as a separate item, group them by: 1. Category (world): e.g., "Community Worlds", "Vercel Production" 2. App (framework): e.g., "mongodb", "turso", "nextjs-turbopack" This makes the summary much more readable when there are many failures. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * fix: ensure local E2E tests always produce JSON output - Add 'fastify' to app detection list in aggregate-e2e-results.js - Change && to ; so e2e tests run even if dev.test.ts fails - This ensures local-dev, local-prod, and local-postgres categories appear in the E2E summary comment 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * fix: ensure local E2E tests always produce JSON output - Add 'fastify' to app detection list in aggregate-e2e-results.js - Change && to ; so e2e tests run even if dev.test.ts fails - This ensures local-dev, local-prod, and local-postgres categories appear in the E2E summary comment 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * feat: publish CI results to GitHub Pages for docs - Add generate-docs-data.js script to create JSON summaries from CI artifacts - Add publish-results job to tests.yml and benchmarks.yml workflows - Update docs/lib/worlds-data.ts to fetch from GitHub Pages URLs - Results published to https://vercel.github.io/workflow/ci/ This allows the docs worlds page to display actual 
test/benchmark results without requiring a GITHUB_TOKEN. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * fix: correct outputFile path for local E2E test artifacts The --outputFile path was using ../../ which placed files outside the repo because pnpm run test:e2e executes from workspace root, not from the cd'd workbench directory. This prevented local-dev, local-prod, and local-postgres test results from being uploaded as artifacts. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * fix: show green checkmark for skipped tests instead of warning Skipped tests are intentional and shouldn't show as warnings in the E2E test summary comments. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * feat: use collapsible sections in benchmark PR comment Wrap each benchmark, stream benchmarks section, and summary tables in <details> toggles to make the PR comment more compact and readable. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * feat: add Vercel observability links to benchmark PR comments - Store runId in benchmark timing data - Add project-slug to Vercel benchmark matrix - Pass WORKFLOW_VERCEL_PROJECT_SLUG env var to benchmarks - Store Vercel metadata (teamSlug, projectSlug, environment) in timing files - Generate observability deep links for each Vercel world benchmark - Show observability links below Production (Vercel) tables 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * fix: use correct Vercel project slugs for observability links - nextjs-turbopack → example-nextjs-workflow-turbopack - nitro-v3 → workbench-nitro-workflow 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
1 parent 0bbd26f commit 54ba188

File tree

12 files changed

+708
-230
lines changed

12 files changed

+708
-230
lines changed

.github/scripts/aggregate-benchmarks.js

Lines changed: 70 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,15 @@ const path = require('path');
77
const args = process.argv.slice(2);
88
let resultsDir = '.';
99
let baselineDir = null;
10+
let runUrl = '';
1011

1112
for (let i = 0; i < args.length; i++) {
1213
if (args[i] === '--baseline' && args[i + 1]) {
1314
baselineDir = args[i + 1];
1415
i++;
16+
} else if (args[i] === '--run-url' && args[i + 1]) {
17+
runUrl = args[i + 1];
18+
i++;
1519
} else if (!args[i].startsWith('--')) {
1620
resultsDir = args[i];
1721
}
@@ -127,6 +131,16 @@ function loadTimingData(benchmarkFile) {
127131
return null;
128132
}
129133

134+
// Generate Vercel observability URL for a workflow run
135+
function getObservabilityUrl(vercelMetadata, runId) {
136+
if (!vercelMetadata || !runId) return null;
137+
const { teamSlug, projectSlug, environment } = vercelMetadata;
138+
if (!teamSlug || !projectSlug) return null;
139+
// Always use 'preview' for PR benchmarks
140+
const env = environment === 'production' ? 'production' : 'preview';
141+
return `https://vercel.com/${teamSlug}/${projectSlug}/observability/workflows/runs/${runId}?environment=${env}`;
142+
}
143+
130144
// Collect all benchmark data
131145
function collectBenchmarkData(resultFiles) {
132146
// Structure: { [benchmarkName]: { [app]: { [backend]: { wallTime, workflowTime, overhead, min, max, samples, firstByteTime } } } }
@@ -162,13 +176,29 @@ function collectBenchmarkData(resultFiles) {
162176
// Get workflow timing if available
163177
let workflowTimeMs = null;
164178
let firstByteTimeMs = null;
179+
let lastRunId = null;
180+
let observabilityUrl = null;
165181
if (timings?.summary?.[benchName]) {
166182
workflowTimeMs = timings.summary[benchName].avgExecutionTimeMs;
167183
// Get TTFB for stream benchmarks
168184
if (timings.summary[benchName].avgFirstByteTimeMs !== undefined) {
169185
firstByteTimeMs = timings.summary[benchName].avgFirstByteTimeMs;
170186
}
171187
}
188+
// Get the last runId for observability link (Vercel only)
189+
if (timings?.timings?.[benchName]?.length > 0) {
190+
const lastTiming =
191+
timings.timings[benchName][
192+
timings.timings[benchName].length - 1
193+
];
194+
lastRunId = lastTiming?.runId;
195+
if (timings?.vercel && lastRunId) {
196+
observabilityUrl = getObservabilityUrl(
197+
timings.vercel,
198+
lastRunId
199+
);
200+
}
201+
}
172202

173203
data[benchName][app][backend] = {
174204
wallTime: bench.mean,
@@ -179,6 +209,8 @@ function collectBenchmarkData(resultFiles) {
179209
max: bench.max,
180210
samples: bench.sampleCount,
181211
firstByteTime: firstByteTimeMs,
212+
runId: lastRunId,
213+
observabilityUrl: observabilityUrl,
182214
};
183215
}
184216
}
@@ -378,6 +410,18 @@ function renderBenchmarkTable(
378410
}
379411
}
380412
console.log('');
413+
414+
// Collect and render observability links for Vercel world
415+
const observabilityLinks = dataPoints
416+
.filter((dp) => dp.metrics?.observabilityUrl && dp.backend === 'vercel')
417+
.map((dp) => {
418+
const frameworkInfo = frameworkConfig[dp.app] || { label: dp.app };
419+
return `[${frameworkInfo.label}](${dp.metrics.observabilityUrl})`;
420+
});
421+
422+
if (observabilityLinks.length > 0) {
423+
console.log(`_🔍 Observability: ${observabilityLinks.join(' | ')}_\n`);
424+
}
381425
}
382426

383427
// Render the comparison tables
@@ -419,7 +463,8 @@ function renderComparison(data, baselineData) {
419463
const renderBenchmarkWithEnvironments = (benchName, benchData, isStream) => {
420464
const baselineBenchData = baselineData?.[benchName] || null;
421465

422-
console.log(`## ${benchName}\n`);
466+
console.log(`<details>`);
467+
console.log(`<summary><strong>${benchName}</strong></summary>\n`);
423468

424469
// Render Local Development table
425470
if (localDevBackends.length > 0) {
@@ -448,6 +493,8 @@ function renderComparison(data, baselineData) {
448493
{ showHeading: false }
449494
);
450495
}
496+
497+
console.log('</details>\n');
451498
};
452499

453500
// Render regular benchmarks
@@ -457,15 +504,16 @@ function renderComparison(data, baselineData) {
457504

458505
// Render stream benchmarks in a separate section
459506
if (streamBenchmarks.length > 0) {
460-
console.log('---\n');
461-
console.log('## Stream Benchmarks\n');
507+
console.log('<details>');
462508
console.log(
463-
'_Stream benchmarks include Time to First Byte (TTFB) metrics._\n'
509+
'<summary><strong>Stream Benchmarks</strong> <em>(includes TTFB metrics)</em></summary>\n'
464510
);
465511

466512
for (const [benchName, benchData] of streamBenchmarks) {
467513
renderBenchmarkWithEnvironments(benchName, benchData, true);
468514
}
515+
516+
console.log('</details>\n');
469517
}
470518

471519
// Summary: Count wins per framework (within each world) and per world (within each framework)
@@ -541,8 +589,11 @@ function renderComparison(data, baselineData) {
541589
}
542590

543591
// Summary: Best framework per world (by wins)
544-
console.log('---\n');
545-
console.log('## Summary: Fastest Framework by World\n');
592+
console.log('### Summary\n');
593+
console.log('<details>');
594+
console.log(
595+
'<summary><strong>Fastest Framework by World</strong></summary>\n'
596+
);
546597
console.log(`_Winner determined by most benchmark wins_\n`);
547598
console.log('| World | 🥇 Fastest Framework | Wins |');
548599
console.log('|:------|:---------------------|-----:|');
@@ -579,10 +630,13 @@ function renderComparison(data, baselineData) {
579630
);
580631
}
581632
}
582-
console.log('');
633+
console.log('\n</details>\n');
583634

584635
// Summary: Best world per framework (by wins)
585-
console.log('## Summary: Fastest World by Framework\n');
636+
console.log('<details>');
637+
console.log(
638+
'<summary><strong>Fastest World by Framework</strong></summary>\n'
639+
);
586640
console.log(`_Winner determined by most benchmark wins_\n`);
587641
console.log('| Framework | 🥇 Fastest World | Wins |');
588642
console.log('|:----------|:-----------------|-----:|');
@@ -615,11 +669,11 @@ function renderComparison(data, baselineData) {
615669
);
616670
}
617671
}
618-
console.log('');
672+
console.log('\n</details>\n');
619673

620674
// Legend
621675
console.log('<details>');
622-
console.log('<summary>Column Definitions</summary>\n');
676+
console.log('<summary><strong>Column Definitions</strong></summary>\n');
623677
console.log(
624678
'- **Workflow Time**: Runtime reported by workflow (completedAt - createdAt) - *primary metric*'
625679
);
@@ -646,6 +700,12 @@ function renderComparison(data, baselineData) {
646700
}
647701
}
648702
console.log('</details>');
703+
704+
// Add link to workflow run
705+
if (runUrl) {
706+
console.log('\n---');
707+
console.log(`📋 [View full workflow run](${runUrl})`);
708+
}
649709
}
650710

651711
// Main

.github/scripts/aggregate-e2e-results.js

Lines changed: 58 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ const args = process.argv.slice(2);
88
let resultsDir = '.';
99
let jobName = 'E2E Tests';
1010
let mode = 'single'; // 'single' for step summary, 'aggregate' for PR comment
11+
let runUrl = '';
1112

1213
for (let i = 0; i < args.length; i++) {
1314
if (args[i] === '--job-name' && args[i + 1]) {
@@ -16,6 +17,9 @@ for (let i = 0; i < args.length; i++) {
1617
} else if (args[i] === '--mode' && args[i + 1]) {
1718
mode = args[i + 1];
1819
i++;
20+
} else if (args[i] === '--run-url' && args[i + 1]) {
21+
runUrl = args[i + 1];
22+
i++;
1923
} else if (!args[i].startsWith('--')) {
2024
resultsDir = args[i];
2125
}
@@ -106,6 +110,7 @@ function parseJobInfo(filename) {
106110
'sveltekit',
107111
'hono',
108112
'express',
113+
'fastify',
109114
'astro',
110115
'example',
111116
'turso',
@@ -211,14 +216,9 @@ function aggregateByCategory(files) {
211216
function renderSingleJobSummary(summary) {
212217
const total =
213218
summary.totalPassed + summary.totalFailed + summary.totalSkipped;
214-
const statusEmoji =
215-
summary.totalFailed > 0 ? '❌' : summary.totalSkipped > 0 ? '⚠️' : '✅';
219+
const statusEmoji = summary.totalFailed > 0 ? '❌' : '✅';
216220
const statusText =
217-
summary.totalFailed > 0
218-
? 'Some tests failed'
219-
: summary.totalSkipped > 0
220-
? 'All tests passed (some skipped)'
221-
: 'All tests passed';
221+
summary.totalFailed > 0 ? 'Some tests failed' : 'All tests passed';
222222

223223
console.log(`## ${statusEmoji} ${jobName}\n`);
224224
console.log(`**Status:** ${statusText}\n`);
@@ -259,8 +259,7 @@ function renderSingleJobSummary(summary) {
259259
console.log('| File | Passed | Failed | Skipped |');
260260
console.log('|:-----|-------:|-------:|--------:|');
261261
for (const result of summary.fileResults) {
262-
const fileStatus =
263-
result.failed > 0 ? '❌' : result.skipped > 0 ? '⚠️' : '✅';
262+
const fileStatus = result.failed > 0 ? '❌' : '✅';
264263
console.log(
265264
`| ${fileStatus} ${result.file} | ${result.passed} | ${result.failed} | ${result.skipped} |`
266265
);
@@ -297,18 +296,9 @@ function renderAggregatedSummary(categories, overallSummary) {
297296
overallSummary.totalPassed +
298297
overallSummary.totalFailed +
299298
overallSummary.totalSkipped;
300-
const statusEmoji =
301-
overallSummary.totalFailed > 0
302-
? '❌'
303-
: overallSummary.totalSkipped > 0
304-
? '⚠️'
305-
: '✅';
299+
const statusEmoji = overallSummary.totalFailed > 0 ? '❌' : '✅';
306300
const statusText =
307-
overallSummary.totalFailed > 0
308-
? 'Some tests failed'
309-
: overallSummary.totalSkipped > 0
310-
? 'All tests passed (some skipped)'
311-
: 'All tests passed';
301+
overallSummary.totalFailed > 0 ? 'Some tests failed' : 'All tests passed';
312302

313303
console.log('<!-- e2e-test-results -->');
314304
console.log(`## 🧪 E2E Test Results\n`);
@@ -328,7 +318,7 @@ function renderAggregatedSummary(categories, overallSummary) {
328318

329319
for (const [catName, cat] of sortedCategories) {
330320
const catTotal = cat.passed + cat.failed + cat.skipped;
331-
const catStatus = cat.failed > 0 ? '❌' : cat.skipped > 0 ? '⚠️' : '✅';
321+
const catStatus = cat.failed > 0 ? '❌' : '✅';
332322
const displayName = categoryNames[catName] || catName;
333323
console.log(
334324
`| ${catStatus} ${displayName} | ${cat.passed} | ${cat.failed} | ${cat.skipped} | ${catTotal} |`
@@ -340,21 +330,52 @@ function renderAggregatedSummary(categories, overallSummary) {
340330
);
341331
console.log('');
342332

343-
// Failed tests section
333+
// Failed tests section - grouped by category and app
344334
if (overallSummary.allFailedTests.length > 0) {
345335
console.log('### ❌ Failed Tests\n');
336+
337+
// Group failed tests by category, then by app
338+
const failedByCategory = new Map();
346339
for (const test of overallSummary.allFailedTests) {
347-
const catDisplay = categoryNames[test.category] || test.category;
340+
if (!failedByCategory.has(test.category)) {
341+
failedByCategory.set(test.category, new Map());
342+
}
343+
const catMap = failedByCategory.get(test.category);
344+
if (!catMap.has(test.app)) {
345+
catMap.set(test.app, []);
346+
}
347+
catMap.get(test.app).push(test);
348+
}
349+
350+
// Sort categories by defined order
351+
const sortedFailedCategories = Array.from(failedByCategory.entries()).sort(
352+
([a], [b]) =>
353+
(categoryOrder.indexOf(a) === -1 ? 999 : categoryOrder.indexOf(a)) -
354+
(categoryOrder.indexOf(b) === -1 ? 999 : categoryOrder.indexOf(b))
355+
);
356+
357+
for (const [catName, appsMap] of sortedFailedCategories) {
358+
const catDisplay = categoryNames[catName] || catName;
359+
const catFailedCount = Array.from(appsMap.values()).reduce(
360+
(sum, tests) => sum + tests.length,
361+
0
362+
);
363+
348364
console.log(`<details>`);
349365
console.log(
350-
`<summary>${test.app} (${catDisplay}): ${test.name}</summary>\n`
366+
`<summary>${catDisplay} (${catFailedCount} failed)</summary>\n`
351367
);
352-
console.log(`**File:** \`${test.file}\`\n`);
353-
if (test.message) {
354-
console.log('```');
355-
console.log(test.message);
356-
console.log('```');
368+
369+
for (const [appName, tests] of appsMap.entries()) {
370+
console.log(`**${appName}** (${tests.length} failed):\n`);
371+
for (const test of tests) {
372+
// Extract just the test name without "e2e " prefix if present
373+
const testName = test.name.replace(/^e2e\s+/, '');
374+
console.log(`- \`${testName}\``);
375+
}
376+
console.log('');
357377
}
378+
358379
console.log('</details>\n');
359380
}
360381
}
@@ -363,21 +384,27 @@ function renderAggregatedSummary(categories, overallSummary) {
363384
console.log('### Details by Category\n');
364385

365386
for (const [catName, cat] of sortedCategories) {
366-
const catStatus = cat.failed > 0 ? '❌' : cat.skipped > 0 ? '⚠️' : '✅';
387+
const catStatus = cat.failed > 0 ? '❌' : '✅';
367388
const displayName = categoryNames[catName] || catName;
368389

369390
console.log(`<details>`);
370391
console.log(`<summary>${catStatus} ${displayName}</summary>\n`);
371392
console.log('| App | Passed | Failed | Skipped |');
372393
console.log('|:----|-------:|-------:|--------:|');
373394
for (const app of cat.apps) {
374-
const appStatus = app.failed > 0 ? '❌' : app.skipped > 0 ? '⚠️' : '✅';
395+
const appStatus = app.failed > 0 ? '❌' : '✅';
375396
console.log(
376397
`| ${appStatus} ${app.name} | ${app.passed} | ${app.failed} | ${app.skipped} |`
377398
);
378399
}
379400
console.log('</details>\n');
380401
}
402+
403+
// Add link to workflow run
404+
if (runUrl) {
405+
console.log('---');
406+
console.log(`📋 [View full workflow run](${runUrl})`);
407+
}
381408
}
382409

383410
// Main

0 commit comments

Comments
 (0)