diff --git a/actions/setup/js/create_forecast_issue.cjs b/actions/setup/js/create_forecast_issue.cjs index 2094dbbbfc2..24019ebb810 100644 --- a/actions/setup/js/create_forecast_issue.cjs +++ b/actions/setup/js/create_forecast_issue.cjs @@ -37,19 +37,53 @@ function formatAIC(value) { */ function buildForecastIssueBody(report, options) { const workflows = Array.isArray(report?.workflows) ? report.workflows : []; - const rows = workflows.map(workflow => { - const p50 = workflow?.monte_carlo?.p50_projected_aic ?? workflow?.projected_aic ?? workflow?.monte_carlo?.p50_projected_effective_tokens ?? workflow?.projected_effective_tokens ?? 0; - return [escapeCell(workflow.workflow_id), workflow.sampled_runs ?? 0, Number(p50)]; + + // Build the summary table with per-run P50/P95 and weekly/monthly projected totals. + const tableRows = workflows.map(workflow => { + const p50PerRun = workflow?.p50_aic_per_run ?? 0; + const p95PerRun = workflow?.p95_aic_per_run ?? 0; + const weeklyP50 = workflow?.weekly_monte_carlo?.p50_projected_aic ?? workflow?.weekly_projected_aic ?? 0; + const monthlyP50 = workflow?.monthly_monte_carlo?.p50_projected_aic ?? workflow?.monthly_projected_aic ?? 0; + return [escapeCell(workflow.workflow_id), workflow.sampled_runs ?? 0, Number(p50PerRun), Number(p95PerRun), Number(weeklyP50), Number(monthlyP50)]; }); - const allProjectedZero = rows.length > 0 && rows.every(([, , p50]) => Number(p50) === 0); - const zeroProjectedWithSamples = rows.filter(([, sampledRuns, p50]) => Number(sampledRuns) > 0 && Number(p50) === 0).length; - const zeroWorkflowWord = zeroProjectedWithSamples === 1 ? "workflow" : "workflows"; - const zeroWorkflowVerb = zeroProjectedWithSamples === 1 ? "has" : "have"; - const reportTable = - rows.length > 0 - ? 
["| Workflow | Sampled runs | Forecast AIC (P50) |", "| --- | ---: | ---: |", ...rows.map(([workflowID, sampledRuns, p50]) => `| ${workflowID} | ${sampledRuns} | ${formatAIC(p50)} |`)].join("\n") - : "_No forecast rows were produced._"; + // Legacy fallback: derive weekly/monthly from the configured-period P50 when new fields are absent. + const hasNewFields = workflows.some(w => w?.p50_aic_per_run != null || w?.weekly_projected_aic != null); + const legacyRows = hasNewFields + ? null + : workflows.map(workflow => { + const p50 = workflow?.monte_carlo?.p50_projected_aic ?? workflow?.projected_aic ?? workflow?.monte_carlo?.p50_projected_effective_tokens ?? workflow?.projected_effective_tokens ?? 0; + return [escapeCell(workflow.workflow_id), workflow.sampled_runs ?? 0, Number(p50)]; + }); + + const allWeeklyZero = tableRows.length > 0 && tableRows.every(([, , , , weekly]) => Number(weekly) === 0); + const allMonthlyZero = tableRows.length > 0 && tableRows.every(([, , , , , monthly]) => Number(monthly) === 0); + const allProjectedZero = legacyRows ? legacyRows.length > 0 && legacyRows.every(([, , p50]) => Number(p50) === 0) : allWeeklyZero && allMonthlyZero; + + let reportTable; + if (legacyRows) { + reportTable = + legacyRows.length > 0 + ? 
["| Workflow | Sampled runs | Forecast AIC (P50) |", "| --- | ---: | ---: |", ...legacyRows.map(([workflowID, sampledRuns, p50]) => `| ${workflowID} | ${sampledRuns} | ${formatAIC(p50)} |`)].join("\n") + : "_No forecast rows were produced._"; + } else { + if (tableRows.length === 0) { + reportTable = "_No forecast rows were produced._"; + } else { + const totalWeekly = tableRows.reduce((s, [, , , , w]) => s + Number(w), 0); + const totalMonthly = tableRows.reduce((s, [, , , , , m]) => s + Number(m), 0); + const dataRows = tableRows.map(([workflowID, sampledRuns, p50Run, p95Run, weekly, monthly]) => + `| ${workflowID} | ${sampledRuns} | ${formatAIC(p50Run)} | ${formatAIC(p95Run)} | ${formatAIC(weekly)} | ${formatAIC(monthly)} |` + ); + if (tableRows.length > 1) { + dataRows.push(`| **TOTAL** | | | | **${formatAIC(totalWeekly)}** | **${formatAIC(totalMonthly)}** |`); + } + reportTable = ["| Workflow | Runs | P50/Run | P95/Run | Weekly (P50) | Monthly (P50) |", "| --- | ---: | ---: | ---: | ---: | ---: |", ...dataRows].join("\n"); + } + } + + // Build the detailed run samples section. + const samplesSection = buildRunSamplesSection(workflows); const repoSlug = `${options.owner}/${options.repo}`; const period = report?.period || "month"; @@ -65,15 +99,6 @@ function buildForecastIssueBody(report, options) { "", ].join("\n") : ""; - const zeroProjectedTip = - zeroProjectedWithSamples > 0 - ? [ - "> [!TIP]", - `> ${zeroProjectedWithSamples} ${zeroWorkflowWord} ${zeroWorkflowVerb} sampled runs but forecast AIC is 0. This usually indicates missing token usage in cached run summaries for sampled runs.`, - "> Increase the warm-up scope with `gh aw logs --start-date -30d --count ` if this persists.", - "", - ].join("\n") - : ""; const sourceRunLine = runURL ? `_Forecast source run: [#${runID}](${runURL})._` : ""; const errorSection = outcome === "success" ? 
"" : ["> [!WARNING]", `> Forecast outcome: ${outcome}.`, `> ${options.errorMessage || "Forecast computation did not complete successfully."}`].join("\n"); @@ -83,12 +108,42 @@ function buildForecastIssueBody(report, options) { period, report_table: reportTable, all_projected_zero_note: allProjectedZeroNote, - zero_projected_tip: zeroProjectedTip, + run_samples_section: samplesSection, error_section: errorSection, source_run_line: sourceRunLine, }).trim(); } +/** + * Builds a collapsed
block listing every sampled run used in the forecast. + * Returns an empty string when no workflow has run samples. + * @param {Array>} workflows + * @returns {string} + */ +function buildRunSamplesSection(workflows) { + const hasAny = workflows.some(w => Array.isArray(w?.run_samples) && w.run_samples.length > 0); + if (!hasAny) return ""; + + const lines = [ + "
", + "Sampled runs used in computation", + "", + "| Workflow | Run ID | Date | AIC |", + "| --- | ---: | --- | ---: |", + ]; + for (const wf of workflows) { + const samples = Array.isArray(wf?.run_samples) ? wf.run_samples : []; + for (const s of samples) { + const runID = s?.run_id ?? ""; + const date = s?.date ?? ""; + const aic = formatAIC(s?.aic ?? 0); + lines.push(`| ${escapeCell(wf.workflow_id)} | #${runID} | ${date} | ${aic} |`); + } + } + lines.push("", "
", ""); + return lines.join("\n"); +} + /** * @returns {Promise} */ @@ -183,6 +238,7 @@ async function main() { module.exports = { main, buildForecastIssueBody, + buildRunSamplesSection, formatAIC, escapeCell, FORECAST_REPORT_PATH, diff --git a/actions/setup/js/create_forecast_issue.test.cjs b/actions/setup/js/create_forecast_issue.test.cjs index c5e82ad1e6e..dbf0e123cde 100644 --- a/actions/setup/js/create_forecast_issue.test.cjs +++ b/actions/setup/js/create_forecast_issue.test.cjs @@ -55,14 +55,18 @@ describe("create_forecast_issue", () => { { workflow_id: "wf|a", sampled_runs: 3, - monte_carlo: { - p50_projected_aic: 12345.6, - }, + p50_aic_per_run: 4000, + p95_aic_per_run: 8000, + weekly_monte_carlo: { p50_projected_aic: 12345.6 }, + monthly_monte_carlo: { p50_projected_aic: 52000 }, }, { workflow_id: "wf-b", sampled_runs: 5, - projected_aic: 0, + p50_aic_per_run: 0, + p95_aic_per_run: 0, + weekly_projected_aic: 0, + monthly_projected_aic: 0, }, ], }, @@ -75,10 +79,10 @@ describe("create_forecast_issue", () => { } ); - expect(body).toContain("| Workflow | Sampled runs | Forecast AIC (P50) |"); - expect(body).toContain("| wf\\|a | 3 | 12,346 |"); - expect(body).toContain("> 1 workflow has sampled runs but forecast AIC is 0. 
This usually indicates missing token usage in cached run summaries for sampled runs."); + expect(body).toContain("| Workflow | Runs | P50/Run | P95/Run | Weekly (P50) | Monthly (P50) |"); + expect(body).toContain("| wf\\|a | 3 | 4,000 | 8,000 | 12,346 | 52,000 |"); expect(body).toContain("_Forecast source run: [#123456](https://github.com/octo/repo/actions/runs/123456)._"); + expect(body).not.toContain("sampled runs but forecast AIC is 0"); }); it("adds all-projected-zero diagnostics when every projected AIC is zero", async () => { @@ -120,6 +124,75 @@ describe("create_forecast_issue", () => { expect(body).toContain("| wf-legacy | 2 | 9,999 |"); }); + it("renders run samples section in a collapsed details block", async () => { + const module = await import("./create_forecast_issue.cjs"); + const body = module.buildForecastIssueBody( + { + period: "month", + workflows: [ + { + workflow_id: "wf-c", + sampled_runs: 2, + p50_aic_per_run: 1000, + p95_aic_per_run: 2000, + weekly_projected_aic: 5000, + monthly_projected_aic: 20000, + run_samples: [ + { run_id: 111, date: "2026-01-10", aic: 900 }, + { run_id: 222, date: "2026-01-11", aic: 1100 }, + ], + }, + ], + }, + { + owner: "octo", + repo: "repo", + serverUrl: "https://github.com", + generatedAtISO: "2026-01-01T00:00:00.000Z", + } + ); + + expect(body).toContain("
"); + expect(body).toContain("Sampled runs used in computation"); + expect(body).toContain("| wf-c | #111 | 2026-01-10 | 900 |"); + expect(body).toContain("| wf-c | #222 | 2026-01-11 | 1,100 |"); + }); + + it("renders TOTAL row when multiple workflows are present", async () => { + const module = await import("./create_forecast_issue.cjs"); + const body = module.buildForecastIssueBody( + { + period: "month", + workflows: [ + { + workflow_id: "wf-1", + sampled_runs: 3, + p50_aic_per_run: 1000, + p95_aic_per_run: 2000, + weekly_projected_aic: 7000, + monthly_projected_aic: 30000, + }, + { + workflow_id: "wf-2", + sampled_runs: 2, + p50_aic_per_run: 500, + p95_aic_per_run: 1000, + weekly_projected_aic: 3000, + monthly_projected_aic: 12000, + }, + ], + }, + { + owner: "octo", + repo: "repo", + serverUrl: "https://github.com", + generatedAtISO: "2026-01-01T00:00:00.000Z", + } + ); + + expect(body).toContain("| **TOTAL** | | | | **10,000** | **42,000** |"); + }); + it("creates an error issue when report file is missing", async () => { mockFs.existsSync.mockReturnValue(false); diff --git a/actions/setup/md/forecast_issue.md b/actions/setup/md/forecast_issue.md index a5f782fa95c..5e2213906f0 100644 --- a/actions/setup/md/forecast_issue.md +++ b/actions/setup/md/forecast_issue.md @@ -7,6 +7,6 @@ Period: {period} {report_table} {all_projected_zero_note} -{zero_projected_tip} +{run_samples_section} {error_section} {source_run_line} diff --git a/pkg/cli/forecast.go b/pkg/cli/forecast.go index 56c8e751e04..5b72809d579 100644 --- a/pkg/cli/forecast.go +++ b/pkg/cli/forecast.go @@ -24,6 +24,7 @@ import ( "github.com/github/gh-aw/pkg/console" "github.com/github/gh-aw/pkg/constants" + "github.com/github/gh-aw/pkg/fileutil" "github.com/github/gh-aw/pkg/gitutil" "github.com/github/gh-aw/pkg/logger" "github.com/github/gh-aw/pkg/parser" @@ -47,9 +48,11 @@ var ( forecastFetchGitHubWorkflows = fetchGitHubWorkflows forecastListWorkflowRunsPaginated = listWorkflowRunsWithPagination 
forecastLoadCachedRunAIC = loadCachedRunAIC - forecastDownloadRunArtifacts = downloadRunArtifacts - forecastAnalyzeTokenUsage = analyzeTokenUsage - forecastRateLimitSleep = func(ctx context.Context, delay time.Duration) error { + // forecastDownloadRunArtifacts uses a forecast-specific implementation that downloads + // only the usage artifact and skips workflow run log downloads (not needed for AIC computation). + forecastDownloadRunArtifacts = forecastDownloadUsageArtifact + forecastAnalyzeTokenUsage = analyzeTokenUsage + forecastRateLimitSleep = func(ctx context.Context, delay time.Duration) error { timer := time.NewTimer(delay) defer timer.Stop() @@ -62,6 +65,19 @@ var ( } ) +// ForecastRunSample holds the data for a single workflow run used in the forecast computation. +// Included in ForecastWorkflowResult.RunSamples so callers and issue templates can list +// the individual runs and their raw AI Credit values for human review. +type ForecastRunSample struct { + // RunID is the GitHub Actions run ID. + RunID int64 `json:"run_id"` + // AIC is the AI Credit cost for this individual run. + AIC float64 `json:"aic"` + // Date is the ISO-8601 calendar date the run started (YYYY-MM-DD). + // Empty when the run's start timestamp is unavailable. + Date string `json:"date,omitempty"` +} + // ForecastWorkflowResult contains the projected metrics for a single workflow. type ForecastWorkflowResult struct { // WorkflowID is the short identifier of the workflow (basename without .md). @@ -83,14 +99,32 @@ type ForecastWorkflowResult struct { AvgAIC float64 `json:"avg_aic"` AvgDurationSeconds float64 `json:"avg_duration_seconds"` - // Projected totals for the period. + // P50AIC is the 50th-percentile (median) AIC of individual sampled runs. + P50AIC float64 `json:"p50_aic_per_run"` + // P95AIC is the 95th-percentile AIC of individual sampled runs + // (conservative / budget-bound per-run cost estimate). 
+ P95AIC float64 `json:"p95_aic_per_run"` + + // Projected totals for the configured period. ProjectedAIC float64 `json:"projected_aic"` // MonteCarlo contains the probability distribution of projected AIC totals - // derived from a Monte Carlo simulation (10 000 trials). + // for the configured period, derived from a Monte Carlo simulation (10 000 trials). // Nil when no completed runs were available. MonteCarlo *ForecastMonteCarloSummary `json:"monte_carlo,omitempty"` + // WeeklyProjectedAIC is the point-estimate projected total AIC over a 7-day window. + WeeklyProjectedAIC float64 `json:"weekly_projected_aic"` + // WeeklyMonteCarlo contains the Monte Carlo distribution for the 7-day projection. + // Nil when no completed runs were available. + WeeklyMonteCarlo *ForecastMonteCarloSummary `json:"weekly_monte_carlo,omitempty"` + + // MonthlyProjectedAIC is the point-estimate projected total AIC over a 30-day window. + MonthlyProjectedAIC float64 `json:"monthly_projected_aic"` + // MonthlyMonteCarlo contains the Monte Carlo distribution for the 30-day projection. + // Nil when no completed runs were available. + MonthlyMonteCarlo *ForecastMonteCarloSummary `json:"monthly_monte_carlo,omitempty"` + // Trigger information derived from frontmatter. ActiveTriggers []string `json:"active_triggers"` // ConcurrencyLimit is the workflow-level concurrency limit (0 = unlimited). @@ -103,6 +137,11 @@ type ForecastWorkflowResult struct { // Evaluation contains backtesting quality metrics when --eval is set. // Nil in normal forecast mode. Evaluation *ForecastEvaluation `json:"evaluation,omitempty"` + + // RunSamples holds the individual per-run data used in the forecast computation. + // Each entry records the run ID, raw AIC, and (when available) the run date. + // Populated for all runs where AIC data was obtainable; zero-AIC runs are included. 
+ RunSamples []ForecastRunSample `json:"run_samples,omitempty"` } // ForecastVariantResult contains projected metrics split by A/B experiment variant. @@ -239,6 +278,7 @@ func RunForecast(config ForecastConfig) error { if !config.Verbose { spinner.Stop() } + emitPartialForecastResults(results, config, now) return normalizeForecastRunError(err, config) } if !config.Verbose { @@ -255,6 +295,7 @@ func RunForecast(config ForecastConfig) error { if !config.Verbose { spinner.Stop() } + emitPartialForecastResults(results, config, now) return normalizeForecastRunError(err, config) } if !config.Verbose { @@ -527,14 +568,15 @@ func forecastWorkflow(ctx context.Context, workflowName, startDate string, confi return result, nil } - // Compute per-run averages. + // Compute per-run averages and collect individual run samples. var totalAIC float64 var totalDurSec float64 successCount := 0 aicObservations := make([]int, 0, len(completed)) + samples := make([]ForecastRunSample, 0, len(completed)) for _, r := range completed { - runAIC := forecastLoadCachedRunAIC(r.DatabaseID, config.Verbose) + runAIC := forecastLoadCachedRunAIC(ctx, r.DatabaseID, config.Verbose) totalAIC += runAIC totalDurSec += r.Duration.Seconds() // Monte Carlo currently samples integer observations; keep milli-AIC precision @@ -543,24 +585,50 @@ func forecastWorkflow(ctx context.Context, workflowName, startDate string, confi if r.Conclusion == "success" { successCount++ } + sample := ForecastRunSample{RunID: r.DatabaseID, AIC: roundForecastAIC(runAIC)} + if !r.StartedAt.IsZero() { + sample.Date = r.StartedAt.Format("2006-01-02") + } + samples = append(samples, sample) } + result.RunSamples = samples n := len(completed) result.AvgAIC = roundForecastAIC(totalAIC / float64(n)) result.AvgDurationSeconds = totalDurSec / float64(n) result.SuccessRate = float64(successCount) / float64(n) + // Compute P50 and P95 of individual run AIC (per-run percentiles, not period totals). 
+ sortedAIC := make([]int, len(aicObservations)) + copy(sortedAIC, aicObservations) + sort.Ints(sortedAIC) + result.P50AIC = roundForecastAIC(float64(percentileInt(sortedAIC, 50)) / 1000) + result.P95AIC = roundForecastAIC(float64(percentileInt(sortedAIC, 95)) / 1000) + // Compute observed run frequency: runs per calendar day over the history window, // scaled to the projection period. - result.ObservedRunsPerPeriod = float64(n) / float64(config.Days) * float64(periodDays) + observedRunsPerDay := float64(n) / float64(config.Days) + result.ObservedRunsPerPeriod = observedRunsPerDay * float64(periodDays) + + // Point estimates for weekly (7-day) and monthly (30-day) projections. + weeklyRuns := observedRunsPerDay * 7 + monthlyRuns := observedRunsPerDay * 30 + result.WeeklyProjectedAIC = roundForecastAIC(weeklyRuns * result.AvgAIC) + result.MonthlyProjectedAIC = roundForecastAIC(monthlyRuns * result.AvgAIC) - // Projected token usage (point estimate using simple means). + // Projected token usage (point estimate using simple means) for the configured period. result.ProjectedAIC = roundForecastAIC(result.ObservedRunsPerPeriod * result.AvgAIC) // Monte Carlo simulation: model run-count (Poisson), per-run token usage // (bootstrap), and per-run success (Bernoulli) to produce P10/P50/P90 ranges. - rng := rand.New(rand.NewSource(time.Now().UnixNano())) //nolint:gosec // non-cryptographic simulation RNG + // Two independent RNGs ensure the weekly and monthly simulations are uncorrelated. 
+ seed := time.Now().UnixNano() + rng := rand.New(rand.NewSource(seed)) //nolint:gosec // non-cryptographic simulation RNG + rng2 := rand.New(rand.NewSource(seed + 1)) //nolint:gosec + rng3 := rand.New(rand.NewSource(seed + 2)) //nolint:gosec result.MonteCarlo = runMonteCarlo(aicObservations, successCount, result.ObservedRunsPerPeriod, rng) + result.WeeklyMonteCarlo = runMonteCarlo(aicObservations, successCount, weeklyRuns, rng2) + result.MonthlyMonteCarlo = runMonteCarlo(aicObservations, successCount, monthlyRuns, rng3) // Populate experiment variant fractions from run history when metadata has variants. result.ExperimentVariants = computeVariantFractions(result.ExperimentVariants, completed) @@ -749,15 +817,28 @@ func extractWorkflowIDFromName(name string) string { // // Cache location: /run-/run_summary.json // (defaultLogsOutputDir is ".github/aw/logs" — defined in logs_models.go) -func loadCachedRunAIC(runID int64, verbose bool) float64 { +func loadCachedRunAIC(ctx context.Context, runID int64, verbose bool) float64 { dir := filepath.Join(defaultLogsOutputDir, fmt.Sprintf("run-%d", runID)) summary, ok := loadRunSummary(dir, verbose) if ok && summary != nil && summary.TokenUsage != nil && summary.TokenUsage.TotalAIC > 0 { + forecastRunLog.Printf("AIC cache hit for run %d: aic=%.3f (from run_summary.json)", runID, summary.TokenUsage.TotalAIC) return summary.TokenUsage.TotalAIC } - if err := forecastDownloadRunArtifacts(context.Background(), runID, dir, verbose, "", "", "", []string{"usage"}); err != nil { - if !errors.Is(err, ErrNoArtifacts) { + forecastRunLog.Printf("AIC cache miss for run %d; downloading usage artifact to %s", runID, dir) + if verbose { + fmt.Fprintln(os.Stderr, console.FormatVerboseMessage(fmt.Sprintf("Downloading usage artifact for run %d…", runID))) + } + + if err := forecastDownloadRunArtifacts(ctx, runID, dir, verbose, "", "", "", []string{"usage"}); err != nil { + if errors.Is(err, ErrNoArtifacts) { + forecastRunLog.Printf("No usage 
artifact for run %d; AIC will be 0", runID) + } else if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + forecastRunLog.Printf("Usage artifact download for run %d interrupted: %v", runID, err) + if verbose { + fmt.Fprintln(os.Stderr, console.FormatVerboseMessage(fmt.Sprintf("Usage artifact download for run %d interrupted: %v", runID, err))) + } + } else { forecastRunLog.Printf("Failed to download usage artifact for run %d: %v", runID, err) if verbose { fmt.Fprintln(os.Stderr, console.FormatVerboseMessage(fmt.Sprintf("Failed to download usage artifact for run %d: %v", runID, err))) @@ -768,11 +849,130 @@ func loadCachedRunAIC(runID int64, verbose bool) float64 { tokenUsage, err := forecastAnalyzeTokenUsage(dir, verbose) if err != nil || tokenUsage == nil || tokenUsage.TotalAIC <= 0 { + forecastRunLog.Printf("No AIC data in usage artifact for run %d (err=%v, tokenUsage=%v)", runID, err, tokenUsage) return 0 } + forecastRunLog.Printf("AIC from usage artifact for run %d: aic=%.3f", runID, tokenUsage.TotalAIC) return tokenUsage.TotalAIC } +// forecastDownloadUsageArtifact is a forecast-specific replacement for +// downloadRunArtifacts. Unlike the general-purpose downloader, it: +// - Downloads only artifacts matching artifactFilter (typically ["usage"]). +// - Skips workflow run log downloads entirely — logs are not needed for +// AIC computation and downloading them wastes time when forecasting +// many runs. +// - Returns ErrNoArtifacts immediately when no matching artifact is found +// rather than falling back to log diagnostics. +// +// It is referenced by forecastDownloadRunArtifacts so that tests can substitute +// a mock implementation without modifying the general artifact download path. 
+func forecastDownloadUsageArtifact(ctx context.Context, runID int64, outputDir string, verbose bool, owner, repo, hostname string, artifactFilter []string) error {
+	forecastRunLog.Printf("Downloading usage artifact: run_id=%d, output_dir=%s, filter=%v", runID, outputDir, artifactFilter)
+	shouldLogProgress := IsRunningInCI() || verbose
+
+	// removeIfEmpty is a best-effort cleanup shared by every failure exit below:
+	// a directory that ended up with no content is removed again, while a
+	// directory that already holds files is left alone.
+	removeIfEmpty := func() {
+		if !fileutil.IsDirEmpty(outputDir) {
+			return
+		}
+		if err := os.RemoveAll(outputDir); err != nil {
+			forecastRunLog.Printf("Failed to remove empty output directory %s for run %d: %v", outputDir, runID, err)
+		}
+	}
+
+	// Check if the requested artifacts are already on disk (cache hit from actions/cache restore).
+	if fileutil.DirExists(outputDir) && !fileutil.IsDirEmpty(outputDir) {
+		missing := findMissingFilterEntries(artifactFilter, outputDir)
+		if len(missing) == 0 {
+			forecastRunLog.Printf("Usage artifact already on disk for run %d, skipping download", runID)
+			if shouldLogProgress {
+				fmt.Fprintln(os.Stderr, console.FormatInfoMessage(
+					fmt.Sprintf("Usage artifact already present for run %d, skipping download", runID)))
+			}
+			return nil
+		}
+		// Only the entries that are not on disk yet need to be fetched.
+		forecastRunLog.Printf("Usage artifact partially missing for run %d: %v; downloading missing entries", runID, missing)
+		artifactFilter = missing
+	}
+
+	if err := os.MkdirAll(outputDir, constants.DirPermPublic); err != nil {
+		return fmt.Errorf("failed to create output directory for run %d: %w", runID, err)
+	}
+
+	// List available artifacts for the run to find which match the filter.
+	artifactNames, listErr := listRunArtifactNames(ctx, runID, owner, repo, hostname, verbose)
+	if listErr != nil {
+		forecastRunLog.Printf("Failed to list artifacts for run %d: %v", runID, listErr)
+		removeIfEmpty()
+		return fmt.Errorf("failed to list artifacts for run %d: %w", runID, listErr)
+	}
+
+	// Keep only the artifacts that match the filter and are not Docker build artifacts.
+	var downloadableNames []string
+	for _, name := range artifactNames {
+		if !isDockerBuildArtifact(name) && artifactMatchesFilter(name, artifactFilter) {
+			downloadableNames = append(downloadableNames, name)
+		}
+	}
+
+	forecastRunLog.Printf("Run %d: found %d downloadable artifact(s) matching filter %v: %v", runID, len(downloadableNames), artifactFilter, downloadableNames)
+
+	if len(downloadableNames) == 0 {
+		// No usage artifact — clean up empty directory and report.
+		removeIfEmpty()
+		return ErrNoArtifacts
+	}
+
+	if shouldLogProgress {
+		fmt.Fprintln(os.Stderr, console.FormatInfoMessage(
+			fmt.Sprintf("Downloading usage artifact(s) for run %d: %v", runID, downloadableNames)))
+	}
+
+	if err := downloadArtifactsByName(ctx, runID, outputDir, downloadableNames, verbose, owner, repo, hostname); err != nil {
+		// Same cleanup as the other failure exits: do not leave an empty
+		// run directory behind when the download itself fails.
+		removeIfEmpty()
+		return fmt.Errorf("failed to download usage artifact for run %d: %w", runID, err)
+	}
+
+	// The download reported success but wrote nothing: treat it as "no artifacts".
+	if fileutil.IsDirEmpty(outputDir) {
+		removeIfEmpty()
+		return ErrNoArtifacts
+	}
+
+	forecastRunLog.Printf("Downloaded usage artifact for run %d to %s", runID, outputDir)
+	return nil
+}
+
+// emitPartialForecastResults outputs whatever workflow results have been collected so
+// far when the forecast computation is interrupted (timeout or user cancellation).
+// Partial results are only meaningful when at least one workflow has been fully
+// processed; the function is a no-op when results is empty so callers do not need to
+// guard against it.
+func emitPartialForecastResults(results []ForecastWorkflowResult, config ForecastConfig, now time.Time) { + if len(results) == 0 { + return + } + forecastRunLog.Printf("Emitting %d partial forecast result(s) before early exit", len(results)) + fmt.Fprintln(os.Stderr, console.FormatWarningMessage( + fmt.Sprintf("Forecast interrupted; emitting partial results for %d workflow(s) processed so far.", len(results)))) + + // Sort partial results by Monte Carlo P50 descending (mirrors the full-results sort). + sort.Slice(results, func(i, j int) bool { + pi := results[i].ProjectedAIC + if mc := results[i].MonteCarlo; mc != nil { + pi = mc.P50ProjectedAIC + } + pj := results[j].ProjectedAIC + if mc := results[j].MonteCarlo; mc != nil { + pj = mc.P50ProjectedAIC + } + return pi > pj + }) + + output := ForecastResult{ + Period: config.Period, + AsOf: now.UTC().Format(time.RFC3339), + EvalMode: config.EvalMode, + Workflows: results, + } + if config.JSONOutput { + _ = renderForecastJSON(output) + } else { + _ = renderForecastTable(output, config) + } +} + func isCompletedNonSkippedRun(r WorkflowRun) bool { return r.Status == "completed" && r.Conclusion != "skipped" } @@ -838,7 +1038,7 @@ func evaluateForecast(ctx context.Context, workflowName string, forecast Forecas continue } eval.ActualRuns++ - eval.ActualAIC += forecastLoadCachedRunAIC(r.DatabaseID, config.Verbose) + eval.ActualAIC += forecastLoadCachedRunAIC(ctx, r.DatabaseID, config.Verbose) } // Compute error metrics against P50 (falls back to point estimate). @@ -874,58 +1074,71 @@ func renderForecastJSON(output ForecastResult) error { // forecastTableRow is a flattened struct used for console table rendering. 
type forecastTableRow struct { - Workflow string `json:"workflow" console:"header:Workflow"` - Runs int `json:"runs" console:"header:Sampled Runs"` - SuccessRate string `json:"success_rate" console:"header:Success Rate"` - AvgAIC string `json:"avg_aic" console:"header:Avg AIC"` - ProjectedAIC string `json:"projected_aic" console:"header:Proj. AIC (P50)"` - AICRange string `json:"aic_range" console:"header:80% CI (P10–P90)"` - Triggers string `json:"triggers" console:"header:Triggers"` + Workflow string `json:"workflow" console:"header:Workflow"` + Runs int `json:"runs" console:"header:Runs"` + P50PerRun string `json:"p50_per_run" console:"header:P50/Run"` + P95PerRun string `json:"p95_per_run" console:"header:P95/Run"` + WeeklyP50 string `json:"weekly_p50" console:"header:Weekly (P50)"` + MonthlyP50 string `json:"monthly_p50" console:"header:Monthly (P50)"` + SuccessRate string `json:"success_rate" console:"header:Success Rate"` + Triggers string `json:"triggers" console:"header:Triggers"` } // renderForecastTable renders the forecast result as a human-readable table. func renderForecastTable(output ForecastResult, config ForecastConfig) error { - periodLabel := strings.ToUpper(output.Period[:1]) + output.Period[1:] fmt.Fprintln(os.Stderr, console.FormatInfoMessage( - fmt.Sprintf("Workflow Forecast — per %s (based on last %d days of history)", periodLabel, config.Days))) + fmt.Sprintf("Workflow Forecast — weekly & monthly projections (based on last %d days of history)", config.Days))) fmt.Fprintln(os.Stderr, "") anyUnreliable := false - rows := make([]forecastTableRow, 0, len(output.Workflows)) + var totalWeeklyP50, totalMonthlyP50 float64 + rows := make([]forecastTableRow, 0, len(output.Workflows)+1) for _, wf := range output.Workflows { - // Use Monte Carlo P50 as the primary AIC estimate when available. 
- projAICStr := formatForecastAIC(wf.ProjectedAIC) - aicRangeStr := "-" unreliableMark := "" - if mc := wf.MonteCarlo; mc != nil { - projAICStr = formatForecastAIC(mc.P50ProjectedAIC) - if mc.P10ProjectedAIC == 0 && mc.P90ProjectedAIC == 0 { - aicRangeStr = "-" - } else { - aicRangeStr = fmt.Sprintf("%s–%s", - formatForecastAIC(mc.P10ProjectedAIC), - formatForecastAIC(mc.P90ProjectedAIC)) - } + + weeklyP50 := wf.WeeklyProjectedAIC + if mc := wf.WeeklyMonteCarlo; mc != nil { + weeklyP50 = mc.P50ProjectedAIC if !mc.IsReliable { anyUnreliable = true unreliableMark = "*" } } + monthlyP50 := wf.MonthlyProjectedAIC + if mc := wf.MonthlyMonteCarlo; mc != nil { + monthlyP50 = mc.P50ProjectedAIC + } + totalWeeklyP50 += weeklyP50 + totalMonthlyP50 += monthlyP50 + row := forecastTableRow{ - Workflow: wf.WorkflowID + unreliableMark, - Runs: wf.SampledRuns, - SuccessRate: formatForecastPercent(wf.SuccessRate, wf.SampledRuns > 0), - AvgAIC: formatForecastAIC(wf.AvgAIC), - ProjectedAIC: projAICStr, - AICRange: aicRangeStr, - Triggers: formatTriggerList(wf.ActiveTriggers), + Workflow: wf.WorkflowID + unreliableMark, + Runs: wf.SampledRuns, + P50PerRun: formatForecastAIC(wf.P50AIC), + P95PerRun: formatForecastAIC(wf.P95AIC), + WeeklyP50: formatForecastAIC(weeklyP50), + MonthlyP50: formatForecastAIC(monthlyP50), + SuccessRate: formatForecastPercent(wf.SuccessRate, wf.SampledRuns > 0), + Triggers: formatTriggerList(wf.ActiveTriggers), } rows = append(rows, row) } + // Append a totals row when more than one workflow is present. + if len(output.Workflows) > 1 { + rows = append(rows, forecastTableRow{ + Workflow: "TOTAL", + WeeklyP50: formatForecastAIC(totalWeeklyP50), + MonthlyP50: formatForecastAIC(totalMonthlyP50), + }) + } + fmt.Fprint(os.Stderr, console.RenderStruct(rows)) fmt.Fprintln(os.Stderr, "") + // Show detailed per-run samples section. + printRunSamplesSection(output.Workflows) + // Show experiment variant details when present. 
for _, wf := range output.Workflows { if len(wf.ExperimentVariants) > 0 { @@ -939,16 +1152,55 @@ func renderForecastTable(output ForecastResult, config ForecastConfig) error { } fmt.Fprintln(os.Stderr, console.FormatInfoMessage( - fmt.Sprintf("P50 = median; 80%% CI = P10–P90 from %d-trial Monte Carlo simulation (Gamma–Poisson model accounts for rate estimation uncertainty).", monteCarloIterations))) + fmt.Sprintf("P50/Run = per-run median AIC; P95/Run = 95th-percentile per-run AIC; Weekly/Monthly = projected P50 from %d-trial Monte Carlo simulation.", monteCarloIterations))) if anyUnreliable { fmt.Fprintln(os.Stderr, console.FormatWarningMessage( fmt.Sprintf("* Fewer than %d sampled runs — confidence intervals may be unreliable.", minObservationsForReliableForecast))) } fmt.Fprintln(os.Stderr, console.FormatInfoMessage( - fmt.Sprintf("Run '%s forecast --json' for full output.", string(constants.CLIExtensionPrefix)))) + fmt.Sprintf("Run '%s forecast --json' for full Monte Carlo output including P10/P90 confidence intervals.", string(constants.CLIExtensionPrefix)))) return nil } +// printRunSamplesSection prints a detailed table of the sampled runs used in the forecast, +// including the run ID, date, and raw AIC for each run. Workflows with no samples are skipped. 
+func printRunSamplesSection(workflows []ForecastWorkflowResult) { + type runRow struct { + RunID string `json:"run_id" console:"header:Run ID"` + Date string `json:"date" console:"header:Date"` + AIC string `json:"aic" console:"header:AIC"` + } + + hasSamples := false + for _, wf := range workflows { + if len(wf.RunSamples) > 0 { + hasSamples = true + break + } + } + if !hasSamples { + return + } + + fmt.Fprintln(os.Stderr, console.FormatInfoMessage("Sampled runs used in computation:")) + for _, wf := range workflows { + if len(wf.RunSamples) == 0 { + continue + } + fmt.Fprintf(os.Stderr, " %s (%d run(s)):\n", wf.WorkflowID, len(wf.RunSamples)) + rows := make([]runRow, 0, len(wf.RunSamples)) + for _, s := range wf.RunSamples { + rows = append(rows, runRow{ + RunID: fmt.Sprintf("#%d", s.RunID), + Date: s.Date, + AIC: formatForecastAIC(s.AIC), + }) + } + fmt.Fprint(os.Stderr, console.RenderStruct(rows)) + fmt.Fprintln(os.Stderr, "") + } +} + // printEvalBreakdown renders the backtesting comparison table. 
func printEvalBreakdown(workflows []ForecastWorkflowResult) { type evalRow struct { diff --git a/pkg/cli/forecast_test.go b/pkg/cli/forecast_test.go index 19019ac04bc..9bb6462ad7a 100644 --- a/pkg/cli/forecast_test.go +++ b/pkg/cli/forecast_test.go @@ -256,7 +256,7 @@ func TestForecastWorkflow_LambdaConsistencyAcrossOutputFormats(t *testing.T) { 4: 4.6, 5: 4.1, } - forecastLoadCachedRunAIC = func(runID int64, _ bool) float64 { + forecastLoadCachedRunAIC = func(_ context.Context, runID int64, _ bool) float64 { return runAIC[runID] } forecastListWorkflowRunsPaginated = func(_ ListWorkflowRunsOptions) ([]WorkflowRun, int, error) { @@ -324,7 +324,7 @@ func TestForecastWorkflow_IgnoresSkippedRuns(t *testing.T) { 12: 1.0, 13: 2.0, } - forecastLoadCachedRunAIC = func(runID int64, _ bool) float64 { + forecastLoadCachedRunAIC = func(_ context.Context, runID int64, _ bool) float64 { return runAIC[runID] } @@ -356,7 +356,7 @@ func TestForecastWorkflow_RequestsSuccessfulRuns(t *testing.T) { } return runs, len(runs), nil } - forecastLoadCachedRunAIC = func(runID int64, _ bool) float64 { + forecastLoadCachedRunAIC = func(_ context.Context, runID int64, _ bool) float64 { if runID == 12 { return 1.0 } @@ -421,7 +421,7 @@ func TestLoadCachedRunAIC_UsageArtifactFirst(t *testing.T) { return &TokenUsageSummary{TotalAIC: 12.34}, nil } - aic := loadCachedRunAIC(999_000_001, false) + aic := loadCachedRunAIC(context.Background(), 999_000_001, false) require.InDelta(t, 12.34, aic, 1e-9) require.Equal(t, []string{"usage"}, downloaded) } @@ -446,7 +446,7 @@ func TestLoadCachedRunAIC_DoesNotFallbackToLegacyAgentArtifacts(t *testing.T) { return &TokenUsageSummary{}, nil } - aic := loadCachedRunAIC(999_000_002, false) + aic := loadCachedRunAIC(context.Background(), 999_000_002, false) require.Zero(t, aic) require.Equal(t, []string{"usage"}, downloaded) }