diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 26dc9471a..ac89d779b 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -211,6 +211,7 @@ export async function writePerTestArtifacts( experiment?: string; runId?: string; duplicatePolicy?: ExportDuplicatePolicy; + resultGroup?: string; cwd?: string; repoRoot?: string; sourceTests?: readonly EvalTest[]; @@ -219,6 +220,7 @@ export async function writePerTestArtifacts( ): Promise { await writeCorePerTestArtifacts(results, outputDir, { experiment: options?.experiment, + resultGroup: options?.resultGroup, runId: options?.runId, duplicatePolicy: options?.duplicatePolicy, sourceTests: options?.sourceTests, @@ -236,6 +238,7 @@ export async function writeArtifactsFromResults( plannedTestCount?: number; runId?: string; duplicatePolicy?: ExportDuplicatePolicy; + resultGroup?: string; cwd?: string; repoRoot?: string; sourceTests?: readonly EvalTest[]; @@ -253,6 +256,7 @@ export async function writeArtifactsFromResults( plannedTestCount: options?.plannedTestCount, runId: options?.runId, duplicatePolicy: options?.duplicatePolicy, + resultGroup: options?.resultGroup, sourceTests: options?.sourceTests, additionalArtifacts: createTaskBundleArtifactsWriter(options), }); diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index ccb767b71..182a3a3ed 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1,4 +1,3 @@ -import { spawn } from 'node:child_process'; import { constants, existsSync, mkdirSync } from 'node:fs'; import { access, readFile } from 'node:fs/promises'; import { createRequire as createNodeRequire } from 'node:module'; @@ -7,6 +6,7 @@ import { pathToFileURL } from 'node:url'; import { DEFAULT_THRESHOLD, + type EvalRunOverride, type EvalTargetRef, type EvalTest, type EvaluationCache, @@ -14,7 +14,6 @@ import { 
type ExecutionDefaults, type ExperimentArtifactMetadata, type ExperimentConfig, - type ExperimentScript, type FailOnError, type OtelTraceExporter as OtelTraceExporterType, type ResolvedTarget, @@ -25,14 +24,10 @@ import { buildTraceFromMessages, runEvaluation as defaultRunEvaluation, deriveCategory, - deriveExperimentNameFromPath, ensureVSCodeSubagents, - isExperimentFileReference, loadConfig, - loadExperimentConfig, loadTestSuite, loadTsConfig, - resolveDefaultExperimentReference, resolveTargetDefinition, shouldEnableCache, shouldSkipCacheForTemperature, @@ -124,6 +119,7 @@ interface NormalizedOptions { readonly dryRunDelayMin: number; readonly dryRunDelayMax: number; readonly agentTimeoutSeconds?: number; + readonly cliAgentTimeoutSeconds?: number; readonly maxRetries: number; readonly cache: boolean; readonly cachePath?: string; @@ -150,6 +146,7 @@ interface NormalizedOptions { readonly model?: string; readonly outputMessages: number | 'all'; readonly threshold?: number; + readonly cliThreshold?: number; readonly tags: readonly string[]; readonly excludeTags: readonly string[]; readonly transcript?: string; @@ -160,8 +157,8 @@ interface NormalizedOptions { readonly experimentMetadata?: ExperimentArtifactMetadata; readonly experimentTargetRefs?: readonly EvalTargetRef[]; readonly experimentTrialsConfig?: TrialsConfig; - readonly suiteFiltersByEvalFile?: ReadonlyMap; readonly budgetUsd?: number; + readonly cliBudgetUsd?: number; readonly sourceMetadataByEvalFile?: ReadonlyMap>; readonly resultsOverrides?: ResultsPublishOverrides; } @@ -422,6 +419,8 @@ function normalizeOptions( } const cliAgentTimeout = normalizeOptionalNumber(rawOptions.agentTimeout); + const cliThreshold = normalizeOptionalNumber(rawOptions.threshold); + const cliBudgetUsd = normalizeOptionalNumber(rawOptions.budgetUsd); const configAgentTimeoutSeconds = config?.execution?.agentTimeoutMs != null ? 
config.execution.agentTimeoutMs / 1000 : undefined; @@ -479,6 +478,7 @@ function normalizeOptions( dryRunDelayMin: normalizeNumber(rawOptions.dryRunDelayMin, 0), dryRunDelayMax: normalizeNumber(rawOptions.dryRunDelayMax, 0), agentTimeoutSeconds: cliAgentTimeout ?? configAgentTimeoutSeconds, + cliAgentTimeoutSeconds: cliAgentTimeout, maxRetries: cliMaxRetries ?? configMaxRetries ?? 2, cache: cliCache, cachePath: cliCachePath, @@ -523,14 +523,16 @@ function normalizeOptions( graderTarget: normalizeString(rawOptions.graderTarget), model: normalizeString(rawOptions.model), outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)), - threshold: normalizeOptionalNumber(rawOptions.threshold), + threshold: cliThreshold, + cliThreshold, tags: normalizeStringArray(rawOptions.tag), excludeTags: normalizeStringArray(rawOptions.excludeTag), transcript: normalizeString(rawOptions.transcript), recordReplay: normalizeString(rawOptions.recordReplay), recordReplayVariant: normalizeString(rawOptions.recordReplayVariant), experiment: normalizeString(rawOptions.experiment), - budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd), + budgetUsd: cliBudgetUsd, + cliBudgetUsd, sourceMetadataByEvalFile: normalizeSourceMetadataByEvalFile( rawOptions.sourceMetadataByEvalFile, ), @@ -566,69 +568,33 @@ async function ensureFileExists(filePath: string, description: string): Promise< function buildDefaultOutputPathForExperiment( cwd: string, - experiment: string | undefined, + resultGroup: string | undefined, runDirName: string, ): string { - const runDir = buildDefaultRunDirFromName(cwd, experiment, runDirName); + const runDir = buildDefaultRunDirFromName(cwd, resultGroup, runDirName); mkdirSync(runDir, { recursive: true }); return path.join(runDir, 'index.jsonl'); } -function normalizeTsDefaultExperiment( - config: Awaited> | null, -): string | undefined { +function deriveEvalResultGroupName(evalFilePath: string | undefined): string { + if (!evalFilePath) { + return 
'eval'; + } return ( - normalizeString(config?.experiments?.default) ?? normalizeString(config?.defaultExperiment) + path + .basename(evalFilePath) + .replace(/\.eval\.ya?ml$/i, '') + .replace(/\.ya?ml$/i, '') + .replace(/[^A-Za-z0-9._-]/g, '-') || 'eval' ); } type ResolvedExperimentForRun = { readonly name?: string; - readonly config?: ExperimentConfig; }; -async function resolveExperimentForRun(params: { - readonly cwd: string; - readonly explicitExperiment?: string; - readonly yamlDefaultExperiment?: string; - readonly tsDefaultExperiment?: string; -}): Promise { - const experimentRef = - params.explicitExperiment ?? params.yamlDefaultExperiment ?? params.tsDefaultExperiment; - if (!experimentRef) { - return {}; - } - - const experimentPath = resolveExperimentFilePath(params.cwd, experimentRef); - if (!experimentPath) { - if (isExperimentFileReference(experimentRef)) { - throw new Error(`Experiment file not found: ${experimentRef}`); - } - return { name: experimentRef }; - } - - const config = await loadExperimentConfig(experimentPath); - return { - name: config.name ?? deriveExperimentNameFromPath(experimentPath), - config, - }; -} - -function resolveExperimentFilePath(cwd: string, experimentRef: string): string | undefined { - if (isExperimentFileReference(experimentRef)) { - const experimentPath = path.isAbsolute(experimentRef) - ? experimentRef - : path.resolve(cwd, experimentRef); - return existsSync(experimentPath) ? experimentPath : undefined; - } - - for (const ext of ['yaml', 'yml', 'ts', 'js', 'mts', 'mjs']) { - const candidate = path.resolve(cwd, 'experiments', `${experimentRef}.${ext}`); - if (existsSync(candidate)) { - return candidate; - } - } - return undefined; +function resolveExperimentForRun(explicitExperiment?: string): ResolvedExperimentForRun { + return explicitExperiment ? { name: explicitExperiment } : {}; } function applyExperimentOptions( @@ -664,6 +630,7 @@ function applyExperimentOptions( workspaceMode: workspacePath ? 
'static' : workspaceMode, workspacePath, budgetUsd: options.budgetUsd ?? experiment.budgetUsd, + threshold: options.threshold ?? experiment.threshold, experimentConfig: experiment, experimentMetadata: buildExperimentArtifactMetadata(experiment), experimentTargetRefs: options.cliTargets.length === 0 ? experimentTargetRefs : undefined, @@ -715,212 +682,104 @@ function buildExperimentTrialsConfig(experiment: ExperimentConfig): TrialsConfig }; } -function readExperimentWorkspaceMode(value: unknown): 'pooled' | 'temp' | 'static' | undefined { - return value === 'pooled' || value === 'temp' || value === 'static' ? value : undefined; -} - -function readExperimentWorkspacePath( - workspace: Record | undefined, -): string | undefined { - const value = workspace?.path; - return typeof value === 'string' && value.trim().length > 0 ? value.trim() : undefined; -} - -type ExperimentSuiteSelection = { - readonly testFiles: readonly string[]; - readonly filtersByEvalFile: ReadonlyMap; +type EffectiveRunPolicy = { + readonly trialsConfig?: TrialsConfig; + readonly threshold?: number; + readonly timeoutSeconds?: number; + readonly budgetUsd?: number; + readonly hasScopedOverride: boolean; }; -function matchesTestFilter(id: string, filter: string | readonly string[]): boolean { - return typeof filter === 'string' - ? 
micromatch.isMatch(id, filter) - : filter.some((pattern) => micromatch.isMatch(id, pattern)); -} - -async function resolveExperimentSuiteSelection( - suites: ExperimentConfig['suites'] | undefined, - cwd: string, -): Promise { - if (!suites || suites.length === 0) { +function buildRunOverrideTrialsConfig(run: EvalRunOverride | undefined): TrialsConfig | undefined { + const repeat = run?.repeat; + if (!repeat || repeat.count <= 1) { return undefined; } - - const testFiles = new Set(); - const selectedTestIdsByEvalFile = new Map(); - - for (const suite of suites) { - const resolvedSuiteFiles = await resolveEvalPaths([suite.ref], cwd); - for (const testFilePath of resolvedSuiteFiles) { - const resolvedPath = path.resolve(testFilePath); - testFiles.add(resolvedPath); - if (suite.select?.testIds && suite.select.testIds.length > 0) { - const existing = selectedTestIdsByEvalFile.get(resolvedPath) ?? []; - selectedTestIdsByEvalFile.set(resolvedPath, [...existing, ...suite.select.testIds]); - } - } - } - - const filtersByEvalFile = new Map(); - for (const [testFilePath, testIds] of selectedTestIdsByEvalFile.entries()) { - const uniqueTestIds = [...new Set(testIds)]; - filtersByEvalFile.set( - testFilePath, - uniqueTestIds.length === 1 ? uniqueTestIds[0] : uniqueTestIds, - ); - } - return { - testFiles: [...testFiles], - filtersByEvalFile, + count: repeat.count, + strategy: repeat.strategy, + ...(repeat.costLimitUsd !== undefined && { costLimitUsd: repeat.costLimitUsd }), + ...(repeat.earlyExit !== undefined && { earlyExit: repeat.earlyExit }), }; } -async function runExperimentSteps(params: { - readonly label: 'setup' | 'script'; - readonly steps: readonly ExperimentScript[] | undefined; - readonly cwd: string; - readonly experimentConfig?: ExperimentConfig; -}): Promise { - const steps = params.steps ?? 
[]; - if (steps.length === 0) { - return; - } - - for (let index = 0; index < steps.length; index++) { - const step = steps[index]; - const command = buildExperimentStepCommand(step); - const cwd = resolveExperimentStepCwd(params.cwd, params.experimentConfig, step.cwd); - console.log(`Experiment ${params.label} ${index + 1}/${steps.length}: ${command.display}`); - await runExperimentCommand(command.argv, { - cwd, - env: step.env, - timeoutMs: step.timeoutSeconds ? step.timeoutSeconds * 1000 : undefined, - label: `experiment ${params.label}`, - }); - } +function resolveEffectiveRunPolicy(params: { + readonly test: EvalTest; + readonly options: NormalizedOptions; + readonly defaultTrialsConfig?: TrialsConfig; + readonly defaultThreshold?: number; + readonly defaultTimeoutSeconds?: number; + readonly defaultBudgetUsd?: number; +}): EffectiveRunPolicy { + const { test, options, defaultTrialsConfig, defaultThreshold, defaultTimeoutSeconds } = params; + const run = test.run; + const threshold = options.cliThreshold ?? run?.threshold ?? test.threshold ?? defaultThreshold; + const timeoutSeconds = + options.cliAgentTimeoutSeconds ?? run?.timeoutSeconds ?? defaultTimeoutSeconds; + const budgetUsd = run?.budgetUsd ?? params.defaultBudgetUsd; + const trialsConfig = buildRunOverrideTrialsConfig(run) ?? 
defaultTrialsConfig; + return { + ...(trialsConfig !== undefined && { trialsConfig }), + ...(threshold !== undefined && { threshold }), + ...(timeoutSeconds !== undefined && { timeoutSeconds }), + ...(budgetUsd !== undefined && { budgetUsd }), + hasScopedOverride: run !== undefined || test.threshold !== undefined, + }; } -async function runExperimentSetup(params: { - readonly config: ExperimentConfig | undefined; - readonly cwd: string; - readonly runDir: string; -}): Promise { - const setup = params.config?.setup; - if (typeof setup === 'function') { - console.log('Experiment setup: running TypeScript setup()'); - await setup({ - cwd: params.cwd, - runDir: params.runDir, - experiment: params.config, - env: process.env, - }); - return; - } - await runExperimentSteps({ - label: 'setup', - steps: setup, - cwd: params.cwd, - experimentConfig: params.config, +function runPolicyKey(policy: EffectiveRunPolicy): string { + return JSON.stringify({ + trialsConfig: policy.trialsConfig, + threshold: policy.threshold, + timeoutSeconds: policy.timeoutSeconds, + budgetUsd: policy.budgetUsd, }); } -function buildExperimentStepCommand(step: ExperimentScript): { - readonly argv: readonly string[]; - readonly display: string; -} { - if (step.command && step.command.length > 0) { - return { argv: step.command, display: step.command.join(' ') }; - } - if (typeof step.script === 'string' && step.script.trim().length > 0) { - return { - argv: shellCommand(step.script), - display: step.script, - }; - } - if (Array.isArray(step.script) && step.script.length > 0) { - return { argv: step.script, display: step.script.join(' ') }; +function groupTestsByRunPolicy(params: { + readonly tests: readonly EvalTest[]; + readonly options: NormalizedOptions; + readonly defaultTrialsConfig?: TrialsConfig; + readonly defaultThreshold?: number; + readonly defaultTimeoutSeconds?: number; + readonly defaultBudgetUsd?: number; +}): readonly { readonly policy: EffectiveRunPolicy; readonly tests: readonly 
EvalTest[] }[] { + const groups = new Map(); + for (const test of params.tests) { + const policy = resolveEffectiveRunPolicy({ + test, + options: params.options, + defaultTrialsConfig: params.defaultTrialsConfig, + defaultThreshold: params.defaultThreshold, + defaultTimeoutSeconds: params.defaultTimeoutSeconds, + defaultBudgetUsd: params.defaultBudgetUsd, + }); + const key = runPolicyKey(policy); + const existing = groups.get(key); + if (existing) { + existing.tests.push(test); + } else { + groups.set(key, { policy, tests: [test] }); + } } - throw new Error('Experiment step must define command or script.'); + return [...groups.values()]; } -function shellCommand(script: string): readonly string[] { - return process.platform === 'win32' ? ['cmd', '/c', script] : ['sh', '-c', script]; +function readExperimentWorkspaceMode(value: unknown): 'pooled' | 'temp' | 'static' | undefined { + return value === 'pooled' || value === 'temp' || value === 'static' ? value : undefined; } -function resolveExperimentStepCwd( - cwd: string, - experimentConfig: ExperimentConfig | undefined, - stepCwd: string | undefined, -): string { - const base = experimentConfig?.sourcePath ? path.dirname(experimentConfig.sourcePath) : cwd; - if (!stepCwd) { - return base; - } - return path.isAbsolute(stepCwd) ? stepCwd : path.resolve(base, stepCwd); +function readExperimentWorkspacePath( + workspace: Record | undefined, +): string | undefined { + const value = workspace?.path; + return typeof value === 'string' && value.trim().length > 0 ? 
value.trim() : undefined; } -async function runExperimentCommand( - argv: readonly string[], - options: { - readonly cwd: string; - readonly env?: Record; - readonly timeoutMs?: number; - readonly label: string; - }, -): Promise { - if (argv.length === 0) { - throw new Error(`${options.label} command must not be empty.`); - } - - await new Promise((resolve, reject) => { - const cmd = argv[0]; - if (!cmd) { - reject(new Error(`${options.label} command must not be empty.`)); - return; - } - const args = argv.slice(1); - const child = spawn(cmd, args, { - cwd: options.cwd, - env: options.env ? { ...process.env, ...options.env } : process.env, - stdio: 'inherit', - }); - let completed = false; - const timeout = - options.timeoutMs !== undefined - ? setTimeout(() => { - if (!completed) { - completed = true; - child.kill('SIGKILL'); - reject(new Error(`${options.label} timed out after ${options.timeoutMs}ms`)); - } - }, options.timeoutMs) - : undefined; - - child.on('error', (error) => { - if (completed) { - return; - } - completed = true; - if (timeout !== undefined) { - clearTimeout(timeout); - } - reject(error); - }); - child.on('exit', (code) => { - if (completed) { - return; - } - completed = true; - if (timeout !== undefined) { - clearTimeout(timeout); - } - if (code === 0) { - resolve(); - } else { - reject(new Error(`${options.label} exited with code ${code ?? 'unknown'}`)); - } - }); - }); +function matchesTestFilter(id: string, filter: string | readonly string[]): boolean { + return typeof filter === 'string' + ? 
micromatch.isMatch(id, filter) + : filter.some((pattern) => micromatch.isMatch(id, pattern)); } type ProgressReporter = { @@ -1033,6 +892,7 @@ async function prepareFileMetadata(params: { readonly options: NormalizedOptions; readonly suiteFilter?: string | readonly string[]; }): Promise<{ + readonly options: NormalizedOptions; readonly testIds: readonly string[]; readonly testCases: readonly EvalTest[]; readonly selections: readonly { selection: TargetSelection; inlineTargetLabel: string }[]; @@ -1066,24 +926,32 @@ async function prepareFileMetadata(params: { filter: suiteFilter ?? options.filter, category, }); + const effectiveOptions = applyExperimentOptions(options, suite.experimentConfig); const testCases = - suiteFilter && options.filter - ? suite.tests.filter((testCase) => matchesTestFilter(testCase.id, options.filter ?? '')) + suiteFilter && effectiveOptions.filter + ? suite.tests.filter((testCase) => + matchesTestFilter(testCase.id, effectiveOptions.filter ?? ''), + ) : suite.tests; const testIds = testCases.map((value) => value.id); const suiteTargets = suite.targets; + const defaultBudgetUsd = + effectiveOptions.cliBudgetUsd === undefined + ? (effectiveOptions.budgetUsd ?? suite.budgetUsd) + : suite.budgetUsd; if (testCases.length === 0) { return { + options: effectiveOptions, testIds, testCases, selections: [], - trialsConfig: options.experimentTrialsConfig, + trialsConfig: effectiveOptions.experimentTrialsConfig, suiteTargets, yamlWorkers: suite.workers, yamlCache: suite.cacheConfig?.enabled, yamlCachePath: suite.cacheConfig?.cachePath, - budgetUsd: suite.budgetUsd, + budgetUsd: defaultBudgetUsd, failOnError: suite.failOnError, threshold: suite.threshold, tags: suite.metadata?.tags, @@ -1093,7 +961,7 @@ async function prepareFileMetadata(params: { let selections: { selection: TargetSelection; inlineTargetLabel: string }[]; - if (options.transcript) { + if (effectiveOptions.transcript) { // --transcript mode: bypass target resolution entirely. 
// Create a synthetic TargetSelection for the transcript provider. const transcriptSelection: TargetSelection = { @@ -1105,15 +973,15 @@ async function prepareFileMetadata(params: { }, targetName: 'transcript', targetSource: 'cli', - targetsFilePath: options.transcript, + targetsFilePath: effectiveOptions.transcript, }; selections = [ { selection: transcriptSelection, - inlineTargetLabel: `transcript (${path.basename(options.transcript)})`, + inlineTargetLabel: `transcript (${path.basename(effectiveOptions.transcript)})`, }, ]; - } else if (suite.inlineTarget && options.cliTargets.length === 0) { + } else if (suite.inlineTarget && effectiveOptions.cliTargets.length === 0) { const targetDefinition = suite.inlineTarget; const resolvedTarget = options.dryRun ? ({ @@ -1144,7 +1012,7 @@ async function prepareFileMetadata(params: { inlineTargetLabel: resolveTargetLabel(targetDefinition.name, resolvedTarget.name), }, ]; - } else if (suite.providerFactory && options.cliTargets.length === 0) { + } else if (suite.providerFactory && effectiveOptions.cliTargets.length === 0) { const taskTarget: ResolvedTarget = { kind: 'mock', name: 'custom-task', @@ -1165,10 +1033,10 @@ async function prepareFileMetadata(params: { ]; } else { // Determine target names: CLI --target flags override YAML - const cliTargets = options.cliTargets; + const cliTargets = effectiveOptions.cliTargets; const suiteTargets = suite.targets; const suiteTargetRefs = suite.targetRefs; - const experimentTargetRefs = options.experimentTargetRefs; + const experimentTargetRefs = effectiveOptions.experimentTargetRefs; // Resolve which target names to use (precedence: CLI/experiment > suite YAML targets > default) let targetNames: readonly string[]; @@ -1190,11 +1058,11 @@ async function prepareFileMetadata(params: { testFilePath, repoRoot, cwd, - explicitTargetsPath: options.targetsPath, - dryRun: options.dryRun, - dryRunDelay: options.dryRunDelay, - dryRunDelayMin: options.dryRunDelayMin, - dryRunDelayMax: 
options.dryRunDelayMax, + explicitTargetsPath: effectiveOptions.targetsPath, + dryRun: effectiveOptions.dryRun, + dryRunDelay: effectiveOptions.dryRunDelay, + dryRunDelayMin: effectiveOptions.dryRunDelayMin, + dryRunDelayMax: effectiveOptions.dryRunDelayMax, env: process.env, targetNames, targetRefs, @@ -1210,12 +1078,12 @@ async function prepareFileMetadata(params: { testFilePath, repoRoot, cwd, - explicitTargetsPath: options.targetsPath, - cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target, - dryRun: options.dryRun, - dryRunDelay: options.dryRunDelay, - dryRunDelayMin: options.dryRunDelayMin, - dryRunDelayMax: options.dryRunDelayMax, + explicitTargetsPath: effectiveOptions.targetsPath, + cliTargetName: targetNames.length === 1 ? targetNames[0] : effectiveOptions.target, + dryRun: effectiveOptions.dryRun, + dryRunDelay: effectiveOptions.dryRunDelay, + dryRunDelayMin: effectiveOptions.dryRunDelayMin, + dryRunDelayMax: effectiveOptions.dryRunDelayMax, env: process.env, }); @@ -1238,15 +1106,16 @@ async function prepareFileMetadata(params: { } return { + options: effectiveOptions, testIds, testCases, selections, - trialsConfig: options.experimentTrialsConfig, + trialsConfig: effectiveOptions.experimentTrialsConfig, suiteTargets, yamlWorkers: suite.workers, yamlCache: suite.cacheConfig?.enabled, yamlCachePath: suite.cacheConfig?.cachePath, - budgetUsd: suite.budgetUsd, + budgetUsd: defaultBudgetUsd, failOnError: suite.failOnError, threshold: suite.threshold, tags: suite.metadata?.tags, @@ -1293,6 +1162,7 @@ async function runSingleEvalFile(params: { readonly inlineTargetLabel: string; readonly testCases: readonly EvalTest[]; readonly trialsConfig?: TrialsConfig; + readonly agentTimeoutSeconds?: number; readonly matrixMode?: boolean; readonly budgetUsd?: number; readonly runBudgetTracker?: RunBudgetTracker; @@ -1320,6 +1190,7 @@ async function runSingleEvalFile(params: { inlineTargetLabel, testCases, trialsConfig, + agentTimeoutSeconds, 
matrixMode, budgetUsd, runBudgetTracker, @@ -1361,9 +1232,7 @@ async function runSingleEvalFile(params: { } const agentTimeoutMs = - options.agentTimeoutSeconds != null - ? Math.max(0, options.agentTimeoutSeconds) * 1000 - : undefined; + agentTimeoutSeconds != null ? Math.max(0, agentTimeoutSeconds) * 1000 : undefined; // Resolve workers: CLI flag > eval YAML execution.workers > target setting > default const workerPreference = workersOverride ?? options.workers; @@ -1440,7 +1309,7 @@ async function runSingleEvalFile(params: { failOnError, graderTarget: options.graderTarget, model: options.model, - threshold: options.threshold, + threshold: params.threshold, targetHooks: resolvedTargetSelection.targetHooks, replayRecording, providerFactory, @@ -1562,38 +1431,31 @@ export async function runEvalCommand( } let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution); - const resolvedExperiment = await resolveExperimentForRun({ - cwd, - explicitExperiment: options.experiment, - yamlDefaultExperiment: resolveDefaultExperimentReference(yamlConfig), - tsDefaultExperiment: normalizeTsDefaultExperiment(config), - }); - options = { - ...applyExperimentOptions(options, resolvedExperiment.config), - experiment: resolvedExperiment.name, - }; - - const suiteSelection = await resolveExperimentSuiteSelection( - options.experimentConfig?.suites, - cwd, - ); - const evalPathInputs = - input.testFiles.length > 0 - ? [...input.testFiles] - : suiteSelection - ? [...suiteSelection.testFiles] - : []; + const resolvedExperiment = resolveExperimentForRun(options.experiment); + const evalPathInputs = input.testFiles.length > 0 ? [...input.testFiles] : []; if (evalPathInputs.length === 0 && process.stdin.isTTY) { const { launchInteractiveWizard } = await import('./interactive.js'); await launchInteractiveWizard(); return undefined; } const resolvedTestFiles = await resolveEvalPaths(evalPathInputs, cwd); + const fallbackResultGroupName = + resolvedTestFiles.length === 1 ? 
deriveEvalResultGroupName(resolvedTestFiles[0]) : 'multi-eval'; + const primarySuite = + resolvedTestFiles.length > 0 + ? await loadTestSuite(resolvedTestFiles[0], repoRoot, { + verbose: options.verbose, + filter: options.filter, + category: deriveCategory(path.relative(cwd, resolvedTestFiles[0])), + }) + : undefined; + const resultGroupName = + resolvedTestFiles.length === 1 + ? (primarySuite?.metadata?.name ?? fallbackResultGroupName) + : fallbackResultGroupName; options = { ...options, - ...(suiteSelection !== undefined && { - suiteFiltersByEvalFile: suiteSelection.filtersByEvalFile, - }), + experiment: resolvedExperiment.name ?? resultGroupName, }; if (!process.env.AGENTV_EXPERIMENT) { @@ -1732,8 +1594,8 @@ export async function runEvalCommand( mkdirSync(runDir, { recursive: true }); outputPath = path.join(runDir, 'index.jsonl'); } else { - // Default: .agentv/results///, using "default" when unspecified. - outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment, runDirName); + // Default: .agentv/results///. + outputPath = buildDefaultOutputPathForExperiment(cwd, resultGroupName, runDirName); runDir = path.dirname(outputPath); } if (!process.env.AGENTV_RUN_TIMESTAMP) { @@ -1811,12 +1673,6 @@ export async function runEvalCommand( console.log(`Artifact directory: ${runDir}`); - await runExperimentSetup({ - config: options.experimentConfig, - cwd, - runDir, - }); - // Log file export paths if (options.otelFile) { console.log(`OTLP JSON file: ${path.resolve(options.otelFile)}`); @@ -1830,17 +1686,19 @@ export async function runEvalCommand( const seenTestCases = new Set(); const displayIdTracker = createDisplayIdTracker(); - // Run-level budget tracker: caps total cost across all eval files in this run. - const runBudgetTracker = options.budgetUsd ? new RunBudgetTracker(options.budgetUsd) : undefined; + // CLI --budget-usd is invocation-wide. Inline experiment.budget_usd is handled per eval file. + const runBudgetTracker = options.cliBudgetUsd + ? 
new RunBudgetTracker(options.cliBudgetUsd) + : undefined; if (runBudgetTracker) { console.log(`Run budget cap: $${runBudgetTracker.budgetCapUsd.toFixed(2)}`); } - // Each file gets the full worker budget — no splitting across files - const perFileWorkers = options.workers; + // Each file gets its own worker policy from CLI/config or that file's experiment block. const fileMetadata = new Map< string, { + readonly options: NormalizedOptions; readonly testIds: readonly string[]; readonly testCases: readonly EvalTest[]; readonly selections: readonly { @@ -1867,7 +1725,7 @@ export async function runEvalCommand( repoRoot, cwd, options, - suiteFilter: options.suiteFiltersByEvalFile?.get(path.resolve(testFilePath)), + suiteFilter: undefined, }); fileMetadata.set(testFilePath, meta); } @@ -1916,7 +1774,9 @@ export async function runEvalCommand( console.log(`Replay recording: ${path.resolve(options.recordReplay)}`); } - // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold. + // Resolve a global summary threshold only when the CLI supplies one or the first + // active eval file is the only source of runtime policy. Multi-file runs with + // inline thresholds are summarized from per-result execution status instead. const yamlThreshold = firstMeta?.threshold; const resolvedThreshold = options.threshold ?? yamlThreshold; if (resolvedThreshold !== undefined && (resolvedThreshold < 0 || resolvedThreshold > 1)) { @@ -2025,6 +1885,14 @@ export async function runEvalCommand( // Use only files that survived tag filtering. const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f)); + const singleActiveFileMetadata = + activeTestFiles.length === 1 ? 
fileMetadata.get(activeTestFiles[0]) : undefined; + const runExperimentMetadata = singleActiveFileMetadata?.options.experimentMetadata; + const hasPerFileRuntimeThresholds = + options.cliThreshold === undefined && + activeTestFiles.some( + (activeTestFile) => fileMetadata.get(activeTestFile)?.options.threshold !== undefined, + ); // --transcript: create a shared TranscriptProvider and validate entry count let transcriptProviderFactory: @@ -2063,7 +1931,7 @@ export async function runEvalCommand( evalFile, plannedTestCount: totalEvalCount, experiment: normalizeExperimentName(options.experiment), - experimentMetadata: options.experimentMetadata, + experimentMetadata: runExperimentMetadata, }); } @@ -2092,13 +1960,22 @@ export async function runEvalCommand( // Eval files run sequentially; within each file, --workers N test cases run in parallel. // This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file // workspace races without any grouping complexity. + let hasScopedRunPolicies = false; try { for (const testFilePath of activeTestFiles) { + const targetPrep = fileMetadata.get(testFilePath); + if (!targetPrep) { + throw new Error(`Missing metadata for ${testFilePath}`); + } + const fileOptions = targetPrep.options; + const fileBudgetTracker = + runBudgetTracker ?? + (fileOptions.budgetUsd !== undefined + ? 
new RunBudgetTracker(fileOptions.budgetUsd) + : undefined); // Run-level budget check: skip remaining files if budget exceeded - if (runBudgetTracker?.isExceeded()) { - const targetPrep = fileMetadata.get(testFilePath); - if (!targetPrep) continue; - const budgetMsg = `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`; + if (fileBudgetTracker?.isExceeded()) { + const budgetMsg = `Run budget exceeded ($${fileBudgetTracker.currentCostUsd.toFixed(4)} / $${fileBudgetTracker.budgetCapUsd.toFixed(4)})`; console.log(`\n⚠ ${budgetMsg} — skipping ${path.basename(testFilePath)}`); for (const { selection } of targetPrep.selections) { const skippedResults: EvaluationResult[] = targetPrep.testCases.map((testCase) => ({ @@ -2125,20 +2002,15 @@ export async function runEvalCommand( target: selection.targetName, })); for (const r of skippedResults) { - await outputWriter.append(withSourceMetadata(r, testFilePath, options)); + await outputWriter.append(withSourceMetadata(r, testFilePath, fileOptions)); } allResults.push( - ...skippedResults.map((r) => withSourceMetadata(r, testFilePath, options)), + ...skippedResults.map((r) => withSourceMetadata(r, testFilePath, fileOptions)), ); } continue; } - const targetPrep = fileMetadata.get(testFilePath); - if (!targetPrep) { - throw new Error(`Missing metadata for ${testFilePath}`); - } - // Run all targets concurrently (each target has its own worker limit) const targetResults = await Promise.all( targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => { @@ -2166,45 +2038,59 @@ export async function runEvalCommand( } try { - const result = await runSingleEvalFile({ - testFilePath, - cwd, - repoRoot, - options, - outputWriter, - otelExporter, - cache, - evaluationRunner, - workersOverride: perFileWorkers, - yamlWorkers: targetPrep.yamlWorkers, - progressReporter, - seenTestCases, - displayIdTracker, - selection, - inlineTargetLabel, - testCases: 
filteredTestCases, - trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig, - matrixMode: targetPrep.selections.length > 1, - budgetUsd: targetPrep.budgetUsd, - runBudgetTracker, - failOnError: targetPrep.failOnError, - threshold: resolvedThreshold, - providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory, + const runGroups = groupTestsByRunPolicy({ + tests: filteredTestCases, + options: fileOptions, + defaultTrialsConfig: fileOptions.transcript ? undefined : targetPrep.trialsConfig, + defaultThreshold: fileOptions.threshold ?? targetPrep.threshold, + defaultTimeoutSeconds: fileOptions.agentTimeoutSeconds, + defaultBudgetUsd: targetPrep.budgetUsd, }); + const groupResults: EvaluationResult[] = []; + for (const group of runGroups) { + hasScopedRunPolicies ||= group.policy.hasScopedOverride; + const result = await runSingleEvalFile({ + testFilePath, + cwd, + repoRoot, + options: fileOptions, + outputWriter, + otelExporter, + cache, + evaluationRunner, + workersOverride: fileOptions.workers, + yamlWorkers: targetPrep.yamlWorkers, + progressReporter, + seenTestCases, + displayIdTracker, + selection, + inlineTargetLabel, + testCases: group.tests, + trialsConfig: fileOptions.transcript ? undefined : group.policy.trialsConfig, + agentTimeoutSeconds: group.policy.timeoutSeconds, + matrixMode: targetPrep.selections.length > 1, + budgetUsd: group.policy.budgetUsd, + runBudgetTracker: fileBudgetTracker, + failOnError: targetPrep.failOnError, + threshold: group.policy.threshold, + providerFactory: transcriptProviderFactory ?? 
targetPrep.providerFactory, + }); + groupResults.push(...result.results); + } const evalFile = path.relative(cwd, testFilePath); const existingSummary = remoteEvalSummaries.find( (summary) => summary.evalFile === evalFile, ); if (existingSummary) { - existingSummary.results.push(...result.results); + existingSummary.results.push(...groupResults); } else { remoteEvalSummaries.push({ evalFile, - results: [...result.results], + results: [...groupResults], }); } - return result.results; + return groupResults; } catch (fileError) { // before_all or other setup failures should not abort the entire run. // Mark all tests in this file as errors and continue with other files. @@ -2239,7 +2125,7 @@ export async function runEvalCommand( target: selection.targetName, }, testFilePath, - options, + fileOptions, ), ); for (const errResult of errorResults) { @@ -2278,7 +2164,11 @@ export async function runEvalCommand( } const thresholdOpts = - resolvedThreshold !== undefined ? { threshold: resolvedThreshold } : undefined; + hasScopedRunPolicies || hasPerFileRuntimeThresholds + ? { thresholdLabel: 'configured threshold(s)', useExecutionStatus: true } + : resolvedThreshold !== undefined + ? { threshold: resolvedThreshold } + : undefined; const summary = calculateEvaluationSummary(summaryResults, thresholdOpts); console.log(formatEvaluationSummary(summary, thresholdOpts)); if ( @@ -2292,7 +2182,9 @@ export async function runEvalCommand( // Exit code: 2 when all tests are execution errors (no evaluation performed), // 1 when any test scored below threshold. 
const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total; - const thresholdFailed = resolvedThreshold !== undefined && summary.qualityFailureCount > 0; + const thresholdFailed = + (thresholdOpts?.useExecutionStatus === true || resolvedThreshold !== undefined) && + summary.qualityFailureCount > 0; // Print matrix summary when multiple targets were evaluated if (isMatrixMode && summaryResults.length > 0) { @@ -2312,6 +2204,7 @@ export async function runEvalCommand( const { writePerTestArtifacts } = await import('./artifact-writer.js'); await writePerTestArtifacts(allResults, runDir, { experiment: normalizeExperimentName(options.experiment), + resultGroup: resultGroupName, cwd, repoRoot, sourceTests, @@ -2320,7 +2213,7 @@ export async function runEvalCommand( const { summaryPath } = await aggregateRunDir(runDir, { evalFile, experiment: normalizeExperimentName(options.experiment), - experimentMetadata: options.experimentMetadata, + experimentMetadata: runExperimentMetadata, }); const indexPath = path.join(runDir, 'index.jsonl'); console.log(`Artifact workspace updated: ${runDir}`); @@ -2334,7 +2227,8 @@ export async function runEvalCommand( { evalFile, experiment: normalizeExperimentName(options.experiment), - experimentMetadata: options.experimentMetadata, + experimentMetadata: runExperimentMetadata, + resultGroup: resultGroupName, cwd, repoRoot, sourceTests, @@ -2437,13 +2331,6 @@ export async function runEvalCommand( await wipLoop.stopAndDeleteWipBranch(); } - await runExperimentSteps({ - label: 'script', - steps: options.experimentConfig?.scripts, - cwd, - experimentConfig: options.experimentConfig, - }); - return { executionErrorCount: summary.executionErrorCount, outputPath, diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 54d6d373c..13d64f508 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -84,7 +84,7 @@ function 
buildHistogram(values: readonly number[]): readonly HistogramBin[] { export function calculateEvaluationSummary( results: readonly EvaluationResult[], - options?: { threshold?: number }, + options?: { threshold?: number; thresholdLabel?: string; useExecutionStatus?: boolean }, ): EvaluationSummary { const total = results.length; @@ -139,11 +139,11 @@ export function calculateEvaluationSummary( const executionErrorCount = executionErrors.length; const scoreThreshold = options?.threshold; const passedCount = - scoreThreshold !== undefined + scoreThreshold !== undefined && options?.useExecutionStatus !== true ? qualityResults.filter((r) => r.score >= scoreThreshold).length : results.filter((r) => r.executionStatus === 'ok').length; const qualityFailureCount = - scoreThreshold !== undefined + scoreThreshold !== undefined && options?.useExecutionStatus !== true ? qualityResults.filter((r) => r.score < scoreThreshold).length : results.filter((r) => r.executionStatus === 'quality_failure').length; @@ -186,7 +186,7 @@ function formatScore(value: number): string { export function formatEvaluationSummary( summary: EvaluationSummary, - options?: { threshold?: number }, + options?: { threshold?: number; thresholdLabel?: string; useExecutionStatus?: boolean }, ): string { if (summary.total === 0) { return '\nNo results to summarize'; @@ -209,6 +209,7 @@ export function formatEvaluationSummary( // Overall verdict: all non-error cases must score >= per-test threshold. const gradedCount = summary.total - summary.executionErrorCount; const threshold = options?.threshold ?? 0.8; + const thresholdText = options?.thresholdLabel ?? `${Math.round(threshold * 100)}%`; const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total; const overallPassed = !allExecutionErrors && @@ -226,7 +227,7 @@ export function formatEvaluationSummary( } else { overallVerdict = overallPassed ? 'PASS' : 'FAIL'; verdictColor = overallPassed ? 
'\x1b[32m' : '\x1b[31m'; - verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${summary.total} scored >= ${Math.round(threshold * 100)}%, mean: ${formatScore(summary.mean)})`; + verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${summary.total} scored >= ${thresholdText}, mean: ${formatScore(summary.mean)})`; } lines.push('\n=================================================='); diff --git a/apps/cli/src/commands/eval/task-bundle.ts b/apps/cli/src/commands/eval/task-bundle.ts index a716eeda7..ae81fb721 100644 --- a/apps/cli/src/commands/eval/task-bundle.ts +++ b/apps/cli/src/commands/eval/task-bundle.ts @@ -736,6 +736,9 @@ function buildPortableEvalCase( if (test.metadata && Object.keys(test.metadata).length > 0) { testCase.metadata = rewritePathsDeep(test.metadata, rewrites); } + if (test.run && Object.keys(test.run).length > 0) { + testCase.run = rewritePathsDeep(test.run, rewrites); + } if (test.conversation_id) { testCase.conversation_id = test.conversation_id; } diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 6c9b67c1c..0bad4ce0f 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -190,6 +190,23 @@ describe('buildGradingArtifact', () => { passed_attempts: 1, total_attempts: 2, }); + + const passAll = buildGradingArtifact( + makeResult({ + aggregation: { + strategy: 'pass_all', + passedAttempts: 1, + totalAttempts: 2, + min: 0.4, + }, + }), + ); + expect(passAll.aggregation).toEqual({ + strategy: 'pass_all', + passed_attempts: 1, + total_attempts: 2, + min: 0.4, + }); }); it('uses top-level assertions when no grader scores', () => { @@ -1726,6 +1743,80 @@ describe('writeArtifactsFromResults', () => { expect(indexLine.grading_path).toBe('eval-top-months-chart/shared-id/run-1/grading.json'); }); + it('does not prefix artifact paths with suite when it matches the result group', async () 
=> { + const paths = await writeArtifactsFromResults( + [makeResult({ suite: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })], + testDir, + { resultGroup: 'eval-top-months-chart' }, + ); + + const [indexLine] = (await readFile(paths.indexPath, 'utf8')) + .trim() + .split('\n') + .map(JSON.parse); + expect(indexLine.suite).toBe('eval-top-months-chart'); + expect(indexLine.grading_path).toBe('shared-id/run-1/grading.json'); + }); + + it('prefixes imported suite artifacts even when the suite matches the result group', async () => { + const sourceTests = [ + { + id: 'shared-id', + suite: 'eval-top-months-chart', + source: { + evalFilePath: 'evals/imported.eval.yaml', + evalFileAbsolutePath: path.join(testDir, 'evals/imported.eval.yaml'), + importedSuiteName: 'eval-top-months-chart', + testId: 'shared-id', + testSnapshotYaml: 'id: shared-id', + graderDefinitions: [], + references: [], + }, + } as EvalTest, + ]; + const paths = await writeArtifactsFromResults( + [makeResult({ suite: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })], + testDir, + { resultGroup: 'eval-top-months-chart', sourceTests }, + ); + + const [indexLine] = (await readFile(paths.indexPath, 'utf8')) + .trim() + .split('\n') + .map(JSON.parse); + expect(indexLine.grading_path).toBe('eval-top-months-chart/shared-id/run-1/grading.json'); + }); + + it('uses the imported suite name for wrapper suite artifact paths', async () => { + const sourceTests = [ + { + id: 'shared-id', + suite: 'wrapper-suite', + source: { + evalFilePath: 'evals/imported.eval.yaml', + evalFileAbsolutePath: path.join(testDir, 'evals/imported.eval.yaml'), + importedSuiteName: 'imported-suite', + testId: 'shared-id', + testSnapshotYaml: 'id: shared-id', + graderDefinitions: [], + references: [], + }, + } as EvalTest, + ]; + const paths = await writeArtifactsFromResults( + [makeResult({ suite: 'wrapper-suite', testId: 'shared-id', target: 'baseline' })], + testDir, + { resultGroup: 'wrapper-suite', 
sourceTests }, + ); + + const [indexLine] = (await readFile(paths.indexPath, 'utf8')) + .trim() + .split('\n') + .map(JSON.parse); + expect(indexLine.artifact_dir).toBe('imported-suite/shared-id'); + expect(indexLine.grading_path).toBe('imported-suite/shared-id/run-1/grading.json'); + }); + it('writes task bundle artifacts with local source paths when source metadata is provided', async () => { const sourceRoot = path.join(testDir, 'src'); await mkdir(sourceRoot, { recursive: true }); diff --git a/apps/cli/test/commands/eval/result-layout.test.ts b/apps/cli/test/commands/eval/result-layout.test.ts index 97424c4c1..79dfd805d 100644 --- a/apps/cli/test/commands/eval/result-layout.test.ts +++ b/apps/cli/test/commands/eval/result-layout.test.ts @@ -9,7 +9,7 @@ import { } from '../../../src/commands/eval/result-layout.js'; describe('result layout', () => { - it('groups default run directories under the default experiment', () => { + it('groups default run directories under the default result group', () => { const cwd = '/repo'; const timestamp = new Date('2026-06-22T12:34:56.789Z'); @@ -18,7 +18,7 @@ describe('result layout', () => { ); }); - it('groups named experiment run directories under the experiment', () => { + it('groups named run directories under the result group', () => { expect(buildDefaultRunDirFromName('/repo', 'with-skills', '2026-run')).toBe( path.join('/repo', '.agentv', 'results', 'with-skills', '2026-run'), ); diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index bf19b2b0e..484095448 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -408,8 +408,8 @@ describe('agentv eval CLI', () => { await expectFileExists(path.join(outputDir, 'summary.json')); for (const row of canonicalResults) { expect(row.transcript_path).toMatch(/run-1\/transcript\.jsonl$/); - expect(row.transcript_raw_path).toMatch(/run-1\/transcript-raw\.jsonl$/); await 
expectFileExists(path.join(outputDir, row.transcript_path as string)); + expect(row.transcript_raw_path).toMatch(/run-1\/transcript-raw\.jsonl$/); await expectFileExists(path.join(outputDir, row.transcript_raw_path as string)); } } finally { @@ -521,11 +521,9 @@ describe('agentv eval CLI', () => { } }, 30_000); - it('runs a native experiment file with suite test selection and run knobs', async () => { + it('runs inline experiment config with suite test selection and run knobs', async () => { const fixture = await createFixture(); try { - const experimentsDir = path.join(fixture.suiteDir, 'experiments'); - await mkdir(experimentsDir, { recursive: true }); await writeFile( path.join(fixture.suiteDir, '.agentv', 'config.yaml'), 'eval_patterns:\n - sample.test.yaml\n - unused.test.yaml\n', @@ -545,38 +543,40 @@ describe('agentv eval CLI', () => { ].join('\n'), 'utf8', ); - const experimentPath = path.join(experimentsDir, 'default.yaml'); + const wrapperPath = path.join(fixture.suiteDir, 'native-exp.eval.yaml'); await writeFile( - experimentPath, + wrapperPath, [ 'name: native-exp', - 'target: cli-target', - 'suites:', - ' - ref: sample.test.yaml', - ' select:', - ' test_ids:', - ' - case-alpha', - 'timeout_seconds: 12', - 'workers: 4', - 'repeat:', - ' count: 2', - ' strategy: mean', - ' cost_limit_usd: 1.25', - 'early_exit: false', - 'setup:', - ' - script: "printf setup > ../experiment-setup.txt"', - 'scripts:', - ' - script: "printf script > ../experiment-script.txt"', + 'experiment:', + ' name: native-exp', + ' target: cli-target', + ' timeout_seconds: 12', + ' workers: 4', + ' threshold: 0.8', + ' budget_usd: 3', + ' repeat:', + ' count: 2', + ' strategy: mean', + ' cost_limit_usd: 1.25', + ' early_exit: false', + 'tests:', + ' - include: sample.test.yaml', + ' type: suite', + ' select: case-alpha', + ' run:', + ' threshold: 1.0', + ' timeout_seconds: 5', + ' budget_usd: 0.75', + ' repeat:', + ' count: 3', + ' strategy: pass_all', '', ].join('\n'), 'utf8', ); - 
const { stdout, exitCode } = await runCli(fixture, [ - 'eval', - '--experiment', - 'experiments/default.yaml', - ]); + const { stdout, exitCode } = await runCli(fixture, ['eval', wrapperPath]); expect(exitCode).toBe(0); const outputPath = extractOutputPath(stdout); @@ -585,36 +585,24 @@ describe('agentv eval CLI', () => { const diagnostics = await readDiagnostics(fixture); expect(diagnostics).toMatchObject({ target: 'cli-target', - agentTimeoutMs: 12000, + agentTimeoutMs: 5000, maxConcurrency: 4, evalCaseIds: ['case-alpha'], + budgetUsd: 0.75, + threshold: 1, trials: { - count: 2, - strategy: 'mean', - costLimitUsd: 1.25, - earlyExit: false, + count: 3, + strategy: 'pass_all', }, }); - await expectFileExists(path.join(fixture.suiteDir, 'experiment-setup.txt')); - await expectFileExists(path.join(fixture.suiteDir, 'experiment-script.txt')); - const benchmark = JSON.parse( await readFile(path.join(path.dirname(outputPath), 'summary.json'), 'utf8'), ) as { metadata?: Record }; expect(benchmark.metadata?.experiment).toBe('native-exp'); expect(benchmark.metadata?.experiment_config).toMatchObject({ name: 'native-exp', - source_path: experimentPath, target: 'cli-target', - suites: [ - { - ref: 'sample.test.yaml', - select: { - test_ids: ['case-alpha'], - }, - }, - ], repeat: { count: 2, strategy: 'mean', @@ -632,6 +620,76 @@ describe('agentv eval CLI', () => { } }, 30_000); + it('keeps inline experiment runtime isolated across multiple eval files', async () => { + const fixture = await createFixture(); + try { + const firstPath = path.join(fixture.suiteDir, 'first.eval.yaml'); + const secondPath = path.join(fixture.suiteDir, 'second.eval.yaml'); + await writeFile( + firstPath, + [ + 'name: first', + 'experiment:', + ' target: cli-target', + ' timeout_seconds: 11', + ' workers: 1', + ' budget_usd: 0.11', + 'tests:', + ' - id: first-case', + ' input: first', + ' criteria: ok', + '', + ].join('\n'), + 'utf8', + ); + await writeFile( + secondPath, + [ + 'name: second', + 
'experiment:', + ' target: file-target', + ' timeout_seconds: 22', + ' workers: 2', + ' budget_usd: 0.22', + 'tests:', + ' - id: second-case', + ' input: second', + ' criteria: ok', + '', + ].join('\n'), + 'utf8', + ); + + const { stdout, exitCode } = await runCli(fixture, ['eval', firstPath, secondPath]); + + expect(exitCode).toBe(0); + const outputPath = extractOutputPath(stdout); + expect(outputPath).toContain(`${path.sep}multi-eval${path.sep}`); + + const diagnostics = await readDiagnostics(fixture); + const calls = diagnostics.calls as Array>; + expect(calls).toHaveLength(2); + expect(calls[0]).toMatchObject({ + target: 'cli-target', + agentTimeoutMs: 11_000, + maxConcurrency: 1, + budgetUsd: 0.11, + runBudgetCapUsd: 0.11, + evalCaseIds: ['first-case'], + }); + expect(calls[1]).toMatchObject({ + target: 'file-target', + agentTimeoutMs: 22_000, + maxConcurrency: 2, + budgetUsd: 0.22, + runBudgetCapUsd: 0.22, + evalCaseIds: ['second-case'], + }); + } finally { + await rm(fixture.baseDir, { recursive: true, force: true }); + } + }, 30_000); + it('honors agentv.config.ts cache.path when response cache is enabled there', async () => { const fixture = await createFixture(); try { diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts index 32162888e..b7ce3515f 100644 --- a/apps/cli/test/fixtures/mock-run-evaluation.ts +++ b/apps/cli/test/fixtures/mock-run-evaluation.ts @@ -1,4 +1,4 @@ -import { mkdir, writeFile } from 'node:fs/promises'; +import { mkdir, readFile, writeFile } from 'node:fs/promises'; import path from 'node:path'; interface ResolvedTargetLike { @@ -25,6 +25,7 @@ interface RunEvaluationOptionsLike { readonly costLimitUsd?: number; readonly earlyExit?: boolean; }; + readonly threshold?: number; readonly budgetUsd?: number; readonly runBudgetTracker?: { readonly budgetCapUsd?: number; @@ -181,6 +182,7 @@ async function maybeWriteDiagnostics( budgetUsd: options.budgetUsd ?? 
null, maxConcurrency: options.maxConcurrency ?? null, trials: options.trials ?? null, + threshold: options.threshold ?? null, hasRunBudgetTracker: options.runBudgetTracker !== undefined, runBudgetCapUsd: options.runBudgetTracker?.budgetCapUsd ?? null, replayRecording: options.replayRecording ?? null, @@ -199,7 +201,17 @@ async function maybeWriteDiagnostics( resultCount: results.length, } satisfies Record; - await writeFile(diagnosticsPath, JSON.stringify(payload, null, 2), 'utf8'); + const priorCalls = await readFile(diagnosticsPath, 'utf8') + .then((raw) => { + const parsed = JSON.parse(raw) as { readonly calls?: unknown }; + return Array.isArray(parsed.calls) ? parsed.calls : [parsed]; + }) + .catch(() => []); + await writeFile( + diagnosticsPath, + JSON.stringify({ ...payload, calls: [...priorCalls, payload] }, null, 2), + 'utf8', + ); } async function maybeWritePromptDump( diff --git a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx index 1bbb4698b..e5db2e8f1 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx @@ -25,8 +25,8 @@ tests: |-------|----------|-------------| | `id` | Yes | Unique identifier for the test | | `criteria` | Yes | Description of what a correct response should contain | -| `input` | Yes | Input sent to the target (string, object, or message array). Alias: `input` | -| `expected_output` | No | Expected response for comparison (string, object, or message array). 
Alias: `expected_output` | +| `input` | Yes | Input sent to the target (string, object, or message array) | +| `expected_output` | No | Expected response for comparison (string, object, or message array) | | `execution` | No | Per-case execution overrides (for example `target`, `skip_defaults`) | | `workspace` | No | Per-case workspace config (overrides suite-level) | | `metadata` | No | Arbitrary key-value pairs passed to graders and workspace scripts | @@ -189,9 +189,29 @@ supporting files, see [Benchmark Provenance](/docs/guides/benchmark-provenance/) The `assertions` field defines graders directly on a test. It supports both deterministic assertion types and LLM-based rubric evaluation. +### Rubric Shorthand + +For semantic or agent-behavior checks, prefer plain strings in `assertions`. +AgentV groups the strings into a rubric grader automatically: + +```yaml +tests: + - id: bug-fix-review + criteria: Finds and fixes the bug + input: Review this failing parser implementation. + assertions: + - Identifies the root cause of the parser failure + - Proposes a concrete code change + - Adds or updates a regression test +``` + +Use this shape for qualitative requirements. It is less brittle than checking +for exact substrings in an agent response. + ### Deterministic Assertions -These graders run without an LLM call and produce binary (0 or 1) scores: +Use deterministic assertions for exact machine-verifiable outputs. These graders +run without an LLM call and produce binary (0 or 1) scores: | Type | Value | Description | |------|-------|-------------| @@ -278,9 +298,10 @@ tests: Assertion graders auto-generate a `name` when one is not provided (e.g., `contains-DENIED`, `is_json`). 
-### Rubric Assertions +### Advanced Rubric Assertions -Use `type: rubrics` with a `criteria` array to define structured LLM-graded evaluation criteria inline: +Use `type: rubrics` with a `criteria` array only when you need weights, +required flags, or score ranges: ```yaml tests: @@ -374,8 +395,8 @@ tests: When `assertions` is defined, only the declared graders run. No implicit grader is added because `criteria` or `expected_output` exists. Graders that are declared (such as -`llm-grader`, `code-grader`, or `rubrics`) receive the case context, including -`criteria` and `expected_output`, as input automatically. +plain rubric strings, `llm-grader`, `code-grader`, or `rubrics`) receive the case +context, including `criteria` and `expected_output`, as input automatically. This means a case with `expected_output` and only deterministic assertions evaluates only those deterministic assertions: @@ -394,11 +415,12 @@ If `assertions` contains only deterministic graders (like `contains` or `regex`) ``` Warning: Test 'my-test': criteria is defined but no grader in assertions -will evaluate it. Add 'type: llm-grader' to assertions, or remove criteria -if it is documentation-only. +will evaluate it. Add a rubric assertion string or another grader to assertions, +or remove criteria if it is documentation-only. ``` -To use `criteria` alongside deterministic checks, add a grader explicitly: +To use `criteria` alongside deterministic checks, add a rubric assertion string +or another grader explicitly: ```yaml tests: @@ -406,7 +428,7 @@ tests: criteria: Response is helpful and mentions the fix input: "Debug this function..." assertions: - - type: llm-grader # explicit — receives criteria automatically + - Explains why the bug happens - type: contains value: "fix" ``` @@ -423,7 +445,7 @@ tests: criteria: Response is helpful and mentions the fix input: "Debug this function..." 
assertions: - - type: llm-grader + - type: llm-grader # use explicit form for custom preprocessors preprocessors: - type: xlsx command: ["bun", "run", "scripts/preprocessors/xlsx-to-json.ts"] diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index e4d932da0..84a8bf437 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -5,21 +5,21 @@ sidebar: order: 1 --- -Evaluation files define the test cases and graders for an evaluation run. Runtime choices such as target matrices, setup, scripts, and repeat runs belong in [experiments](/docs/evaluation/experiments/). AgentV supports two eval formats: YAML and JSONL. +Evaluation files define the test cases, graders, workspace lifecycle, and inline runtime block for an evaluation run. Runtime choices such as target matrices, thresholds, budgets, and repeat runs belong under top-level [`experiment:`](/docs/evaluation/experiments/). Install, build, and reset commands belong under `workspace.hooks`; runner-specific setup belongs under `targets[].hooks`. AgentV supports two eval formats: YAML and JSONL. YAML is the canonical portable model. TypeScript helpers, generated fixtures, and Python scripts should lower to the same YAML/JSONL shapes rather than inventing a separate eval contract. ## Suites -An eval file is a **suite**: it binds test cases to task context, assertions, and reusable fixtures. Runtime choices such as target matrices, setup, and run counts belong in experiments. Test cases can be inline or loaded from an external file via `tests: ./cases.yaml` for reuse across suites. +An eval file is a **suite**: it binds test cases to task context, assertions, reusable fixtures, and the inline runtime block. Test cases can be inline, loaded from an external file via `tests: ./cases.yaml`, or imported with `tests[].include`. ## YAML Format -The primary format. 
A single file contains metadata, execution config, and tests: +The primary format. A single file contains metadata, inline runtime config, and tests: ```yaml description: Math problem solving evaluation -execution: +experiment: target: default assertions: @@ -40,9 +40,9 @@ tests: |-------|-------------| | `description` | Human-readable description of the evaluation | | `suite` | Optional suite identifier | -| `execution` | Default execution config (`target`, `fail_on_error`, `threshold`, etc.) | +| `experiment` | Runtime policy (`target`, `targets`, `workers`, `repeat`, `threshold`, `timeout_seconds`, `budget_usd`, etc.) | | `workspace` | Suite-level workspace config — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config). Repo entries declare identity and checkout pins; acquisition is covered in [Workspace Architecture](/docs/guides/workspace-architecture/#repo-provenance-vs-acquisition). | -| `tests` | Array of individual tests, or a string path to an external file or directory | +| `tests` | Array of individual tests, include entries, or a string path to an external file or directory. Tests and include entries may use scoped `run:` overrides for `threshold`, `repeat`, `timeout_seconds`, and `budget_usd`. | | `assertions` | Suite-level graders appended to each test unless `execution.skip_defaults: true` is set on the test | | `input` | Suite-level input messages prepended to each test's input unless `execution.skip_defaults: true` is set on the test | @@ -79,6 +79,9 @@ tests: ### Suite-level Assertions The `assertions` field is the canonical way to define suite-level graders. Suite-level assertions are appended to every test's graders unless a test sets `execution.skip_defaults: true`. +For semantic or agent-behavior checks, prefer plain assertion strings first; +AgentV treats them as rubric criteria. 
Use deterministic assertions or code +graders when the expected output is exact or requires programmatic inspection. ```yaml description: API response validation @@ -87,6 +90,8 @@ assertions: required: true - type: contains value: "status" + - Correctly answers the user's question + - Explains the reasoning clearly tests: - id: health-check @@ -94,7 +99,10 @@ tests: input: Check API health ``` -`assertions` supports all grader types, including deterministic assertion types (`contains`, `regex`, `is_json`, `equals`) and `rubrics`. See [Tests](/docs/evaluation/eval-cases/#per-test-assertions) for per-test assertions usage. +`assertions` supports rubric shorthand strings, deterministic assertion types +(`contains`, `regex`, `is_json`, `equals`), `rubrics`, LLM graders, and code +graders. See [Tests](/docs/evaluation/eval-cases/#per-test-assertions) for +per-test assertions usage. ### Assertion Includes @@ -188,8 +196,8 @@ Per-test `input_files` overrides the suite-level value (it does not merge). To o ### PROMPT.md Fallback -For Vercel-style eval directories, a test may omit `input` and keep the task -prompt in Markdown instead. AgentV resolves the prompt in this order: +For directory-style evals, a test may omit `input` and keep the task prompt in +Markdown instead. AgentV resolves the prompt in this order: 1. If the effective `input_files` contains a file named exactly `PROMPT.md`, that file becomes the test prompt. 2. Otherwise, if a `PROMPT.md` exists beside the `EVAL.yaml`, that file becomes the test prompt. @@ -222,12 +230,25 @@ Instead of inlining tests in the same file, you can point `tests` to an external ```yaml name: my-eval description: My evaluation suite -execution: +experiment: target: default tests: ./cases.yaml ``` -The path is resolved relative to the eval file's directory. The external file should contain a YAML array of test objects or a JSONL file with one test per line. +The path is resolved relative to the eval file's directory. 
The external file +should contain a YAML array of test objects or a JSONL file with one test per +line. String entries inside a `tests:` list work the same way and may use direct +paths, directories, or globs: + +```yaml +tests: + - ./cases/*.cases.yaml + - include: ./suites/*.eval.yaml + type: suite +``` + +String shorthand is raw-case-only. Import eval suites with object entries using +`include:` and `type: suite`. ### Tests as Directory Path @@ -360,7 +381,7 @@ An optional YAML sidecar file provides metadata and execution config. Place it a ```yaml description: Math evaluation dataset suite: math-tests -execution: +experiment: target: azure-base assertions: - name: correctness diff --git a/apps/web/src/content/docs/docs/evaluation/experiments.mdx b/apps/web/src/content/docs/docs/evaluation/experiments.mdx index 23a9f5cd2..d85077d87 100644 --- a/apps/web/src/content/docs/docs/evaluation/experiments.mdx +++ b/apps/web/src/content/docs/docs/evaluation/experiments.mdx @@ -5,175 +5,219 @@ sidebar: order: 2 --- -Experiments define **how** eval cases run: target or target matrix, setup, -scripts, timeout, sandbox, case filters, and repeat-run policy. Eval files stay -focused on **what** is tested: prompts, datasets, assertions, and task fixtures. - -## Experiment YAML - -Committed experiments conventionally live under `experiments/`: +AgentV eval files are the only runnable authoring artifact. Use top-level +`experiment:` inside `eval.yaml` for runtime choices: targets, workers, +timeout, sandbox/runtime knobs, budgets, thresholds, and repeat-run policy. 
```yaml -name: baseline -target: codex-gpt5 -suites: - - ref: evals/support-regression.eval.yaml - select: - test_ids: - - refund-eligibility - - missing-order-date -timeout_seconds: 720 -repeat: - count: 4 - strategy: pass_at_k - cost_limit_usd: 2.00 -setup: - - script: bun install -scripts: - - build +name: support-regression + +experiment: + targets: [codex-gpt5, claude-sonnet] + workers: 2 + timeout_seconds: 720 + repeat: + count: 4 + strategy: pass_at_k + cost_limit_usd: 2.00 + +workspace: + hooks: + before_all: + command: ["bash", "-lc", "bun install && bun run build"] + +tests: + - id: refund-eligibility + input: Can this customer get a refund? + criteria: Applies the refund policy correctly ``` -Wire fields use `snake_case`. AgentV translates to internal `camelCase` when it -loads the file. +`execution:` is accepted only as a legacy top-level alias for existing eval +files. Do not use both `experiment:` and `execution:` in the same eval. -## Suites and test selection +## Tests Imports -Eval files keep `tests[]` as the canonical atomic test definition. Experiments -reference one or more reusable eval suites through `suites[]`: +Use `tests[]` for composition, imports, and selection. ```yaml -suites: - - ref: evals/support-regression.eval.yaml - - ref: evals/billing-*.eval.yaml -``` - -Use suite-local `select.test_ids[]` to run only specific tests from a suite. 
The -values match `tests[].id` inside that suite and use the same glob semantics as -`--test-id`: - -```yaml -suites: - - ref: evals/support-regression.eval.yaml +tests: + - include: evals/support/*.eval.yaml + type: suite select: test_ids: - refund-* - missing-order-date + tags: regression + metadata: + priority: high + run: + threshold: 1.0 + repeat: + count: 2 + strategy: pass_all + - include: cases/*.cases.yaml + type: tests + - include: cases/regression.jsonl + type: tests + - cases/smoke/*.cases.yaml ``` -## Repeat runs +`type: suite` preserves the imported suite's task contract: metadata, +`workspace`, shared `input`, shared `assertions`, and tests. The child suite's +`experiment:` or legacy `execution:` runtime block is ignored; the parent eval's +runtime block controls the run. -`repeat` is the full AgentV replacement for the old eval-level -`execution.trials` shape. It supports the same core strategies: +`type: tests` imports only raw test entries. It intentionally drops shared +context from an imported eval suite, so parent suite fields apply to those raw +cases. -```yaml -repeat: - count: 3 - strategy: mean - cost_limit_usd: 1.50 -``` +`tests[].select.test_ids` filters imported test IDs with glob patterns. +`tests[].select.tags` filters each imported case's effective `metadata.tags`. +Effective case tags are suite-first and deduped: +`suite.tags + suite.metadata.tags + test.metadata.tags`. Top-level suite `tags` +still remain suite identity metadata for discovery and reporting; selection reads +the merged case metadata view. `tests[].select.metadata` filters case metadata by +key/value, where selector values may be scalars or lists. Globbed include paths +are resolved in deterministic path order, then test order. -Supported strategies: +String-valued `tests` and string entries inside `tests[]` are raw-case import +shorthand. They are equivalent to `include` with `type: tests` and may point at +raw case files, directories, or globs. 
Importing another eval suite must use +object form with `include:` and `type: suite`. -| Strategy | Behavior | -| --- | --- | -| `pass_at_k` | Uses the best passing attempt; early-exits by default unless the experiment sets `early_exit: false` | -| `mean` | Aggregates repeated attempt scores by mean | -| `confidence_interval` | Uses the lower bound of a 95% confidence interval as the conservative score | +Suite imports are resolved as a deterministic include graph. Circular `type: +suite` imports fail validation with the import chain; raw-case shorthand does +not recursively load suite runtime blocks. -`repeat.cost_limit_usd` caps repeat-run spend. `repeat.costLimitUsd` is also -accepted for prerelease trial-schema parity, but new YAML should use -`cost_limit_usd`. +Imported suite artifacts are nested under the source suite name inside a wrapper +eval result directory, for example +`.agentv/results/<result-group>/<run-id>/<imported-suite>/<test-id>/...`. +Direct tests owned by the wrapper eval and raw case imports live directly under +`<test-id>/...`. -## Vercel-compatible shorthand +## Scoped Run Overrides -AgentV also accepts Vercel-style top-level `runs` and `early_exit`: +Use scoped `run:` blocks for result interpretation and scheduling policies that +vary by include group or test case. Precedence is: -```yaml -runs: 4 -early_exit: true +```text +test.run > tests[].run > experiment ``` -This is shorthand for a `pass_at_k` repeat run. Use `repeat` when you need -AgentV-specific strategy or cost-limit fields. - -Do not set both `repeat` and `runs` in the same experiment. `repeat` is the -canonical AgentV shape; `runs` exists only for Vercel-compatible shorthand. - -Vercel defines the requested run count at the experiment level. Some result -summaries show fewer actual runs for a case because `earlyExit: true` stops -remaining attempts after the first pass; smoke runs can also force one run. -AgentV follows the same experiment-level placement while keeping the richer -`repeat` block for AgentV strategies. 
- -Repeat-enabled cases use a Vercel-style physical layout with AgentV aggregate -provenance: - -```text -/index.jsonl -/summary.json -///summary.json -///run-1/result.json -///run-1/grading.json -///run-1/metrics.json -///run-1/timing.json -///run-1/transcript.json -///run-1/transcript-raw.jsonl -///run-1/outputs/answer.md +```yaml +experiment: + target: agent + threshold: 0.8 + repeat: + count: 3 + strategy: pass_at_k + +tests: + - include: ./evals/flaky-agentic/**/*.eval.yaml + type: suite + select: + tags: [agentic] + run: + repeat: + count: 3 + strategy: pass_at_k + + - include: ./evals/regression/**/*.eval.yaml + type: suite + select: + tags: [must-pass] + run: + threshold: 1.0 + repeat: + count: 2 + strategy: pass_all + + - id: critical-case + input: "..." + criteria: Must pass exactly + run: + threshold: 1.0 + repeat: + count: 1 ``` -The repeated case aggregate folder uses `summary.json` for run-count, pass-rate, -fingerprint, and flattened snake_case timing fields such as -`mean_duration_ms`. -Each `run-N/result.json` is the per-attempt manifest and includes -`grading_path`, transcript/output paths, and embedded timing/o11y metrics. Each -attempt also keeps AgentV `grading.json`, `metrics.json`, and `timing.json` -sidecars for detailed inspection. -Root `index.jsonl` and root `summary.json` remain stable for existing CI -summary scripts and uploaded artifact consumers. +Scoped `run:` supports `threshold`, `repeat`, `timeout_seconds`, and +`budget_usd`. Candidate-changing fields such as `target` and `targets` stay +parent-level under `experiment:`. Workspace mutation belongs in +`workspace.hooks`, and runner-specific setup belongs in `targets[].hooks`. -## Targets and setup +## Lifecycle Ownership -Experiments reuse targets from `.agentv/targets.yaml`; they do not define a new -provider registry. +`experiment:` configures evaluation policy. It does not own commands that +prepare files, dependencies, repos, or target-specific runner state. 
+ +| Need | Put it in | +| --- | --- | +| Install dependencies, build the repo, seed files | `workspace.hooks.before_all` | +| Reset or apply per-case state | `workspace.hooks.before_each` / `workspace.hooks.after_each` | +| Configure an agent runner or provider variant | `targets[].hooks` | +| Choose targets, repeats, pass policy, budget, threshold | `experiment` | ```yaml +workspace: + hooks: + before_all: + command: ["bash", "-lc", "bun install && bun run build"] + targets: - - copilot - - claude - - name: gemini-with-hooks - use_target: gemini + - name: agent-with-skills + provider: codex + hooks: + before_each: + command: ["sh", "-c", "cp -R skills \"{{workspace_path}}/.codex/skills\""] + +experiment: + target: agent-with-skills + repeat: + count: 3 + strategy: pass_at_k ``` -Setup and scripts belong on the experiment because they are often the A/B -variable: +## Repeat Runs + +`repeat` supports the same core strategies as repeated attempts: ```yaml -setup: - - script: cp skills/with-docs/AGENTS.md AGENTS.md -scripts: - - script: bun test - timeout_seconds: 120 +experiment: + repeat: + count: 3 + strategy: mean + cost_limit_usd: 1.50 ``` -## Running experiments - -Run a specific experiment: +Supported strategies: -```bash -bun agentv eval --experiment experiments/default.yaml -``` +| Strategy | Behavior | +| --- | --- | +| `pass_at_k` | Uses the best passing attempt; early-exits by default unless `early_exit: false` is set | +| `pass_all` | Uses the weakest attempt score, so every repeated attempt must meet the threshold | +| `mean` | Aggregates repeated attempt scores by mean | +| `confidence_interval` | Uses the lower bound of a 95% confidence interval as the conservative score | -If no experiment is passed, AgentV checks `.agentv/config.yaml` for a default: +AgentV also accepts `runs` and `early_exit` under `experiment:` as shorthand for +repeat-run policy: ```yaml -experiments: - default: experiments/default.yaml +experiment: + runs: 4 + early_exit: true 
``` -If no default is configured, AgentV keeps the old behavior and uses the -`default` experiment label. +Do not set both `repeat` and `runs` in the same runtime block. + +## Result Layout + +Default eval runs write to: -## Schema +```text +.agentv/results/// +``` -The generated JSON Schema is available at -`skills-data/agentv-eval-writer/references/experiment-schema.json`. +Imported source suite metadata appears in `index.jsonl` rows and manifests. +AgentV does not add a redundant suite directory when the result group is already +the eval name. diff --git a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx index 91f2f937d..ddd7ceedb 100644 --- a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx +++ b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx @@ -43,7 +43,7 @@ Create `./evals/example.yaml`: ```yaml description: Math problem solving evaluation -execution: +experiment: target: default tests: diff --git a/apps/web/src/content/docs/docs/graders/custom-graders.mdx b/apps/web/src/content/docs/docs/graders/custom-graders.mdx index 8c958eba0..42e9d9865 100644 --- a/apps/web/src/content/docs/docs/graders/custom-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/custom-graders.mdx @@ -57,11 +57,9 @@ tests: criteria: Generates correct Python code input: Write a sorting function assertions: - - type: rubrics - criteria: - - Code is syntactically valid - - Handles edge cases (empty list, single element) - - Uses appropriate algorithm + - Code is syntactically valid + - Handles edge cases such as empty lists and single-element lists + - Uses an appropriate algorithm - name: syntax_check type: code-grader command: [./validators/check_syntax.py] @@ -83,6 +81,7 @@ If any grader has `required: true` (or `required: `) and scores below ## Best Practices +- **Use plain assertion strings first for semantic checks** — AgentV treats them as rubric criteria - **Use code 
graders for deterministic checks** — exact value matching, format validation, schema compliance - **Use LLM graders for semantic evaluation** — meaning, quality, helpfulness - **Use rubrics for structured multi-criteria grading** — when you need weighted, itemized scoring diff --git a/apps/web/src/content/docs/docs/targets/configuration.mdx b/apps/web/src/content/docs/docs/targets/configuration.mdx index b2a2e0317..6ff210675 100644 --- a/apps/web/src/content/docs/docs/targets/configuration.mdx +++ b/apps/web/src/content/docs/docs/targets/configuration.mdx @@ -91,6 +91,9 @@ targets: ### Workspace Lifecycle Hooks Run commands and reset/cleanup policies at different lifecycle points using `workspace.hooks`. This can be defined at the suite level (applies to all tests) or per test (overrides suite-level). +Use workspace hooks for repo preparation such as dependency installs, builds, +fixture generation, and per-case resets. Use target hooks for runner-specific +setup. ```yaml workspace: @@ -239,12 +242,12 @@ Use `cwd` on a target to run in an existing directory (shared across tests). If Eval files can define per-target hooks that run setup/teardown scripts to customize the workspace for each target variant. This enables comparing different harness configurations (e.g., baseline vs with-plugins) in a single eval file. -Targets do not declare `repos`. Repositories belong to the shared eval workspace so every target runs in the same world; target hooks customize the harness under evaluation. Use hooks for per-target setup such as copying skills, enabling wrappers, or changing provider-local config. +Targets do not declare `repos`. Repositories belong to the shared eval workspace so every target runs in the same world; target hooks customize the harness under evaluation. Use hooks for per-target setup such as copying skills, enabling wrappers, or changing provider-local config. Keep installs, builds, fixture generation, and case resets in `workspace.hooks`. 
-Target hooks are defined in the eval file's `execution.targets` array using object form: +Target hooks are defined in the eval file's `experiment.targets` array using object form: ```yaml -execution: +experiment: targets: - baseline # string shorthand (no hooks) - name: with-skills # object form with hooks diff --git a/docs/adr/0006-separate-experiments-from-eval-definitions.md b/docs/adr/0006-separate-experiments-from-eval-definitions.md index 103872e8c..752495b71 100644 --- a/docs/adr/0006-separate-experiments-from-eval-definitions.md +++ b/docs/adr/0006-separate-experiments-from-eval-definitions.md @@ -59,8 +59,6 @@ experiment: strategy: pass_at_k timeout_seconds: 900 budget_usd: 2.00 - setup: - - command: ./scripts/install-skills.sh tests: - include: ./evals/cargowise/**/*.eval.yaml @@ -100,12 +98,32 @@ The old experiment runtime fields are ported into the parent eval file: - repeat policy such as `count` and `pass_at_k` - timeout - budget -- runtime setup commands - other run-time controls that do not define the task itself Suite or case workspace fields remain task-owned when they define what is being -evaluated. Experiment setup remains parent-owned when it changes the candidate -or run condition being measured against the same task. +evaluated. + +## Lifecycle Ownership + +`experiment:` owns evaluation policy, not lifecycle mutation. Commands that +prepare or reset files, dependencies, repos, or runner-specific configuration +must stay with the lifecycle surface that actually owns that work: + +- `workspace.hooks` prepare or reset the workspace under test. Dependency + installs, builds, fixture generation, case resets, and repo seeding belong + here. +- `targets[].hooks` prepare the target runner or provider variant. Agent + discovery files, provider-specific config, and target-specific harness setup + belong here. 
+- `experiment:` selects runtime policy: target or target matrix, workers, + repeat strategy, threshold, timeout, budget, sandbox/runtime knobs, and result + identity. + +This differs from external experiment formats that allow generic scripts on the +experiment object. AgentV keeps those scripts in workspace or target hooks so a +multi-file command such as `agentv eval a.eval.yaml b.eval.yaml` remains a batch +of independent eval-suite runs, rather than one implicit wrapper experiment with +shared mutable setup. ## Tests Import Surface @@ -259,10 +277,11 @@ scheduling: - `timeout_seconds` - `budget_usd` -Fields that change the candidate or system under test, such as `target`, -`targets`, runtime setup, and workspace mutation, should remain at the parent -`experiment:` level unless a later ADR accepts narrower per-group semantics. -Keeping candidate-changing knobs out of scoped overrides preserves comparable +Fields that change the candidate or system under test, such as `target` and +`targets`, should remain at the parent `experiment:` level unless a later ADR +accepts narrower per-group semantics. Workspace mutation stays in +`workspace.hooks`; runner setup stays in `targets[].hooks`. Keeping +candidate-changing knobs out of scoped overrides preserves comparable experiment groups and avoids silently mixing different systems under one result group. diff --git a/docs/plans/2026-06-23-002-experiments-separation-plan.md b/docs/plans/2026-06-23-002-experiments-separation-plan.md deleted file mode 100644 index a2e22f6fe..000000000 --- a/docs/plans/2026-06-23-002-experiments-separation-plan.md +++ /dev/null @@ -1,407 +0,0 @@ ---- -title: "feat: Separate experiments from eval definitions" -type: feat -date: 2026-06-23 -origin: docs/adr/0006-separate-experiments-from-eval-definitions.md ---- - -# feat: Separate experiments from eval definitions - -## Summary - -AgentV should separate eval task definitions from experiment run definitions. 
-Eval YAML stays the canonical authoring layer for prompts, datasets, assertions, -and task fixtures. Experiments become first-class committed files that select the -agent or target under test, model, harness options, setup injection, run knobs, -and case filter. - -This should ship in phases. Phase 1 adds the non-breaking foundation: -experiment contract types, default experiment resolution, and artifact -attribution by resolved experiment name. Later phases move runtime controls out -of `eval.yaml execution`, teach the CLI to run experiment matrices, and record -full experiment provenance and fingerprints in run bundles. - -## Problem Frame - -Today `experiment` is a string label passed through -`packages/core/src/evaluation/evaluate.ts`, `packages/core/src/evaluation/run-artifacts.ts`, -`packages/core/src/evaluation/results-repo.ts`, and -`packages/core/src/evaluation/trace-envelope.ts`. Runtime choices are still -scattered across CLI flags, TypeScript config, `.agentv/config.yaml`, and -`eval.yaml execution`. - -That makes it hard to review A/B variants such as `baseline` versus -`agents-md`, because the variable under test can be hidden inside the eval -definition. The desired model is: - -- Eval equals what is tested. -- Experiment equals how and with what it is run. -- Setup that changes the agent's environment belongs to the experiment. -- Existing eval-only repositories keep working through a default experiment - fallback. - -## Requirements - -- R1. Existing `eval.yaml` files validate and run without modification. -- R2. Experiment wire config uses `snake_case`; TypeScript types use - `camelCase`. -- R3. `config.yaml` can point at a default experiment, with no pointer falling - back to the current `default` experiment label. -- R4. `agentv eval --experiment