diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index 26dc9471a..ac89d779b 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -211,6 +211,7 @@ export async function writePerTestArtifacts(
     experiment?: string;
     runId?: string;
     duplicatePolicy?: ExportDuplicatePolicy;
+    resultGroup?: string;
     cwd?: string;
     repoRoot?: string;
     sourceTests?: readonly EvalTest[];
@@ -219,6 +220,7 @@ export async function writePerTestArtifacts(
 ): Promise<void> {
   await writeCorePerTestArtifacts(results, outputDir, {
     experiment: options?.experiment,
+    resultGroup: options?.resultGroup,
     runId: options?.runId,
     duplicatePolicy: options?.duplicatePolicy,
     sourceTests: options?.sourceTests,
@@ -236,6 +238,7 @@ export async function writeArtifactsFromResults(
     plannedTestCount?: number;
     runId?: string;
     duplicatePolicy?: ExportDuplicatePolicy;
+    resultGroup?: string;
     cwd?: string;
     repoRoot?: string;
     sourceTests?: readonly EvalTest[];
@@ -253,6 +256,7 @@ export async function writeArtifactsFromResults(
     plannedTestCount: options?.plannedTestCount,
     runId: options?.runId,
     duplicatePolicy: options?.duplicatePolicy,
+    resultGroup: options?.resultGroup,
     sourceTests: options?.sourceTests,
     additionalArtifacts: createTaskBundleArtifactsWriter(options),
   });
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index ccb767b71..182a3a3ed 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -1,4 +1,3 @@
-import { spawn } from 'node:child_process';
 import { constants, existsSync, mkdirSync } from 'node:fs';
 import { access, readFile } from 'node:fs/promises';
 import { createRequire as createNodeRequire } from 'node:module';
@@ -7,6 +6,7 @@ import { pathToFileURL } from 'node:url';
 
 import {
   DEFAULT_THRESHOLD,
+  type EvalRunOverride,
   type EvalTargetRef,
   type EvalTest,
   type EvaluationCache,
@@ -14,7 +14,6 @@ import {
   type ExecutionDefaults,
   type ExperimentArtifactMetadata,
   type ExperimentConfig,
-  type ExperimentScript,
   type FailOnError,
   type OtelTraceExporter as OtelTraceExporterType,
   type ResolvedTarget,
@@ -25,14 +24,10 @@ import {
   buildTraceFromMessages,
   runEvaluation as defaultRunEvaluation,
   deriveCategory,
-  deriveExperimentNameFromPath,
   ensureVSCodeSubagents,
-  isExperimentFileReference,
   loadConfig,
-  loadExperimentConfig,
   loadTestSuite,
   loadTsConfig,
-  resolveDefaultExperimentReference,
   resolveTargetDefinition,
   shouldEnableCache,
   shouldSkipCacheForTemperature,
@@ -124,6 +119,7 @@ interface NormalizedOptions {
   readonly dryRunDelayMin: number;
   readonly dryRunDelayMax: number;
   readonly agentTimeoutSeconds?: number;
+  readonly cliAgentTimeoutSeconds?: number;
   readonly maxRetries: number;
   readonly cache: boolean;
   readonly cachePath?: string;
@@ -150,6 +146,7 @@ interface NormalizedOptions {
   readonly model?: string;
   readonly outputMessages: number | 'all';
   readonly threshold?: number;
+  readonly cliThreshold?: number;
   readonly tags: readonly string[];
   readonly excludeTags: readonly string[];
   readonly transcript?: string;
@@ -160,8 +157,8 @@ interface NormalizedOptions {
   readonly experimentMetadata?: ExperimentArtifactMetadata;
   readonly experimentTargetRefs?: readonly EvalTargetRef[];
   readonly experimentTrialsConfig?: TrialsConfig;
-  readonly suiteFiltersByEvalFile?: ReadonlyMap<string, string | readonly string[]>;
   readonly budgetUsd?: number;
+  readonly cliBudgetUsd?: number;
   readonly sourceMetadataByEvalFile?: ReadonlyMap<string, Record<string, unknown>>;
   readonly resultsOverrides?: ResultsPublishOverrides;
 }
@@ -422,6 +419,8 @@ function normalizeOptions(
   }
 
   const cliAgentTimeout = normalizeOptionalNumber(rawOptions.agentTimeout);
+  const cliThreshold = normalizeOptionalNumber(rawOptions.threshold);
+  const cliBudgetUsd = normalizeOptionalNumber(rawOptions.budgetUsd);
   const configAgentTimeoutSeconds =
     config?.execution?.agentTimeoutMs != null ? config.execution.agentTimeoutMs / 1000 : undefined;
 
@@ -479,6 +478,7 @@ function normalizeOptions(
     dryRunDelayMin: normalizeNumber(rawOptions.dryRunDelayMin, 0),
     dryRunDelayMax: normalizeNumber(rawOptions.dryRunDelayMax, 0),
     agentTimeoutSeconds: cliAgentTimeout ?? configAgentTimeoutSeconds,
+    cliAgentTimeoutSeconds: cliAgentTimeout,
     maxRetries: cliMaxRetries ?? configMaxRetries ?? 2,
     cache: cliCache,
     cachePath: cliCachePath,
@@ -523,14 +523,16 @@ function normalizeOptions(
     graderTarget: normalizeString(rawOptions.graderTarget),
     model: normalizeString(rawOptions.model),
     outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
-    threshold: normalizeOptionalNumber(rawOptions.threshold),
+    threshold: cliThreshold,
+    cliThreshold,
     tags: normalizeStringArray(rawOptions.tag),
     excludeTags: normalizeStringArray(rawOptions.excludeTag),
     transcript: normalizeString(rawOptions.transcript),
     recordReplay: normalizeString(rawOptions.recordReplay),
     recordReplayVariant: normalizeString(rawOptions.recordReplayVariant),
     experiment: normalizeString(rawOptions.experiment),
-    budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd),
+    budgetUsd: cliBudgetUsd,
+    cliBudgetUsd,
     sourceMetadataByEvalFile: normalizeSourceMetadataByEvalFile(
       rawOptions.sourceMetadataByEvalFile,
     ),
@@ -566,69 +568,33 @@ async function ensureFileExists(filePath: string, description: string): Promise<
 
 function buildDefaultOutputPathForExperiment(
   cwd: string,
-  experiment: string | undefined,
+  resultGroup: string | undefined, // passed through to buildDefaultRunDirFromName
   runDirName: string,
 ): string {
-  const runDir = buildDefaultRunDirFromName(cwd, experiment, runDirName);
+  const runDir = buildDefaultRunDirFromName(cwd, resultGroup, runDirName);
   mkdirSync(runDir, { recursive: true });
   return path.join(runDir, 'index.jsonl');
 }
 
-function normalizeTsDefaultExperiment(
-  config: Awaited<ReturnType<typeof loadTsConfig>> | null,
-): string | undefined {
+function deriveEvalResultGroupName(evalFilePath: string | undefined): string {
+  if (!evalFilePath) {
+    return 'eval'; // undefined or empty path -> generic group name
+  }
   return (
-    normalizeString(config?.experiments?.default) ?? normalizeString(config?.defaultExperiment)
+    path
+      .basename(evalFilePath)
+      .replace(/\.eval\.ya?ml$/i, '') // strip a trailing ".eval.yaml" / ".eval.yml" first
+      .replace(/\.ya?ml$/i, '') // then a bare ".yaml" / ".yml"
+      .replace(/[^A-Za-z0-9._-]/g, '-') || 'eval' // other chars become "-"; empty result -> 'eval'
   );
 }
 
 type ResolvedExperimentForRun = {
   readonly name?: string;
-  readonly config?: ExperimentConfig;
 };
 
-async function resolveExperimentForRun(params: {
-  readonly cwd: string;
-  readonly explicitExperiment?: string;
-  readonly yamlDefaultExperiment?: string;
-  readonly tsDefaultExperiment?: string;
-}): Promise<ResolvedExperimentForRun> {
-  const experimentRef =
-    params.explicitExperiment ?? params.yamlDefaultExperiment ?? params.tsDefaultExperiment;
-  if (!experimentRef) {
-    return {};
-  }
-
-  const experimentPath = resolveExperimentFilePath(params.cwd, experimentRef);
-  if (!experimentPath) {
-    if (isExperimentFileReference(experimentRef)) {
-      throw new Error(`Experiment file not found: ${experimentRef}`);
-    }
-    return { name: experimentRef };
-  }
-
-  const config = await loadExperimentConfig(experimentPath);
-  return {
-    name: config.name ?? deriveExperimentNameFromPath(experimentPath),
-    config,
-  };
-}
-
-function resolveExperimentFilePath(cwd: string, experimentRef: string): string | undefined {
-  if (isExperimentFileReference(experimentRef)) {
-    const experimentPath = path.isAbsolute(experimentRef)
-      ? experimentRef
-      : path.resolve(cwd, experimentRef);
-    return existsSync(experimentPath) ? experimentPath : undefined;
-  }
-
-  for (const ext of ['yaml', 'yml', 'ts', 'js', 'mts', 'mjs']) {
-    const candidate = path.resolve(cwd, 'experiments', `${experimentRef}.${ext}`);
-    if (existsSync(candidate)) {
-      return candidate;
-    }
-  }
-  return undefined;
+function resolveExperimentForRun(explicitExperiment?: string): ResolvedExperimentForRun {
+  return !explicitExperiment ? {} : { name: explicitExperiment }; // unset or '' -> no name
 }
 
 function applyExperimentOptions(
@@ -664,6 +630,7 @@ function applyExperimentOptions(
     workspaceMode: workspacePath ? 'static' : workspaceMode,
     workspacePath,
     budgetUsd: options.budgetUsd ?? experiment.budgetUsd,
+    threshold: options.threshold ?? experiment.threshold,
     experimentConfig: experiment,
     experimentMetadata: buildExperimentArtifactMetadata(experiment),
     experimentTargetRefs: options.cliTargets.length === 0 ? experimentTargetRefs : undefined,
@@ -715,212 +682,104 @@ function buildExperimentTrialsConfig(experiment: ExperimentConfig): TrialsConfig
   };
 }
 
-function readExperimentWorkspaceMode(value: unknown): 'pooled' | 'temp' | 'static' | undefined {
-  return value === 'pooled' || value === 'temp' || value === 'static' ? value : undefined;
-}
-
-function readExperimentWorkspacePath(
-  workspace: Record<string, unknown> | undefined,
-): string | undefined {
-  const value = workspace?.path;
-  return typeof value === 'string' && value.trim().length > 0 ? value.trim() : undefined;
-}
-
-type ExperimentSuiteSelection = {
-  readonly testFiles: readonly string[];
-  readonly filtersByEvalFile: ReadonlyMap<string, string | readonly string[]>;
+type EffectiveRunPolicy = { // run settings resolved for one test by resolveEffectiveRunPolicy
+  readonly trialsConfig?: TrialsConfig; // per-test repeat override, else the supplied default
+  readonly threshold?: number;
+  readonly timeoutSeconds?: number; // seconds, as the name says
+  readonly budgetUsd?: number;
+  readonly hasScopedOverride: boolean; // true when the test has a `run` block or its own threshold
 };
 
-function matchesTestFilter(id: string, filter: string | readonly string[]): boolean {
-  return typeof filter === 'string'
-    ? micromatch.isMatch(id, filter)
-    : filter.some((pattern) => micromatch.isMatch(id, pattern));
-}
-
-async function resolveExperimentSuiteSelection(
-  suites: ExperimentConfig['suites'] | undefined,
-  cwd: string,
-): Promise<ExperimentSuiteSelection | undefined> {
-  if (!suites || suites.length === 0) {
+function buildRunOverrideTrialsConfig(run: EvalRunOverride | undefined): TrialsConfig | undefined {
+  const repeat = run?.repeat; // optional per-test repeat settings
+  if (!repeat || repeat.count <= 1) { // NOTE(review): count <= 1 is treated as no override — confirm
     return undefined;
   }
-
-  const testFiles = new Set<string>();
-  const selectedTestIdsByEvalFile = new Map<string, string[]>();
-
-  for (const suite of suites) {
-    const resolvedSuiteFiles = await resolveEvalPaths([suite.ref], cwd);
-    for (const testFilePath of resolvedSuiteFiles) {
-      const resolvedPath = path.resolve(testFilePath);
-      testFiles.add(resolvedPath);
-      if (suite.select?.testIds && suite.select.testIds.length > 0) {
-        const existing = selectedTestIdsByEvalFile.get(resolvedPath) ?? [];
-        selectedTestIdsByEvalFile.set(resolvedPath, [...existing, ...suite.select.testIds]);
-      }
-    }
-  }
-
-  const filtersByEvalFile = new Map<string, string | readonly string[]>();
-  for (const [testFilePath, testIds] of selectedTestIdsByEvalFile.entries()) {
-    const uniqueTestIds = [...new Set(testIds)];
-    filtersByEvalFile.set(
-      testFilePath,
-      uniqueTestIds.length === 1 ? uniqueTestIds[0] : uniqueTestIds,
-    );
-  }
-
   return {
-    testFiles: [...testFiles],
-    filtersByEvalFile,
+    count: repeat.count,
+    strategy: repeat.strategy,
+    ...(repeat.costLimitUsd !== undefined && { costLimitUsd: repeat.costLimitUsd }), // only when set
+    ...(repeat.earlyExit !== undefined && { earlyExit: repeat.earlyExit }), // only when set
   };
 }
 
-async function runExperimentSteps(params: {
-  readonly label: 'setup' | 'script';
-  readonly steps: readonly ExperimentScript[] | undefined;
-  readonly cwd: string;
-  readonly experimentConfig?: ExperimentConfig;
-}): Promise<void> {
-  const steps = params.steps ?? [];
-  if (steps.length === 0) {
-    return;
-  }
-
-  for (let index = 0; index < steps.length; index++) {
-    const step = steps[index];
-    const command = buildExperimentStepCommand(step);
-    const cwd = resolveExperimentStepCwd(params.cwd, params.experimentConfig, step.cwd);
-    console.log(`Experiment ${params.label} ${index + 1}/${steps.length}: ${command.display}`);
-    await runExperimentCommand(command.argv, {
-      cwd,
-      env: step.env,
-      timeoutMs: step.timeoutSeconds ? step.timeoutSeconds * 1000 : undefined,
-      label: `experiment ${params.label}`,
-    });
-  }
+function resolveEffectiveRunPolicy(params: {
+  readonly test: EvalTest;
+  readonly options: NormalizedOptions;
+  readonly defaultTrialsConfig?: TrialsConfig;
+  readonly defaultThreshold?: number;
+  readonly defaultTimeoutSeconds?: number;
+  readonly defaultBudgetUsd?: number;
+}): EffectiveRunPolicy {
+  const { test, options } = params; // CLI value (where read) > test.run > passed-in default
+  const trialsConfig = buildRunOverrideTrialsConfig(test.run) ?? params.defaultTrialsConfig;
+  const budgetUsd = test.run?.budgetUsd ?? params.defaultBudgetUsd;
+  const timeoutSeconds =
+    options.cliAgentTimeoutSeconds ?? test.run?.timeoutSeconds ?? params.defaultTimeoutSeconds;
+  const threshold =
+    options.cliThreshold ?? test.run?.threshold ?? test.threshold ?? params.defaultThreshold;
+  return {
+    ...(trialsConfig === undefined ? {} : { trialsConfig }),
+    ...(threshold === undefined ? {} : { threshold }),
+    ...(timeoutSeconds === undefined ? {} : { timeoutSeconds }),
+    ...(budgetUsd === undefined ? {} : { budgetUsd }),
+    hasScopedOverride: test.run !== undefined || test.threshold !== undefined,
+  };
 }
 
-async function runExperimentSetup(params: {
-  readonly config: ExperimentConfig | undefined;
-  readonly cwd: string;
-  readonly runDir: string;
-}): Promise<void> {
-  const setup = params.config?.setup;
-  if (typeof setup === 'function') {
-    console.log('Experiment setup: running TypeScript setup()');
-    await setup({
-      cwd: params.cwd,
-      runDir: params.runDir,
-      experiment: params.config,
-      env: process.env,
-    });
-    return;
-  }
-  await runExperimentSteps({
-    label: 'setup',
-    steps: setup,
-    cwd: params.cwd,
-    experimentConfig: params.config,
+function runPolicyKey(policy: EffectiveRunPolicy): string { // string key for grouping tests
+  return JSON.stringify({
+    trialsConfig: policy.trialsConfig,
+    threshold: policy.threshold,
+    timeoutSeconds: policy.timeoutSeconds,
+    budgetUsd: policy.budgetUsd, // hasScopedOverride is not part of the key
   });
 }
 
-function buildExperimentStepCommand(step: ExperimentScript): {
-  readonly argv: readonly string[];
-  readonly display: string;
-} {
-  if (step.command && step.command.length > 0) {
-    return { argv: step.command, display: step.command.join(' ') };
-  }
-  if (typeof step.script === 'string' && step.script.trim().length > 0) {
-    return {
-      argv: shellCommand(step.script),
-      display: step.script,
-    };
-  }
-  if (Array.isArray(step.script) && step.script.length > 0) {
-    return { argv: step.script, display: step.script.join(' ') };
+/**
+ * Buckets tests by effective run policy.
+ *
+ * Tests whose policies serialize to the same `runPolicyKey` share a bucket. Buckets come back
+ * in first-seen order, each carrying the policy resolved for its first test.
+ */
+function groupTestsByRunPolicy(params: {
+  readonly tests: readonly EvalTest[];
+  readonly options: NormalizedOptions;
+  readonly defaultTrialsConfig?: TrialsConfig;
+  readonly defaultThreshold?: number;
+  readonly defaultTimeoutSeconds?: number;
+  readonly defaultBudgetUsd?: number;
+}): readonly { readonly policy: EffectiveRunPolicy; readonly tests: readonly EvalTest[] }[] {
+  const { tests, ...policyInputs } = params;
+  const buckets = new Map<string, { policy: EffectiveRunPolicy; tests: EvalTest[] }>();
+  for (const test of tests) {
+    const policy = resolveEffectiveRunPolicy({ test, ...policyInputs });
+    const bucketKey = runPolicyKey(policy);
+    let bucket = buckets.get(bucketKey);
+    if (bucket === undefined) {
+      bucket = { policy, tests: [] };
+      buckets.set(bucketKey, bucket);
+    }
+    bucket.tests.push(test);
   }
-  throw new Error('Experiment step must define command or script.');
+  return Array.from(buckets.values());
 }
 
-function shellCommand(script: string): readonly string[] {
-  return process.platform === 'win32' ? ['cmd', '/c', script] : ['sh', '-c', script];
+function readExperimentWorkspaceMode(value: unknown): 'pooled' | 'temp' | 'static' | undefined {
+  return value === 'static' || value === 'temp' || value === 'pooled' ? value : undefined;
 }
 
-function resolveExperimentStepCwd(
-  cwd: string,
-  experimentConfig: ExperimentConfig | undefined,
-  stepCwd: string | undefined,
-): string {
-  const base = experimentConfig?.sourcePath ? path.dirname(experimentConfig.sourcePath) : cwd;
-  if (!stepCwd) {
-    return base;
-  }
-  return path.isAbsolute(stepCwd) ? stepCwd : path.resolve(base, stepCwd);
+function readExperimentWorkspacePath(
+  workspace: Record<string, unknown> | undefined,
+): string | undefined {
+  const candidate = workspace?.path;
+  return typeof candidate !== 'string' || candidate.trim() === '' ? undefined : candidate.trim();
 }
 
-async function runExperimentCommand(
-  argv: readonly string[],
-  options: {
-    readonly cwd: string;
-    readonly env?: Record<string, string>;
-    readonly timeoutMs?: number;
-    readonly label: string;
-  },
-): Promise<void> {
-  if (argv.length === 0) {
-    throw new Error(`${options.label} command must not be empty.`);
-  }
-
-  await new Promise<void>((resolve, reject) => {
-    const cmd = argv[0];
-    if (!cmd) {
-      reject(new Error(`${options.label} command must not be empty.`));
-      return;
-    }
-    const args = argv.slice(1);
-    const child = spawn(cmd, args, {
-      cwd: options.cwd,
-      env: options.env ? { ...process.env, ...options.env } : process.env,
-      stdio: 'inherit',
-    });
-    let completed = false;
-    const timeout =
-      options.timeoutMs !== undefined
-        ? setTimeout(() => {
-            if (!completed) {
-              completed = true;
-              child.kill('SIGKILL');
-              reject(new Error(`${options.label} timed out after ${options.timeoutMs}ms`));
-            }
-          }, options.timeoutMs)
-        : undefined;
-
-    child.on('error', (error) => {
-      if (completed) {
-        return;
-      }
-      completed = true;
-      if (timeout !== undefined) {
-        clearTimeout(timeout);
-      }
-      reject(error);
-    });
-    child.on('exit', (code) => {
-      if (completed) {
-        return;
-      }
-      completed = true;
-      if (timeout !== undefined) {
-        clearTimeout(timeout);
-      }
-      if (code === 0) {
-        resolve();
-      } else {
-        reject(new Error(`${options.label} exited with code ${code ?? 'unknown'}`));
-      }
-    });
-  });
+function matchesTestFilter(id: string, filter: string | readonly string[]): boolean {
+  // A lone pattern is handled as a one-element list; the id passes if any glob matches it.
+  const patterns = typeof filter === 'string' ? [filter] : filter;
+  return patterns.some((glob) => micromatch.isMatch(id, glob));
 }
 
 type ProgressReporter = {
@@ -1033,6 +892,7 @@ async function prepareFileMetadata(params: {
   readonly options: NormalizedOptions;
   readonly suiteFilter?: string | readonly string[];
 }): Promise<{
+  readonly options: NormalizedOptions;
   readonly testIds: readonly string[];
   readonly testCases: readonly EvalTest[];
   readonly selections: readonly { selection: TargetSelection; inlineTargetLabel: string }[];
@@ -1066,24 +926,32 @@ async function prepareFileMetadata(params: {
     filter: suiteFilter ?? options.filter,
     category,
   });
+  const effectiveOptions = applyExperimentOptions(options, suite.experimentConfig);
   const testCases =
-    suiteFilter && options.filter
-      ? suite.tests.filter((testCase) => matchesTestFilter(testCase.id, options.filter ?? ''))
+    suiteFilter && effectiveOptions.filter
+      ? suite.tests.filter((testCase) =>
+          matchesTestFilter(testCase.id, effectiveOptions.filter ?? ''),
+        )
       : suite.tests;
   const testIds = testCases.map((value) => value.id);
   const suiteTargets = suite.targets;
+  const defaultBudgetUsd =
+    effectiveOptions.cliBudgetUsd === undefined
+      ? (effectiveOptions.budgetUsd ?? suite.budgetUsd)
+      : suite.budgetUsd;
 
   if (testCases.length === 0) {
     return {
+      options: effectiveOptions,
       testIds,
       testCases,
       selections: [],
-      trialsConfig: options.experimentTrialsConfig,
+      trialsConfig: effectiveOptions.experimentTrialsConfig,
       suiteTargets,
       yamlWorkers: suite.workers,
       yamlCache: suite.cacheConfig?.enabled,
       yamlCachePath: suite.cacheConfig?.cachePath,
-      budgetUsd: suite.budgetUsd,
+      budgetUsd: defaultBudgetUsd,
       failOnError: suite.failOnError,
       threshold: suite.threshold,
       tags: suite.metadata?.tags,
@@ -1093,7 +961,7 @@ async function prepareFileMetadata(params: {
 
   let selections: { selection: TargetSelection; inlineTargetLabel: string }[];
 
-  if (options.transcript) {
+  if (effectiveOptions.transcript) {
     // --transcript mode: bypass target resolution entirely.
     // Create a synthetic TargetSelection for the transcript provider.
     const transcriptSelection: TargetSelection = {
@@ -1105,15 +973,15 @@ async function prepareFileMetadata(params: {
       },
       targetName: 'transcript',
       targetSource: 'cli',
-      targetsFilePath: options.transcript,
+      targetsFilePath: effectiveOptions.transcript,
     };
     selections = [
       {
         selection: transcriptSelection,
-        inlineTargetLabel: `transcript (${path.basename(options.transcript)})`,
+        inlineTargetLabel: `transcript (${path.basename(effectiveOptions.transcript)})`,
       },
     ];
-  } else if (suite.inlineTarget && options.cliTargets.length === 0) {
+  } else if (suite.inlineTarget && effectiveOptions.cliTargets.length === 0) {
     const targetDefinition = suite.inlineTarget;
     const resolvedTarget = options.dryRun
       ? ({
@@ -1144,7 +1012,7 @@ async function prepareFileMetadata(params: {
         inlineTargetLabel: resolveTargetLabel(targetDefinition.name, resolvedTarget.name),
       },
     ];
-  } else if (suite.providerFactory && options.cliTargets.length === 0) {
+  } else if (suite.providerFactory && effectiveOptions.cliTargets.length === 0) {
     const taskTarget: ResolvedTarget = {
       kind: 'mock',
       name: 'custom-task',
@@ -1165,10 +1033,10 @@ async function prepareFileMetadata(params: {
     ];
   } else {
     // Determine target names: CLI --target flags override YAML
-    const cliTargets = options.cliTargets;
+    const cliTargets = effectiveOptions.cliTargets;
     const suiteTargets = suite.targets;
     const suiteTargetRefs = suite.targetRefs;
-    const experimentTargetRefs = options.experimentTargetRefs;
+    const experimentTargetRefs = effectiveOptions.experimentTargetRefs;
 
     // Resolve which target names to use (precedence: CLI/experiment > suite YAML targets > default)
     let targetNames: readonly string[];
@@ -1190,11 +1058,11 @@ async function prepareFileMetadata(params: {
         testFilePath,
         repoRoot,
         cwd,
-        explicitTargetsPath: options.targetsPath,
-        dryRun: options.dryRun,
-        dryRunDelay: options.dryRunDelay,
-        dryRunDelayMin: options.dryRunDelayMin,
-        dryRunDelayMax: options.dryRunDelayMax,
+        explicitTargetsPath: effectiveOptions.targetsPath,
+        dryRun: effectiveOptions.dryRun,
+        dryRunDelay: effectiveOptions.dryRunDelay,
+        dryRunDelayMin: effectiveOptions.dryRunDelayMin,
+        dryRunDelayMax: effectiveOptions.dryRunDelayMax,
         env: process.env,
         targetNames,
         targetRefs,
@@ -1210,12 +1078,12 @@ async function prepareFileMetadata(params: {
         testFilePath,
         repoRoot,
         cwd,
-        explicitTargetsPath: options.targetsPath,
-        cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
-        dryRun: options.dryRun,
-        dryRunDelay: options.dryRunDelay,
-        dryRunDelayMin: options.dryRunDelayMin,
-        dryRunDelayMax: options.dryRunDelayMax,
+        explicitTargetsPath: effectiveOptions.targetsPath,
+        cliTargetName: targetNames.length === 1 ? targetNames[0] : effectiveOptions.target,
+        dryRun: effectiveOptions.dryRun,
+        dryRunDelay: effectiveOptions.dryRunDelay,
+        dryRunDelayMin: effectiveOptions.dryRunDelayMin,
+        dryRunDelayMax: effectiveOptions.dryRunDelayMax,
         env: process.env,
       });
 
@@ -1238,15 +1106,16 @@ async function prepareFileMetadata(params: {
   }
 
   return {
+    options: effectiveOptions,
     testIds,
     testCases,
     selections,
-    trialsConfig: options.experimentTrialsConfig,
+    trialsConfig: effectiveOptions.experimentTrialsConfig,
     suiteTargets,
     yamlWorkers: suite.workers,
     yamlCache: suite.cacheConfig?.enabled,
     yamlCachePath: suite.cacheConfig?.cachePath,
-    budgetUsd: suite.budgetUsd,
+    budgetUsd: defaultBudgetUsd,
     failOnError: suite.failOnError,
     threshold: suite.threshold,
     tags: suite.metadata?.tags,
@@ -1293,6 +1162,7 @@ async function runSingleEvalFile(params: {
   readonly inlineTargetLabel: string;
   readonly testCases: readonly EvalTest[];
   readonly trialsConfig?: TrialsConfig;
+  readonly agentTimeoutSeconds?: number;
   readonly matrixMode?: boolean;
   readonly budgetUsd?: number;
   readonly runBudgetTracker?: RunBudgetTracker;
@@ -1320,6 +1190,7 @@ async function runSingleEvalFile(params: {
     inlineTargetLabel,
     testCases,
     trialsConfig,
+    agentTimeoutSeconds,
     matrixMode,
     budgetUsd,
     runBudgetTracker,
@@ -1361,9 +1232,7 @@ async function runSingleEvalFile(params: {
   }
 
   const agentTimeoutMs =
-    options.agentTimeoutSeconds != null
-      ? Math.max(0, options.agentTimeoutSeconds) * 1000
-      : undefined;
+    agentTimeoutSeconds != null ? Math.max(0, agentTimeoutSeconds) * 1000 : undefined;
 
   // Resolve workers: CLI flag > eval YAML execution.workers > target setting > default
   const workerPreference = workersOverride ?? options.workers;
@@ -1440,7 +1309,7 @@ async function runSingleEvalFile(params: {
     failOnError,
     graderTarget: options.graderTarget,
     model: options.model,
-    threshold: options.threshold,
+    threshold: params.threshold,
     targetHooks: resolvedTargetSelection.targetHooks,
     replayRecording,
     providerFactory,
@@ -1562,38 +1431,31 @@ export async function runEvalCommand(
   }
 
   let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
-  const resolvedExperiment = await resolveExperimentForRun({
-    cwd,
-    explicitExperiment: options.experiment,
-    yamlDefaultExperiment: resolveDefaultExperimentReference(yamlConfig),
-    tsDefaultExperiment: normalizeTsDefaultExperiment(config),
-  });
-  options = {
-    ...applyExperimentOptions(options, resolvedExperiment.config),
-    experiment: resolvedExperiment.name,
-  };
-
-  const suiteSelection = await resolveExperimentSuiteSelection(
-    options.experimentConfig?.suites,
-    cwd,
-  );
-  const evalPathInputs =
-    input.testFiles.length > 0
-      ? [...input.testFiles]
-      : suiteSelection
-        ? [...suiteSelection.testFiles]
-        : [];
+  const resolvedExperiment = resolveExperimentForRun(options.experiment);
+  const evalPathInputs = input.testFiles.length > 0 ? [...input.testFiles] : [];
   if (evalPathInputs.length === 0 && process.stdin.isTTY) {
     const { launchInteractiveWizard } = await import('./interactive.js');
     await launchInteractiveWizard();
     return undefined;
   }
   const resolvedTestFiles = await resolveEvalPaths(evalPathInputs, cwd);
+  const fallbackResultGroupName =
+    resolvedTestFiles.length === 1 ? deriveEvalResultGroupName(resolvedTestFiles[0]) : 'multi-eval';
+  const primarySuite =
+    resolvedTestFiles.length > 0
+      ? await loadTestSuite(resolvedTestFiles[0], repoRoot, {
+          verbose: options.verbose,
+          filter: options.filter,
+          category: deriveCategory(path.relative(cwd, resolvedTestFiles[0])),
+        })
+      : undefined;
+  const resultGroupName =
+    resolvedTestFiles.length === 1
+      ? (primarySuite?.metadata?.name ?? fallbackResultGroupName)
+      : fallbackResultGroupName;
   options = {
     ...options,
-    ...(suiteSelection !== undefined && {
-      suiteFiltersByEvalFile: suiteSelection.filtersByEvalFile,
-    }),
+    experiment: resolvedExperiment.name ?? resultGroupName,
   };
 
   if (!process.env.AGENTV_EXPERIMENT) {
@@ -1732,8 +1594,8 @@ export async function runEvalCommand(
     mkdirSync(runDir, { recursive: true });
     outputPath = path.join(runDir, 'index.jsonl');
   } else {
-    // Default: .agentv/results/<experiment>/<timestamp>/, using "default" when unspecified.
-    outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment, runDirName);
+    // Default: .agentv/results/<eval-name>/<timestamp>/ ("multi-eval" for multi-file runs).
+    outputPath = buildDefaultOutputPathForExperiment(cwd, resultGroupName, runDirName);
     runDir = path.dirname(outputPath);
   }
   if (!process.env.AGENTV_RUN_TIMESTAMP) {
@@ -1811,12 +1673,6 @@ export async function runEvalCommand(
 
   console.log(`Artifact directory: ${runDir}`);
 
-  await runExperimentSetup({
-    config: options.experimentConfig,
-    cwd,
-    runDir,
-  });
-
   // Log file export paths
   if (options.otelFile) {
     console.log(`OTLP JSON file: ${path.resolve(options.otelFile)}`);
@@ -1830,17 +1686,19 @@ export async function runEvalCommand(
   const seenTestCases = new Set<string>();
   const displayIdTracker = createDisplayIdTracker();
 
-  // Run-level budget tracker: caps total cost across all eval files in this run.
-  const runBudgetTracker = options.budgetUsd ? new RunBudgetTracker(options.budgetUsd) : undefined;
+  // CLI --budget-usd is invocation-wide. Inline experiment.budget_usd is handled per eval file.
+  const runBudgetTracker = options.cliBudgetUsd
+    ? new RunBudgetTracker(options.cliBudgetUsd)
+    : undefined;
   if (runBudgetTracker) {
     console.log(`Run budget cap: $${runBudgetTracker.budgetCapUsd.toFixed(2)}`);
   }
 
-  // Each file gets the full worker budget — no splitting across files
-  const perFileWorkers = options.workers;
+  // Each file gets its own worker policy from CLI/config or that file's experiment block.
   const fileMetadata = new Map<
     string,
     {
+      readonly options: NormalizedOptions;
       readonly testIds: readonly string[];
       readonly testCases: readonly EvalTest[];
       readonly selections: readonly {
@@ -1867,7 +1725,7 @@ export async function runEvalCommand(
       repoRoot,
       cwd,
       options,
-      suiteFilter: options.suiteFiltersByEvalFile?.get(path.resolve(testFilePath)),
+      suiteFilter: undefined,
     });
     fileMetadata.set(testFilePath, meta);
   }
@@ -1916,7 +1774,9 @@ export async function runEvalCommand(
     console.log(`Replay recording: ${path.resolve(options.recordReplay)}`);
   }
 
-  // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
+  // Resolve a global summary threshold only when the CLI supplies one or the first
+  // active eval file is the only source of runtime policy. Multi-file runs with
+  // inline thresholds are summarized from per-result execution status instead.
   const yamlThreshold = firstMeta?.threshold;
   const resolvedThreshold = options.threshold ?? yamlThreshold;
   if (resolvedThreshold !== undefined && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
@@ -2025,6 +1885,14 @@ export async function runEvalCommand(
 
   // Use only files that survived tag filtering.
   const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
+  const singleActiveFileMetadata =
+    activeTestFiles.length === 1 ? fileMetadata.get(activeTestFiles[0]) : undefined;
+  const runExperimentMetadata = singleActiveFileMetadata?.options.experimentMetadata;
+  const hasPerFileRuntimeThresholds =
+    options.cliThreshold === undefined &&
+    activeTestFiles.some(
+      (activeTestFile) => fileMetadata.get(activeTestFile)?.options.threshold !== undefined,
+    );
 
   // --transcript: create a shared TranscriptProvider and validate entry count
   let transcriptProviderFactory:
@@ -2063,7 +1931,7 @@ export async function runEvalCommand(
       evalFile,
       plannedTestCount: totalEvalCount,
       experiment: normalizeExperimentName(options.experiment),
-      experimentMetadata: options.experimentMetadata,
+      experimentMetadata: runExperimentMetadata,
     });
   }
 
@@ -2092,13 +1960,22 @@ export async function runEvalCommand(
   // Eval files run sequentially; within each file, --workers N test cases run in parallel.
   // This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file
   // workspace races without any grouping complexity.
+  let hasScopedRunPolicies = false;
   try {
     for (const testFilePath of activeTestFiles) {
+      const targetPrep = fileMetadata.get(testFilePath);
+      if (!targetPrep) {
+        throw new Error(`Missing metadata for ${testFilePath}`);
+      }
+      const fileOptions = targetPrep.options;
+      const fileBudgetTracker =
+        runBudgetTracker ??
+        (fileOptions.budgetUsd !== undefined
+          ? new RunBudgetTracker(fileOptions.budgetUsd)
+          : undefined);
       // Run-level budget check: skip remaining files if budget exceeded
-      if (runBudgetTracker?.isExceeded()) {
-        const targetPrep = fileMetadata.get(testFilePath);
-        if (!targetPrep) continue;
-        const budgetMsg = `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`;
+      if (fileBudgetTracker?.isExceeded()) {
+        const budgetMsg = `Run budget exceeded ($${fileBudgetTracker.currentCostUsd.toFixed(4)} / $${fileBudgetTracker.budgetCapUsd.toFixed(4)})`;
         console.log(`\n⚠ ${budgetMsg} — skipping ${path.basename(testFilePath)}`);
         for (const { selection } of targetPrep.selections) {
           const skippedResults: EvaluationResult[] = targetPrep.testCases.map((testCase) => ({
@@ -2125,20 +2002,15 @@ export async function runEvalCommand(
             target: selection.targetName,
           }));
           for (const r of skippedResults) {
-            await outputWriter.append(withSourceMetadata(r, testFilePath, options));
+            await outputWriter.append(withSourceMetadata(r, testFilePath, fileOptions));
           }
           allResults.push(
-            ...skippedResults.map((r) => withSourceMetadata(r, testFilePath, options)),
+            ...skippedResults.map((r) => withSourceMetadata(r, testFilePath, fileOptions)),
           );
         }
         continue;
       }
 
-      const targetPrep = fileMetadata.get(testFilePath);
-      if (!targetPrep) {
-        throw new Error(`Missing metadata for ${testFilePath}`);
-      }
-
       // Run all targets concurrently (each target has its own worker limit)
       const targetResults = await Promise.all(
         targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
@@ -2166,45 +2038,59 @@ export async function runEvalCommand(
           }
 
           try {
-            const result = await runSingleEvalFile({
-              testFilePath,
-              cwd,
-              repoRoot,
-              options,
-              outputWriter,
-              otelExporter,
-              cache,
-              evaluationRunner,
-              workersOverride: perFileWorkers,
-              yamlWorkers: targetPrep.yamlWorkers,
-              progressReporter,
-              seenTestCases,
-              displayIdTracker,
-              selection,
-              inlineTargetLabel,
-              testCases: filteredTestCases,
-              trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
-              matrixMode: targetPrep.selections.length > 1,
-              budgetUsd: targetPrep.budgetUsd,
-              runBudgetTracker,
-              failOnError: targetPrep.failOnError,
-              threshold: resolvedThreshold,
-              providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory,
+            const runGroups = groupTestsByRunPolicy({
+              tests: filteredTestCases,
+              options: fileOptions,
+              defaultTrialsConfig: fileOptions.transcript ? undefined : targetPrep.trialsConfig,
+              defaultThreshold: fileOptions.threshold ?? targetPrep.threshold,
+              defaultTimeoutSeconds: fileOptions.agentTimeoutSeconds,
+              defaultBudgetUsd: targetPrep.budgetUsd,
             });
+            const groupResults: EvaluationResult[] = [];
+            for (const group of runGroups) {
+              hasScopedRunPolicies ||= group.policy.hasScopedOverride;
+              const result = await runSingleEvalFile({
+                testFilePath,
+                cwd,
+                repoRoot,
+                options: fileOptions,
+                outputWriter,
+                otelExporter,
+                cache,
+                evaluationRunner,
+                workersOverride: fileOptions.workers,
+                yamlWorkers: targetPrep.yamlWorkers,
+                progressReporter,
+                seenTestCases,
+                displayIdTracker,
+                selection,
+                inlineTargetLabel,
+                testCases: group.tests,
+                trialsConfig: fileOptions.transcript ? undefined : group.policy.trialsConfig,
+                agentTimeoutSeconds: group.policy.timeoutSeconds,
+                matrixMode: targetPrep.selections.length > 1,
+                budgetUsd: group.policy.budgetUsd,
+                runBudgetTracker: fileBudgetTracker,
+                failOnError: targetPrep.failOnError,
+                threshold: group.policy.threshold,
+                providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory,
+              });
+              groupResults.push(...result.results);
+            }
             const evalFile = path.relative(cwd, testFilePath);
             const existingSummary = remoteEvalSummaries.find(
               (summary) => summary.evalFile === evalFile,
             );
             if (existingSummary) {
-              existingSummary.results.push(...result.results);
+              existingSummary.results.push(...groupResults);
             } else {
               remoteEvalSummaries.push({
                 evalFile,
-                results: [...result.results],
+                results: [...groupResults],
               });
             }
 
-            return result.results;
+            return groupResults;
           } catch (fileError) {
             // before_all or other setup failures should not abort the entire run.
             // Mark all tests in this file as errors and continue with other files.
@@ -2239,7 +2125,7 @@ export async function runEvalCommand(
                   target: selection.targetName,
                 },
                 testFilePath,
-                options,
+                fileOptions,
               ),
             );
             for (const errResult of errorResults) {
@@ -2278,7 +2164,11 @@ export async function runEvalCommand(
     }
 
     const thresholdOpts =
-      resolvedThreshold !== undefined ? { threshold: resolvedThreshold } : undefined;
+      hasScopedRunPolicies || hasPerFileRuntimeThresholds
+        ? { thresholdLabel: 'configured threshold(s)', useExecutionStatus: true }
+        : resolvedThreshold !== undefined
+          ? { threshold: resolvedThreshold }
+          : undefined;
     const summary = calculateEvaluationSummary(summaryResults, thresholdOpts);
     console.log(formatEvaluationSummary(summary, thresholdOpts));
     if (
@@ -2292,7 +2182,9 @@ export async function runEvalCommand(
     // Exit code: 2 when all tests are execution errors (no evaluation performed),
     // 1 when any test scored below threshold.
     const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
-    const thresholdFailed = resolvedThreshold !== undefined && summary.qualityFailureCount > 0;
+    const thresholdFailed =
+      (thresholdOpts?.useExecutionStatus === true || resolvedThreshold !== undefined) &&
+      summary.qualityFailureCount > 0;
 
     // Print matrix summary when multiple targets were evaluated
     if (isMatrixMode && summaryResults.length > 0) {
@@ -2312,6 +2204,7 @@ export async function runEvalCommand(
         const { writePerTestArtifacts } = await import('./artifact-writer.js');
         await writePerTestArtifacts(allResults, runDir, {
           experiment: normalizeExperimentName(options.experiment),
+          resultGroup: resultGroupName,
           cwd,
           repoRoot,
           sourceTests,
@@ -2320,7 +2213,7 @@ export async function runEvalCommand(
         const { summaryPath } = await aggregateRunDir(runDir, {
           evalFile,
           experiment: normalizeExperimentName(options.experiment),
-          experimentMetadata: options.experimentMetadata,
+          experimentMetadata: runExperimentMetadata,
         });
         const indexPath = path.join(runDir, 'index.jsonl');
         console.log(`Artifact workspace updated: ${runDir}`);
@@ -2334,7 +2227,8 @@ export async function runEvalCommand(
           {
             evalFile,
             experiment: normalizeExperimentName(options.experiment),
-            experimentMetadata: options.experimentMetadata,
+            experimentMetadata: runExperimentMetadata,
+            resultGroup: resultGroupName,
             cwd,
             repoRoot,
             sourceTests,
@@ -2437,13 +2331,6 @@ export async function runEvalCommand(
       await wipLoop.stopAndDeleteWipBranch();
     }
 
-    await runExperimentSteps({
-      label: 'script',
-      steps: options.experimentConfig?.scripts,
-      cwd,
-      experimentConfig: options.experimentConfig,
-    });
-
     return {
       executionErrorCount: summary.executionErrorCount,
       outputPath,
diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts
index 54d6d373c..13d64f508 100644
--- a/apps/cli/src/commands/eval/statistics.ts
+++ b/apps/cli/src/commands/eval/statistics.ts
@@ -84,7 +84,7 @@ function buildHistogram(values: readonly number[]): readonly HistogramBin[] {
 
 export function calculateEvaluationSummary(
   results: readonly EvaluationResult[],
-  options?: { threshold?: number },
+  options?: { threshold?: number; thresholdLabel?: string; useExecutionStatus?: boolean },
 ): EvaluationSummary {
   const total = results.length;
 
@@ -139,11 +139,11 @@ export function calculateEvaluationSummary(
   const executionErrorCount = executionErrors.length;
   const scoreThreshold = options?.threshold;
   const passedCount =
-    scoreThreshold !== undefined
+    scoreThreshold !== undefined && options?.useExecutionStatus !== true
       ? qualityResults.filter((r) => r.score >= scoreThreshold).length
       : results.filter((r) => r.executionStatus === 'ok').length;
   const qualityFailureCount =
-    scoreThreshold !== undefined
+    scoreThreshold !== undefined && options?.useExecutionStatus !== true
       ? qualityResults.filter((r) => r.score < scoreThreshold).length
       : results.filter((r) => r.executionStatus === 'quality_failure').length;
 
@@ -186,7 +186,7 @@ function formatScore(value: number): string {
 
 export function formatEvaluationSummary(
   summary: EvaluationSummary,
-  options?: { threshold?: number },
+  options?: { threshold?: number; thresholdLabel?: string; useExecutionStatus?: boolean },
 ): string {
   if (summary.total === 0) {
     return '\nNo results to summarize';
@@ -209,6 +209,7 @@ export function formatEvaluationSummary(
   // Overall verdict: all non-error cases must score >= per-test threshold.
   const gradedCount = summary.total - summary.executionErrorCount;
   const threshold = options?.threshold ?? 0.8;
+  const thresholdText = options?.thresholdLabel ?? `${Math.round(threshold * 100)}%`;
   const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
   const overallPassed =
     !allExecutionErrors &&
@@ -226,7 +227,7 @@ export function formatEvaluationSummary(
   } else {
     overallVerdict = overallPassed ? 'PASS' : 'FAIL';
     verdictColor = overallPassed ? '\x1b[32m' : '\x1b[31m';
-    verdictText = `RESULT: ${overallVerdict}  (${summary.passedCount}/${summary.total} scored >= ${Math.round(threshold * 100)}%, mean: ${formatScore(summary.mean)})`;
+    verdictText = `RESULT: ${overallVerdict}  (${summary.passedCount}/${summary.total} scored >= ${thresholdText}, mean: ${formatScore(summary.mean)})`;
   }
 
   lines.push('\n==================================================');
diff --git a/apps/cli/src/commands/eval/task-bundle.ts b/apps/cli/src/commands/eval/task-bundle.ts
index a716eeda7..ae81fb721 100644
--- a/apps/cli/src/commands/eval/task-bundle.ts
+++ b/apps/cli/src/commands/eval/task-bundle.ts
@@ -736,6 +736,9 @@ function buildPortableEvalCase(
   if (test.metadata && Object.keys(test.metadata).length > 0) {
     testCase.metadata = rewritePathsDeep(test.metadata, rewrites);
   }
+  if (test.run && Object.keys(test.run).length > 0) {
+    testCase.run = rewritePathsDeep(test.run, rewrites);
+  }
   if (test.conversation_id) {
     testCase.conversation_id = test.conversation_id;
   }
diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts
index 6c9b67c1c..0bad4ce0f 100644
--- a/apps/cli/test/commands/eval/artifact-writer.test.ts
+++ b/apps/cli/test/commands/eval/artifact-writer.test.ts
@@ -190,6 +190,23 @@ describe('buildGradingArtifact', () => {
       passed_attempts: 1,
       total_attempts: 2,
     });
+
+    const passAll = buildGradingArtifact(
+      makeResult({
+        aggregation: {
+          strategy: 'pass_all',
+          passedAttempts: 1,
+          totalAttempts: 2,
+          min: 0.4,
+        },
+      }),
+    );
+    expect(passAll.aggregation).toEqual({
+      strategy: 'pass_all',
+      passed_attempts: 1,
+      total_attempts: 2,
+      min: 0.4,
+    });
   });
 
   it('uses top-level assertions when no grader scores', () => {
@@ -1726,6 +1743,80 @@ describe('writeArtifactsFromResults', () => {
     expect(indexLine.grading_path).toBe('eval-top-months-chart/shared-id/run-1/grading.json');
   });
 
+  it('does not prefix artifact paths with suite when it matches the result group', async () => {
+    const paths = await writeArtifactsFromResults(
+      [makeResult({ suite: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })],
+      testDir,
+      { resultGroup: 'eval-top-months-chart' },
+    );
+
+    const [indexLine] = (await readFile(paths.indexPath, 'utf8'))
+      .trim()
+      .split('\n')
+      .map(JSON.parse);
+    expect(indexLine.suite).toBe('eval-top-months-chart');
+    expect(indexLine.grading_path).toBe('shared-id/run-1/grading.json');
+  });
+
+  it('prefixes imported suite artifacts even when the suite matches the result group', async () => {
+    const sourceTests = [
+      {
+        id: 'shared-id',
+        suite: 'eval-top-months-chart',
+        source: {
+          evalFilePath: 'evals/imported.eval.yaml',
+          evalFileAbsolutePath: path.join(testDir, 'evals/imported.eval.yaml'),
+          importedSuiteName: 'eval-top-months-chart',
+          testId: 'shared-id',
+          testSnapshotYaml: 'id: shared-id',
+          graderDefinitions: [],
+          references: [],
+        },
+      } as EvalTest,
+    ];
+    const paths = await writeArtifactsFromResults(
+      [makeResult({ suite: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })],
+      testDir,
+      { resultGroup: 'eval-top-months-chart', sourceTests },
+    );
+
+    const [indexLine] = (await readFile(paths.indexPath, 'utf8'))
+      .trim()
+      .split('\n')
+      .map(JSON.parse);
+    expect(indexLine.grading_path).toBe('eval-top-months-chart/shared-id/run-1/grading.json');
+  });
+
+  it('uses the imported suite name for wrapper suite artifact paths', async () => {
+    const sourceTests = [
+      {
+        id: 'shared-id',
+        suite: 'wrapper-suite',
+        source: {
+          evalFilePath: 'evals/imported.eval.yaml',
+          evalFileAbsolutePath: path.join(testDir, 'evals/imported.eval.yaml'),
+          importedSuiteName: 'imported-suite',
+          testId: 'shared-id',
+          testSnapshotYaml: 'id: shared-id',
+          graderDefinitions: [],
+          references: [],
+        },
+      } as EvalTest,
+    ];
+    const paths = await writeArtifactsFromResults(
+      [makeResult({ suite: 'wrapper-suite', testId: 'shared-id', target: 'baseline' })],
+      testDir,
+      { resultGroup: 'wrapper-suite', sourceTests },
+    );
+
+    const [indexLine] = (await readFile(paths.indexPath, 'utf8'))
+      .trim()
+      .split('\n')
+      .map(JSON.parse);
+    expect(indexLine.artifact_dir).toBe('imported-suite/shared-id');
+    expect(indexLine.grading_path).toBe('imported-suite/shared-id/run-1/grading.json');
+  });
+
   it('writes task bundle artifacts with local source paths when source metadata is provided', async () => {
     const sourceRoot = path.join(testDir, 'src');
     await mkdir(sourceRoot, { recursive: true });
diff --git a/apps/cli/test/commands/eval/result-layout.test.ts b/apps/cli/test/commands/eval/result-layout.test.ts
index 97424c4c1..79dfd805d 100644
--- a/apps/cli/test/commands/eval/result-layout.test.ts
+++ b/apps/cli/test/commands/eval/result-layout.test.ts
@@ -9,7 +9,7 @@ import {
 } from '../../../src/commands/eval/result-layout.js';
 
 describe('result layout', () => {
-  it('groups default run directories under the default experiment', () => {
+  it('groups default run directories under the default result group', () => {
     const cwd = '/repo';
     const timestamp = new Date('2026-06-22T12:34:56.789Z');
 
@@ -18,7 +18,7 @@ describe('result layout', () => {
     );
   });
 
-  it('groups named experiment run directories under the experiment', () => {
+  it('groups named run directories under the result group', () => {
     expect(buildDefaultRunDirFromName('/repo', 'with-skills', '2026-run')).toBe(
       path.join('/repo', '.agentv', 'results', 'with-skills', '2026-run'),
     );
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index bf19b2b0e..484095448 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -408,8 +408,8 @@ describe('agentv eval CLI', () => {
       await expectFileExists(path.join(outputDir, 'summary.json'));
       for (const row of canonicalResults) {
         expect(row.transcript_path).toMatch(/run-1\/transcript\.jsonl$/);
-        expect(row.transcript_raw_path).toMatch(/run-1\/transcript-raw\.jsonl$/);
         await expectFileExists(path.join(outputDir, row.transcript_path as string));
+        expect(row.transcript_raw_path).toMatch(/run-1\/transcript-raw\.jsonl$/);
         await expectFileExists(path.join(outputDir, row.transcript_raw_path as string));
       }
     } finally {
@@ -521,11 +521,9 @@ describe('agentv eval CLI', () => {
     }
   }, 30_000);
 
-  it('runs a native experiment file with suite test selection and run knobs', async () => {
+  it('runs inline experiment config with suite test selection and run knobs', async () => {
     const fixture = await createFixture();
     try {
-      const experimentsDir = path.join(fixture.suiteDir, 'experiments');
-      await mkdir(experimentsDir, { recursive: true });
       await writeFile(
         path.join(fixture.suiteDir, '.agentv', 'config.yaml'),
         'eval_patterns:\n  - sample.test.yaml\n  - unused.test.yaml\n',
@@ -545,38 +543,40 @@ describe('agentv eval CLI', () => {
         ].join('\n'),
         'utf8',
       );
-      const experimentPath = path.join(experimentsDir, 'default.yaml');
+      const wrapperPath = path.join(fixture.suiteDir, 'native-exp.eval.yaml');
       await writeFile(
-        experimentPath,
+        wrapperPath,
         [
           'name: native-exp',
-          'target: cli-target',
-          'suites:',
-          '  - ref: sample.test.yaml',
-          '    select:',
-          '      test_ids:',
-          '        - case-alpha',
-          'timeout_seconds: 12',
-          'workers: 4',
-          'repeat:',
-          '  count: 2',
-          '  strategy: mean',
-          '  cost_limit_usd: 1.25',
-          'early_exit: false',
-          'setup:',
-          '  - script: "printf setup > ../experiment-setup.txt"',
-          'scripts:',
-          '  - script: "printf script > ../experiment-script.txt"',
+          'experiment:',
+          '  name: native-exp',
+          '  target: cli-target',
+          '  timeout_seconds: 12',
+          '  workers: 4',
+          '  threshold: 0.8',
+          '  budget_usd: 3',
+          '  repeat:',
+          '    count: 2',
+          '    strategy: mean',
+          '    cost_limit_usd: 1.25',
+          '  early_exit: false',
+          'tests:',
+          '  - include: sample.test.yaml',
+          '    type: suite',
+          '    select: case-alpha',
+          '    run:',
+          '      threshold: 1.0',
+          '      timeout_seconds: 5',
+          '      budget_usd: 0.75',
+          '      repeat:',
+          '        count: 3',
+          '        strategy: pass_all',
           '',
         ].join('\n'),
         'utf8',
       );
 
-      const { stdout, exitCode } = await runCli(fixture, [
-        'eval',
-        '--experiment',
-        'experiments/default.yaml',
-      ]);
+      const { stdout, exitCode } = await runCli(fixture, ['eval', wrapperPath]);
 
       expect(exitCode).toBe(0);
       const outputPath = extractOutputPath(stdout);
@@ -585,36 +585,24 @@ describe('agentv eval CLI', () => {
       const diagnostics = await readDiagnostics(fixture);
       expect(diagnostics).toMatchObject({
         target: 'cli-target',
-        agentTimeoutMs: 12000,
+        agentTimeoutMs: 5000,
         maxConcurrency: 4,
         evalCaseIds: ['case-alpha'],
+        budgetUsd: 0.75,
+        threshold: 1,
         trials: {
-          count: 2,
-          strategy: 'mean',
-          costLimitUsd: 1.25,
-          earlyExit: false,
+          count: 3,
+          strategy: 'pass_all',
         },
       });
 
-      await expectFileExists(path.join(fixture.suiteDir, 'experiment-setup.txt'));
-      await expectFileExists(path.join(fixture.suiteDir, 'experiment-script.txt'));
-
       const benchmark = JSON.parse(
         await readFile(path.join(path.dirname(outputPath), 'summary.json'), 'utf8'),
       ) as { metadata?: Record<string, unknown> };
       expect(benchmark.metadata?.experiment).toBe('native-exp');
       expect(benchmark.metadata?.experiment_config).toMatchObject({
         name: 'native-exp',
-        source_path: experimentPath,
         target: 'cli-target',
-        suites: [
-          {
-            ref: 'sample.test.yaml',
-            select: {
-              test_ids: ['case-alpha'],
-            },
-          },
-        ],
         repeat: {
           count: 2,
           strategy: 'mean',
@@ -632,6 +620,76 @@ describe('agentv eval CLI', () => {
     }
   }, 30_000);
 
+  it('keeps inline experiment runtime isolated across multiple eval files', async () => {
+    const fixture = await createFixture();
+    try {
+      const firstPath = path.join(fixture.suiteDir, 'first.eval.yaml');
+      const secondPath = path.join(fixture.suiteDir, 'second.eval.yaml');
+      await writeFile(
+        firstPath,
+        [
+          'name: first',
+          'experiment:',
+          '  target: cli-target',
+          '  timeout_seconds: 11',
+          '  workers: 1',
+          '  budget_usd: 0.11',
+          'tests:',
+          '  - id: first-case',
+          '    input: first',
+          '    criteria: ok',
+          '',
+        ].join('\n'),
+        'utf8',
+      );
+      await writeFile(
+        secondPath,
+        [
+          'name: second',
+          'experiment:',
+          '  target: file-target',
+          '  timeout_seconds: 22',
+          '  workers: 2',
+          '  budget_usd: 0.22',
+          'tests:',
+          '  - id: second-case',
+          '    input: second',
+          '    criteria: ok',
+          '',
+        ].join('\n'),
+        'utf8',
+      );
+
+      const { stdout, exitCode } = await runCli(fixture, ['eval', firstPath, secondPath]);
+
+      expect(exitCode).toBe(0);
+      const outputPath = extractOutputPath(stdout);
+      expect(outputPath).toContain(`${path.sep}multi-eval${path.sep}`);
+
+      const diagnostics = await readDiagnostics(fixture);
+      const calls = diagnostics.calls as Array<Record<string, unknown>>;
+      expect(calls).toHaveLength(2);
+      expect(calls[0]).toMatchObject({
+        target: 'cli-target',
+        agentTimeoutMs: 11_000,
+        maxConcurrency: 1,
+        budgetUsd: 0.11,
+        runBudgetCapUsd: 0.11,
+        evalCaseIds: ['first-case'],
+      });
+      expect(calls[1]).toMatchObject({
+        target: 'file-target',
+        agentTimeoutMs: 22_000,
+        maxConcurrency: 2,
+        budgetUsd: 0.22,
+        runBudgetCapUsd: 0.22,
+        evalCaseIds: ['second-case'],
+      });
+    } finally {
+      await rm(fixture.baseDir, { recursive: true, force: true });
+    }
+  }, 30_000);
+
   it('honors agentv.config.ts cache.path when response cache is enabled there', async () => {
     const fixture = await createFixture();
     try {
diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts
index 32162888e..b7ce3515f 100644
--- a/apps/cli/test/fixtures/mock-run-evaluation.ts
+++ b/apps/cli/test/fixtures/mock-run-evaluation.ts
@@ -1,4 +1,4 @@
-import { mkdir, writeFile } from 'node:fs/promises';
+import { mkdir, readFile, writeFile } from 'node:fs/promises';
 import path from 'node:path';
 
 interface ResolvedTargetLike {
@@ -25,6 +25,7 @@ interface RunEvaluationOptionsLike {
     readonly costLimitUsd?: number;
     readonly earlyExit?: boolean;
   };
+  readonly threshold?: number;
   readonly budgetUsd?: number;
   readonly runBudgetTracker?: {
     readonly budgetCapUsd?: number;
@@ -181,6 +182,7 @@ async function maybeWriteDiagnostics(
     budgetUsd: options.budgetUsd ?? null,
     maxConcurrency: options.maxConcurrency ?? null,
     trials: options.trials ?? null,
+    threshold: options.threshold ?? null,
     hasRunBudgetTracker: options.runBudgetTracker !== undefined,
     runBudgetCapUsd: options.runBudgetTracker?.budgetCapUsd ?? null,
     replayRecording: options.replayRecording ?? null,
@@ -199,7 +201,17 @@ async function maybeWriteDiagnostics(
     resultCount: results.length,
   } satisfies Record<string, unknown>;
 
-  await writeFile(diagnosticsPath, JSON.stringify(payload, null, 2), 'utf8');
+  const priorCalls = await readFile(diagnosticsPath, 'utf8')
+    .then((raw) => {
+      const parsed = JSON.parse(raw) as { readonly calls?: unknown };
+      return Array.isArray(parsed.calls) ? parsed.calls : [parsed];
+    })
+    .catch(() => []);
+  await writeFile(
+    diagnosticsPath,
+    JSON.stringify({ ...payload, calls: [...priorCalls, payload] }, null, 2),
+    'utf8',
+  );
 }
 
 async function maybeWritePromptDump(
diff --git a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx
index 1bbb4698b..e5db2e8f1 100644
--- a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx
@@ -25,8 +25,8 @@ tests:
 |-------|----------|-------------|
 | `id` | Yes | Unique identifier for the test |
 | `criteria` | Yes | Description of what a correct response should contain |
-| `input` | Yes | Input sent to the target (string, object, or message array). Alias: `input` |
-| `expected_output` | No | Expected response for comparison (string, object, or message array). Alias: `expected_output` |
+| `input` | Yes | Input sent to the target (string, object, or message array) |
+| `expected_output` | No | Expected response for comparison (string, object, or message array) |
 | `execution` | No | Per-case execution overrides (for example `target`, `skip_defaults`) |
 | `workspace` | No | Per-case workspace config (overrides suite-level) |
 | `metadata` | No | Arbitrary key-value pairs passed to graders and workspace scripts |
@@ -189,9 +189,29 @@ supporting files, see [Benchmark Provenance](/docs/guides/benchmark-provenance/)
 
 The `assertions` field defines graders directly on a test. It supports both deterministic assertion types and LLM-based rubric evaluation.
 
+### Rubric Shorthand
+
+For semantic or agent-behavior checks, prefer plain strings in `assertions`.
+AgentV groups the strings into a rubric grader automatically:
+
+```yaml
+tests:
+  - id: bug-fix-review
+    criteria: Finds and fixes the bug
+    input: Review this failing parser implementation.
+    assertions:
+      - Identifies the root cause of the parser failure
+      - Proposes a concrete code change
+      - Adds or updates a regression test
+```
+
+Use this shape for qualitative requirements. It is less brittle than checking
+for exact substrings in an agent response.
+
 ### Deterministic Assertions
 
-These graders run without an LLM call and produce binary (0 or 1) scores:
+Use deterministic assertions for exact machine-verifiable outputs. These graders
+run without an LLM call and produce binary (0 or 1) scores:
 
 | Type | Value | Description |
 |------|-------|-------------|
@@ -278,9 +298,10 @@ tests:
 
 Assertion graders auto-generate a `name` when one is not provided (e.g., `contains-DENIED`, `is_json`).
 
-### Rubric Assertions
+### Advanced Rubric Assertions
 
-Use `type: rubrics` with a `criteria` array to define structured LLM-graded evaluation criteria inline:
+Use `type: rubrics` with a `criteria` array only when you need weights,
+required flags, or score ranges:
 
 ```yaml
 tests:
@@ -374,8 +395,8 @@ tests:
 
 When `assertions` is defined, only the declared graders run. No implicit grader is added
 because `criteria` or `expected_output` exists. Graders that are declared (such as
-`llm-grader`, `code-grader`, or `rubrics`) receive the case context, including
-`criteria` and `expected_output`, as input automatically.
+plain rubric strings, `llm-grader`, `code-grader`, or `rubrics`) receive the case
+context, including `criteria` and `expected_output`, as input automatically.
 
 This means a case with `expected_output` and only deterministic assertions evaluates only
 those deterministic assertions:
@@ -394,11 +415,12 @@ If `assertions` contains only deterministic graders (like `contains` or `regex`)
 
 ```
 Warning: Test 'my-test': criteria is defined but no grader in assertions
-will evaluate it. Add 'type: llm-grader' to assertions, or remove criteria
-if it is documentation-only.
+will evaluate it. Add a rubric assertion string or another grader to assertions,
+or remove criteria if it is documentation-only.
 ```
 
-To use `criteria` alongside deterministic checks, add a grader explicitly:
+To use `criteria` alongside deterministic checks, add a rubric assertion string
+or another grader explicitly:
 
 ```yaml
 tests:
@@ -406,7 +428,7 @@ tests:
     criteria: Response is helpful and mentions the fix
     input: "Debug this function..."
     assertions:
-      - type: llm-grader        # explicit — receives criteria automatically
+      - Explains why the bug happens
       - type: contains
         value: "fix"
 ```
@@ -423,7 +445,7 @@ tests:
     criteria: Response is helpful and mentions the fix
     input: "Debug this function..."
     assertions:
-      - type: llm-grader
+      - type: llm-grader        # use explicit form for custom preprocessors
         preprocessors:
           - type: xlsx
             command: ["bun", "run", "scripts/preprocessors/xlsx-to-json.ts"]
diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
index e4d932da0..84a8bf437 100644
--- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
@@ -5,21 +5,21 @@ sidebar:
   order: 1
 ---
 
-Evaluation files define the test cases and graders for an evaluation run. Runtime choices such as target matrices, setup, scripts, and repeat runs belong in [experiments](/docs/evaluation/experiments/). AgentV supports two eval formats: YAML and JSONL.
+Evaluation files define the test cases, graders, workspace lifecycle, and inline runtime block for an evaluation run. Runtime choices such as target matrices, thresholds, budgets, and repeat runs belong under top-level [`experiment:`](/docs/evaluation/experiments/). Install, build, and reset commands belong under `workspace.hooks`; runner-specific setup belongs under `targets[].hooks`. AgentV supports two eval formats: YAML and JSONL.
 
 YAML is the canonical portable model. TypeScript helpers, generated fixtures, and Python scripts should lower to the same YAML/JSONL shapes rather than inventing a separate eval contract.
 
 ## Suites
 
-An eval file is a **suite**: it binds test cases to task context, assertions, and reusable fixtures. Runtime choices such as target matrices, setup, and run counts belong in experiments. Test cases can be inline or loaded from an external file via `tests: ./cases.yaml` for reuse across suites.
+An eval file is a **suite**: it binds test cases to task context, assertions, reusable fixtures, and the inline runtime block. Test cases can be inline, loaded from an external file via `tests: ./cases.yaml`, or imported with `tests[].include`.
 
 ## YAML Format
 
-The primary format. A single file contains metadata, execution config, and tests:
+The primary format. A single file contains metadata, inline runtime config, and tests:
 
 ```yaml
 description: Math problem solving evaluation
-execution:
+experiment:
   target: default
 
 assertions:
@@ -40,9 +40,9 @@ tests:
 |-------|-------------|
 | `description` | Human-readable description of the evaluation |
 | `suite` | Optional suite identifier |
-| `execution` | Default execution config (`target`, `fail_on_error`, `threshold`, etc.) |
+| `experiment` | Runtime policy (`target`, `targets`, `workers`, `repeat`, `threshold`, `timeout_seconds`, `budget_usd`, etc.) |
 | `workspace` | Suite-level workspace config — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config). Repo entries declare identity and checkout pins; acquisition is covered in [Workspace Architecture](/docs/guides/workspace-architecture/#repo-provenance-vs-acquisition). |
-| `tests` | Array of individual tests, or a string path to an external file or directory |
+| `tests` | Array of individual tests, include entries, or a string path to an external file or directory. Tests and include entries may use scoped `run:` overrides for `threshold`, `repeat`, `timeout_seconds`, and `budget_usd`. |
 | `assertions` | Suite-level graders appended to each test unless `execution.skip_defaults: true` is set on the test |
 | `input` | Suite-level input messages prepended to each test's input unless `execution.skip_defaults: true` is set on the test |
 
@@ -79,6 +79,9 @@ tests:
 ### Suite-level Assertions
 
 The `assertions` field is the canonical way to define suite-level graders. Suite-level assertions are appended to every test's graders unless a test sets `execution.skip_defaults: true`.
+For semantic or agent-behavior checks, prefer plain assertion strings first;
+AgentV treats them as rubric criteria. Use deterministic assertions or code
+graders when the expected output is exact or requires programmatic inspection.
 
 ```yaml
 description: API response validation
@@ -87,6 +90,8 @@ assertions:
     required: true
   - type: contains
     value: "status"
+  - Correctly answers the user's question
+  - Explains the reasoning clearly
 
 tests:
   - id: health-check
@@ -94,7 +99,10 @@ tests:
     input: Check API health
 ```
 
-`assertions` supports all grader types, including deterministic assertion types (`contains`, `regex`, `is_json`, `equals`) and `rubrics`. See [Tests](/docs/evaluation/eval-cases/#per-test-assertions) for per-test assertions usage.
+`assertions` supports rubric shorthand strings, deterministic assertion types
+(`contains`, `regex`, `is_json`, `equals`), `rubrics`, LLM graders, and code
+graders. See [Tests](/docs/evaluation/eval-cases/#per-test-assertions) for
+per-test assertions usage.
 
 ### Assertion Includes
 
@@ -188,8 +196,8 @@ Per-test `input_files` overrides the suite-level value (it does not merge). To o
 
 ### PROMPT.md Fallback
 
-For Vercel-style eval directories, a test may omit `input` and keep the task
-prompt in Markdown instead. AgentV resolves the prompt in this order:
+For directory-style evals, a test may omit `input` and keep the task prompt in
+Markdown instead. AgentV resolves the prompt in this order:
 
 1. If the effective `input_files` contains a file named exactly `PROMPT.md`, that file becomes the test prompt.
 2. Otherwise, if a `PROMPT.md` exists beside the `EVAL.yaml`, that file becomes the test prompt.
@@ -222,12 +230,25 @@ Instead of inlining tests in the same file, you can point `tests` to an external
 ```yaml
 name: my-eval
 description: My evaluation suite
-execution:
+experiment:
   target: default
 tests: ./cases.yaml
 ```
 
-The path is resolved relative to the eval file's directory. The external file should contain a YAML array of test objects or a JSONL file with one test per line.
+The path is resolved relative to the eval file's directory. The external file
+should contain a YAML array of test objects or a JSONL file with one test per
+line. String entries inside a `tests:` list work the same way and may use direct
+paths, directories, or globs:
+
+```yaml
+tests:
+  - ./cases/*.cases.yaml
+  - include: ./suites/*.eval.yaml
+    type: suite
+```
+
+String shorthand is raw-case-only. Import eval suites with object entries using
+`include:` and `type: suite`.
 
 ### Tests as Directory Path
 
@@ -360,7 +381,7 @@ An optional YAML sidecar file provides metadata and execution config. Place it a
 ```yaml
 description: Math evaluation dataset
 suite: math-tests
-execution:
+experiment:
   target: azure-base
 assertions:
   - name: correctness
diff --git a/apps/web/src/content/docs/docs/evaluation/experiments.mdx b/apps/web/src/content/docs/docs/evaluation/experiments.mdx
index 23a9f5cd2..d85077d87 100644
--- a/apps/web/src/content/docs/docs/evaluation/experiments.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/experiments.mdx
@@ -5,175 +5,219 @@ sidebar:
   order: 2
 ---
 
-Experiments define **how** eval cases run: target or target matrix, setup,
-scripts, timeout, sandbox, case filters, and repeat-run policy. Eval files stay
-focused on **what** is tested: prompts, datasets, assertions, and task fixtures.
-
-## Experiment YAML
-
-Committed experiments conventionally live under `experiments/`:
+AgentV eval files are the only runnable authoring artifact. Use top-level
+`experiment:` inside `eval.yaml` for runtime choices: targets, workers,
+timeout, sandbox/runtime knobs, budgets, thresholds, and repeat-run policy.
 
 ```yaml
-name: baseline
-target: codex-gpt5
-suites:
-  - ref: evals/support-regression.eval.yaml
-    select:
-      test_ids:
-        - refund-eligibility
-        - missing-order-date
-timeout_seconds: 720
-repeat:
-  count: 4
-  strategy: pass_at_k
-  cost_limit_usd: 2.00
-setup:
-  - script: bun install
-scripts:
-  - build
+name: support-regression
+
+experiment:
+  targets: [codex-gpt5, claude-sonnet]
+  workers: 2
+  timeout_seconds: 720
+  repeat:
+    count: 4
+    strategy: pass_at_k
+    cost_limit_usd: 2.00
+
+workspace:
+  hooks:
+    before_all:
+      command: ["bash", "-lc", "bun install && bun run build"]
+
+tests:
+  - id: refund-eligibility
+    input: Can this customer get a refund?
+    criteria: Applies the refund policy correctly
 ```
 
-Wire fields use `snake_case`. AgentV translates to internal `camelCase` when it
-loads the file.
+`execution:` is accepted only as a legacy top-level alias for existing eval
+files. Do not use both `experiment:` and `execution:` in the same eval.
 
-## Suites and test selection
+## Test Imports
 
-Eval files keep `tests[]` as the canonical atomic test definition. Experiments
-reference one or more reusable eval suites through `suites[]`:
+Use `tests[]` for composition, imports, and selection.
 
 ```yaml
-suites:
-  - ref: evals/support-regression.eval.yaml
-  - ref: evals/billing-*.eval.yaml
-```
-
-Use suite-local `select.test_ids[]` to run only specific tests from a suite. The
-values match `tests[].id` inside that suite and use the same glob semantics as
-`--test-id`:
-
-```yaml
-suites:
-  - ref: evals/support-regression.eval.yaml
+tests:
+  - include: evals/support/*.eval.yaml
+    type: suite
     select:
       test_ids:
         - refund-*
         - missing-order-date
+      tags: regression
+      metadata:
+        priority: high
+    run:
+      threshold: 1.0
+      repeat:
+        count: 2
+        strategy: pass_all
+  - include: cases/*.cases.yaml
+    type: tests
+  - include: cases/regression.jsonl
+    type: tests
+  - cases/smoke/*.cases.yaml
 ```
 
-## Repeat runs
+`type: suite` preserves the imported suite's task contract: metadata,
+`workspace`, shared `input`, shared `assertions`, and tests. The child suite's
+`experiment:` or legacy `execution:` runtime block is ignored; the parent eval's
+runtime block controls the run.
 
-`repeat` is the full AgentV replacement for the old eval-level
-`execution.trials` shape. It supports the same core strategies:
+`type: tests` imports only raw test entries. It intentionally drops shared
+context from an imported eval suite, so parent suite fields apply to those raw
+cases.
 
-```yaml
-repeat:
-  count: 3
-  strategy: mean
-  cost_limit_usd: 1.50
-```
+`tests[].select.test_ids` filters imported test IDs with glob patterns.
+`tests[].select.tags` filters each imported case's effective `metadata.tags`.
+Effective case tags are suite-first and deduped:
+`suite.tags + suite.metadata.tags + test.metadata.tags`. Top-level suite `tags`
+still remain suite identity metadata for discovery and reporting; selection reads
+the merged case metadata view. `tests[].select.metadata` filters case metadata by
+key/value, where selector values may be scalars or lists. Globbed include paths
+are resolved in deterministic path order, then test order.
 
-Supported strategies:
+String-valued `tests` and string entries inside `tests[]` are raw-case import
+shorthand. They are equivalent to `include` with `type: tests` and may point at
+raw case files, directories, or globs. Importing another eval suite must use
+object form with `include:` and `type: suite`.
 
-| Strategy | Behavior |
-| --- | --- |
-| `pass_at_k` | Uses the best passing attempt; early-exits by default unless the experiment sets `early_exit: false` |
-| `mean` | Aggregates repeated attempt scores by mean |
-| `confidence_interval` | Uses the lower bound of a 95% confidence interval as the conservative score |
+Suite imports are resolved as a deterministic include graph. Circular
+`type: suite` imports fail validation with an error that reports the import
+chain; raw-case shorthand does not recursively load suite runtime blocks.
 
-`repeat.cost_limit_usd` caps repeat-run spend. `repeat.costLimitUsd` is also
-accepted for prerelease trial-schema parity, but new YAML should use
-`cost_limit_usd`.
+Imported suite artifacts are nested under the source suite name inside a wrapper
+eval result directory, for example
+`.agentv/results/<wrapper-eval>/<timestamp>/<imported-suite>/<test-id>/...`.
+Direct tests owned by the wrapper eval and raw case imports live directly under
+`<test-id>/...`.
 
-## Vercel-compatible shorthand
+## Scoped Run Overrides
 
-AgentV also accepts Vercel-style top-level `runs` and `early_exit`:
+Use scoped `run:` blocks for result interpretation and scheduling policies that
+vary by include group or test case. Precedence is:
 
-```yaml
-runs: 4
-early_exit: true
+```text
+test.run > tests[].run > experiment
 ```
 
-This is shorthand for a `pass_at_k` repeat run. Use `repeat` when you need
-AgentV-specific strategy or cost-limit fields.
-
-Do not set both `repeat` and `runs` in the same experiment. `repeat` is the
-canonical AgentV shape; `runs` exists only for Vercel-compatible shorthand.
-
-Vercel defines the requested run count at the experiment level. Some result
-summaries show fewer actual runs for a case because `earlyExit: true` stops
-remaining attempts after the first pass; smoke runs can also force one run.
-AgentV follows the same experiment-level placement while keeping the richer
-`repeat` block for AgentV strategies.
-
-Repeat-enabled cases use a Vercel-style physical layout with AgentV aggregate
-provenance:
-
-```text
-<run-dir>/index.jsonl
-<run-dir>/summary.json
-<run-dir>/<suite>/<case-id>/summary.json
-<run-dir>/<suite>/<case-id>/run-1/result.json
-<run-dir>/<suite>/<case-id>/run-1/grading.json
-<run-dir>/<suite>/<case-id>/run-1/metrics.json
-<run-dir>/<suite>/<case-id>/run-1/timing.json
-<run-dir>/<suite>/<case-id>/run-1/transcript.json
-<run-dir>/<suite>/<case-id>/run-1/transcript-raw.jsonl
-<run-dir>/<suite>/<case-id>/run-1/outputs/answer.md
+```yaml
+experiment:
+  target: agent
+  threshold: 0.8
+  repeat:
+    count: 3
+    strategy: pass_at_k
+
+tests:
+  - include: ./evals/flaky-agentic/**/*.eval.yaml
+    type: suite
+    select:
+      tags: [agentic]
+    run:
+      repeat:
+        count: 3
+        strategy: pass_at_k
+
+  - include: ./evals/regression/**/*.eval.yaml
+    type: suite
+    select:
+      tags: [must-pass]
+    run:
+      threshold: 1.0
+      repeat:
+        count: 2
+        strategy: pass_all
+
+  - id: critical-case
+    input: "..."
+    criteria: Must pass exactly
+    run:
+      threshold: 1.0
+      repeat:
+        count: 1
 ```
 
-The repeated case aggregate folder uses `summary.json` for run-count, pass-rate,
-fingerprint, and flattened snake_case timing fields such as
-`mean_duration_ms`.
-Each `run-N/result.json` is the per-attempt manifest and includes
-`grading_path`, transcript/output paths, and embedded timing/o11y metrics. Each
-attempt also keeps AgentV `grading.json`, `metrics.json`, and `timing.json`
-sidecars for detailed inspection.
-Root `index.jsonl` and root `summary.json` remain stable for existing CI
-summary scripts and uploaded artifact consumers.
+Scoped `run:` supports `threshold`, `repeat`, `timeout_seconds`, and
+`budget_usd`. Candidate-changing fields such as `target` and `targets` stay
+parent-level under `experiment:`. Workspace mutation belongs in
+`workspace.hooks`, and runner-specific setup belongs in `targets[].hooks`.
 
-## Targets and setup
+## Lifecycle Ownership
 
-Experiments reuse targets from `.agentv/targets.yaml`; they do not define a new
-provider registry.
+`experiment:` configures evaluation policy. It does not own commands that
+prepare files, dependencies, repos, or target-specific runner state.
+
+| Need | Put it in |
+| --- | --- |
+| Install dependencies, build the repo, seed files | `workspace.hooks.before_all` |
+| Reset or apply per-case state | `workspace.hooks.before_each` / `workspace.hooks.after_each` |
+| Configure an agent runner or provider variant | `targets[].hooks` |
+| Choose targets, repeats, pass policy, budget, threshold | `experiment` |
 
 ```yaml
+workspace:
+  hooks:
+    before_all:
+      command: ["bash", "-lc", "bun install && bun run build"]
+
 targets:
-  - copilot
-  - claude
-  - name: gemini-with-hooks
-    use_target: gemini
+  - name: agent-with-skills
+    provider: codex
+    hooks:
+      before_each:
+        command: ["sh", "-c", "cp -R skills \"{{workspace_path}}/.codex/skills\""]
+
+experiment:
+  target: agent-with-skills
+  repeat:
+    count: 3
+    strategy: pass_at_k
 ```
 
-Setup and scripts belong on the experiment because they are often the A/B
-variable:
+## Repeat Runs
+
+`repeat` runs each test `count` times and combines the attempts using `strategy`:
 
 ```yaml
-setup:
-  - script: cp skills/with-docs/AGENTS.md AGENTS.md
-scripts:
-  - script: bun test
-    timeout_seconds: 120
+experiment:
+  repeat:
+    count: 3
+    strategy: mean
+    cost_limit_usd: 1.50
 ```
 
-## Running experiments
-
-Run a specific experiment:
+Supported strategies:
 
-```bash
-bun agentv eval --experiment experiments/default.yaml
-```
+| Strategy | Behavior |
+| --- | --- |
+| `pass_at_k` | Uses the best passing attempt; early-exits by default unless `early_exit: false` is set |
+| `pass_all` | Uses the weakest attempt score, so every repeated attempt must meet the threshold |
+| `mean` | Aggregates repeated attempt scores by mean |
+| `confidence_interval` | Uses the lower bound of a 95% confidence interval as the conservative score |
 
-If no experiment is passed, AgentV checks `.agentv/config.yaml` for a default:
+AgentV also accepts `runs` and `early_exit` under `experiment:` as shorthand for
+repeat-run policy:
 
 ```yaml
-experiments:
-  default: experiments/default.yaml
+experiment:
+  runs: 4
+  early_exit: true
 ```
 
-If no default is configured, AgentV keeps the old behavior and uses the
-`default` experiment label.
+Do not set both `repeat` and `runs` in the same runtime block.
+
+## Result Layout
+
+Default eval runs write to:
 
-## Schema
+```text
+.agentv/results/<eval-name>/<timestamp>/
+```
 
-The generated JSON Schema is available at
-`skills-data/agentv-eval-writer/references/experiment-schema.json`.
+Imported source suite metadata appears in `index.jsonl` rows and manifests.
+AgentV does not add a redundant suite directory when the result group is already
+the eval name.
diff --git a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx
index 91f2f937d..ddd7ceedb 100644
--- a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx
+++ b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx
@@ -43,7 +43,7 @@ Create `./evals/example.yaml`:
 
 ```yaml
 description: Math problem solving evaluation
-execution:
+experiment:
   target: default
 
 tests:
diff --git a/apps/web/src/content/docs/docs/graders/custom-graders.mdx b/apps/web/src/content/docs/docs/graders/custom-graders.mdx
index 8c958eba0..42e9d9865 100644
--- a/apps/web/src/content/docs/docs/graders/custom-graders.mdx
+++ b/apps/web/src/content/docs/docs/graders/custom-graders.mdx
@@ -57,11 +57,9 @@ tests:
     criteria: Generates correct Python code
     input: Write a sorting function
     assertions:
-      - type: rubrics
-        criteria:
-          - Code is syntactically valid
-          - Handles edge cases (empty list, single element)
-          - Uses appropriate algorithm
+      - Code is syntactically valid
+      - Handles edge cases such as empty lists and single-element lists
+      - Uses an appropriate algorithm
       - name: syntax_check
         type: code-grader
         command: [./validators/check_syntax.py]
@@ -83,6 +81,7 @@ If any grader has `required: true` (or `required: <threshold>`) and scores below
 
 ## Best Practices
 
+- **Use plain assertion strings first for semantic checks** — AgentV treats them as rubric criteria
 - **Use code graders for deterministic checks** — exact value matching, format validation, schema compliance
 - **Use LLM graders for semantic evaluation** — meaning, quality, helpfulness
 - **Use rubrics for structured multi-criteria grading** — when you need weighted, itemized scoring
diff --git a/apps/web/src/content/docs/docs/targets/configuration.mdx b/apps/web/src/content/docs/docs/targets/configuration.mdx
index b2a2e0317..6ff210675 100644
--- a/apps/web/src/content/docs/docs/targets/configuration.mdx
+++ b/apps/web/src/content/docs/docs/targets/configuration.mdx
@@ -91,6 +91,9 @@ targets:
 ### Workspace Lifecycle Hooks
 
 Run commands and reset/cleanup policies at different lifecycle points using `workspace.hooks`. This can be defined at the suite level (applies to all tests) or per test (overrides suite-level).
+Use workspace hooks for repo preparation such as dependency installs, builds,
+fixture generation, and per-case resets. Use target hooks for runner-specific
+setup.
 
 ```yaml
 workspace:
@@ -239,12 +242,12 @@ Use `cwd` on a target to run in an existing directory (shared across tests). If
 
 Eval files can define per-target hooks that run setup/teardown scripts to customize the workspace for each target variant. This enables comparing different harness configurations (e.g., baseline vs with-plugins) in a single eval file.
 
-Targets do not declare `repos`. Repositories belong to the shared eval workspace so every target runs in the same world; target hooks customize the harness under evaluation. Use hooks for per-target setup such as copying skills, enabling wrappers, or changing provider-local config.
+Targets do not declare `repos`. Repositories belong to the shared eval workspace so every target runs in the same world; target hooks customize the harness under evaluation. Use hooks for per-target setup such as copying skills, enabling wrappers, or changing provider-local config. Keep installs, builds, fixture generation, and case resets in `workspace.hooks`.
 
-Target hooks are defined in the eval file's `execution.targets` array using object form:
+Target hooks are defined in the eval file's `experiment.targets` array using object form:
 
 ```yaml
-execution:
+experiment:
   targets:
     - baseline                          # string shorthand (no hooks)
     - name: with-skills                 # object form with hooks
diff --git a/docs/adr/0006-separate-experiments-from-eval-definitions.md b/docs/adr/0006-separate-experiments-from-eval-definitions.md
index 103872e8c..752495b71 100644
--- a/docs/adr/0006-separate-experiments-from-eval-definitions.md
+++ b/docs/adr/0006-separate-experiments-from-eval-definitions.md
@@ -59,8 +59,6 @@ experiment:
     strategy: pass_at_k
   timeout_seconds: 900
   budget_usd: 2.00
-  setup:
-    - command: ./scripts/install-skills.sh
 
 tests:
   - include: ./evals/cargowise/**/*.eval.yaml
@@ -100,12 +98,32 @@ The old experiment runtime fields are ported into the parent eval file:
 - repeat policy such as `count` and `pass_at_k`
 - timeout
 - budget
-- runtime setup commands
 - other run-time controls that do not define the task itself
 
 Suite or case workspace fields remain task-owned when they define what is being
-evaluated. Experiment setup remains parent-owned when it changes the candidate
-or run condition being measured against the same task.
+evaluated.
+
+## Lifecycle Ownership
+
+`experiment:` owns evaluation policy, not lifecycle mutation. Commands that
+prepare or reset files, dependencies, repos, or runner-specific configuration
+must stay with the lifecycle surface that actually owns that work:
+
+- `workspace.hooks` prepare or reset the workspace under test. Dependency
+  installs, builds, fixture generation, case resets, and repo seeding belong
+  here.
+- `targets[].hooks` prepare the target runner or provider variant. Agent
+  discovery files, provider-specific config, and target-specific harness setup
+  belong here.
+- `experiment:` selects runtime policy: target or target matrix, workers,
+  repeat strategy, threshold, timeout, budget, sandbox/runtime knobs, and result
+  identity.
+
+This differs from external experiment formats that allow generic scripts on the
+experiment object. AgentV keeps those scripts in workspace or target hooks so a
+multi-file command such as `agentv eval a.eval.yaml b.eval.yaml` remains a batch
+of independent eval-suite runs, rather than one implicit wrapper experiment with
+shared mutable setup.
 
 ## Tests Import Surface
 
@@ -259,10 +277,11 @@ scheduling:
 - `timeout_seconds`
 - `budget_usd`
 
-Fields that change the candidate or system under test, such as `target`,
-`targets`, runtime setup, and workspace mutation, should remain at the parent
-`experiment:` level unless a later ADR accepts narrower per-group semantics.
-Keeping candidate-changing knobs out of scoped overrides preserves comparable
+Fields that change the candidate or system under test, such as `target` and
+`targets`, should remain at the parent `experiment:` level unless a later ADR
+accepts narrower per-group semantics. Workspace mutation stays in
+`workspace.hooks`; runner setup stays in `targets[].hooks`. Keeping
+candidate-changing knobs out of scoped overrides preserves comparable
 experiment groups and avoids silently mixing different systems under one result
 group.
 
diff --git a/docs/plans/2026-06-23-002-experiments-separation-plan.md b/docs/plans/2026-06-23-002-experiments-separation-plan.md
deleted file mode 100644
index a2e22f6fe..000000000
--- a/docs/plans/2026-06-23-002-experiments-separation-plan.md
+++ /dev/null
@@ -1,407 +0,0 @@
----
-title: "feat: Separate experiments from eval definitions"
-type: feat
-date: 2026-06-23
-origin: docs/adr/0006-separate-experiments-from-eval-definitions.md
----
-
-# feat: Separate experiments from eval definitions
-
-## Summary
-
-AgentV should separate eval task definitions from experiment run definitions.
-Eval YAML stays the canonical authoring layer for prompts, datasets, assertions,
-and task fixtures. Experiments become first-class committed files that select the
-agent or target under test, model, harness options, setup injection, run knobs,
-and case filter.
-
-This should ship in phases. Phase 1 adds the non-breaking foundation:
-experiment contract types, default experiment resolution, and artifact
-attribution by resolved experiment name. Later phases move runtime controls out
-of `eval.yaml execution`, teach the CLI to run experiment matrices, and record
-full experiment provenance and fingerprints in run bundles.
-
-## Problem Frame
-
-Today `experiment` is a string label passed through
-`packages/core/src/evaluation/evaluate.ts`, `packages/core/src/evaluation/run-artifacts.ts`,
-`packages/core/src/evaluation/results-repo.ts`, and
-`packages/core/src/evaluation/trace-envelope.ts`. Runtime choices are still
-scattered across CLI flags, TypeScript config, `.agentv/config.yaml`, and
-`eval.yaml execution`.
-
-That makes it hard to review A/B variants such as `baseline` versus
-`agents-md`, because the variable under test can be hidden inside the eval
-definition. The desired model is:
-
-- Eval equals what is tested.
-- Experiment equals how and with what it is run.
-- Setup that changes the agent's environment belongs to the experiment.
-- Existing eval-only repositories keep working through a default experiment
-  fallback.
-
-## Requirements
-
-- R1. Existing `eval.yaml` files validate and run without modification.
-- R2. Experiment wire config uses `snake_case`; TypeScript types use
-  `camelCase`.
-- R3. `config.yaml` can point at a default experiment, with no pointer falling
-  back to the current `default` experiment label.
-- R4. `agentv eval --experiment <label>` keeps working as a label.
-- R5. `agentv eval --experiment <path>` can resolve a YAML or TypeScript
-  experiment file.
-- R6. Experiment config reuses existing target names and target matrices instead
-  of embedding a new provider schema.
-- R7. Workspace setup and skill injection are modeled as experiment setup steps.
-- R8. Run artifacts record the resolved experiment name immediately and later
-  record full config provenance and fingerprint.
-- R9. Documentation and examples migrate incrementally; no bulk repo migration
-  happens in Phase 1.
-
-## Public Reference Alignment
-
-Vercel `agent-eval` supplies the strongest public precedent for this split:
-eval fixtures describe the task, and experiments describe agent, model, scripts,
-runs, early exit, timeout, sandbox, and setup. AgentV should adopt that
-vocabulary and directory convention while preserving AgentV-owned YAML evals,
-LLM-judge assertions, workspace fixtures, and portable run artifacts.
-
-Anthropic Skills reinforces the value of baseline versus with-skill comparisons
-and pass-rate deltas. Hugging Face Datasets provides the lowest-common
-denominator vocabulary for datasets, records, splits, and features. OpenInference
-provides trace and span vocabulary for external observability correlation. These
-references should inform field names and docs, but none should become a required
-runtime dependency for this change.
-
-## Key Decisions
-
-- KTD1. The canonical committed experiment directory is `experiments/`.
-- KTD2. YAML is the canonical experiment authoring path; TypeScript is the
-  escape hatch for dynamic setup.
-- KTD3. `experiments.default` in `.agentv/config.yaml` is the preferred default
-  pointer. A top-level `default_experiment` compatibility key can be accepted
-  while docs teach the nested form.
-- KTD4. Experiment `target` and `targets` refer to existing AgentV target names
-  and target refs. Provider settings stay in `targets.yaml`.
-- KTD5. Legacy `eval.yaml execution` remains valid for released fields while
-  examples migrate. The prerelease `execution.trials` field is hard-removed
-  with no alias; repeat/run-count placement belongs to experiments.
-- KTD6. AgentV adopts Vercel's experiment structure, not the package dependency,
-  until a direct adapter has a smaller, reviewed boundary.
-- KTD7. Full experiment fingerprints should include the experiment file contents,
-  selected eval source, setup-relevant fields, scripts, repeat config, timeout,
-  sandbox, and target references.
-
-## Experiment Contract
-
-Wire shape, shown in YAML:
-
-```yaml
-name: baseline
-target: codex-gpt5
-targets:
-  - codex-gpt5
-  - name: copilot-gpt55
-    use_target: copilot
-agent: codex
-model: openai/gpt-5.5
-agent_options:
-  reasoning_effort: high
-evals: "agent-042-*"
-scripts:
-  - build
-  - script: bun test
-    timeout_seconds: 120
-repeat:
-  count: 3
-  strategy: pass_at_k
-  cost_limit_usd: 2.00
-early_exit: false
-timeout_seconds: 900
-sandbox: auto
-workspace:
-  mode: temp
-setup:
-  - script: bun install
-  - script: cp skills/default/AGENTS.md AGENTS.md
-```
-
-Internal TypeScript shape:
-
-```ts
-interface ExperimentConfig {
-  name?: string;
-  target?: string;
-  targets?: readonly ExperimentTargetRef[];
-  agent?: string;
-  model?: string;
-  agentOptions?: Record<string, unknown>;
-  evals?: string | readonly string[];
-  scripts?: readonly ExperimentScript[];
-  repeat?: {
-    count: number;
-    strategy: 'pass_at_k' | 'mean' | 'confidence_interval';
-    costLimitUsd?: number;
-  };
-  runs?: number;
-  earlyExit?: boolean;
-  timeoutSeconds?: number;
-  sandbox?: 'auto' | 'docker' | 'vercel';
-  workspace?: Record<string, unknown>;
-  setup?: readonly ExperimentSetupStep[];
-  sourcePath?: string;
-}
-```
-
-`agent` is a harness label for Vercel alignment. AgentV execution should prefer
-`target` or `targets` for actual provider resolution so this does not create a
-parallel provider registry.
-
-## CLI Behavior
-
-Default resolution order for `agentv eval`:
-
-- Explicit `--experiment <label-or-path>`.
-- `.agentv/config.yaml` `experiments.default`.
-- `.agentv/config.yaml` `default_experiment`, accepted as a compatibility alias.
-- `agentv.config.ts` `experiments.default` or `defaultExperiment`, if present.
-- Current implicit `default` label.
-
-Path-like experiment values load an experiment file. Label-like values remain
-labels. If a loaded experiment has `name`, the name is the run namespace;
-otherwise AgentV derives the name from the file basename.
-
-Later CLI phases should add:
-
-- `agentv eval --experiment experiments/baseline.yaml`.
-- `agentv eval --experiment baseline` resolving `experiments/baseline.yaml`
-  before falling back to a label.
-- `agentv eval --experiments "experiments/*.yaml"` for matrices.
-- Experiment `evals` filters AgentV case IDs. When no eval paths are provided,
-  file discovery uses `.agentv/config.yaml eval_patterns` or AgentV's default
-  eval patterns.
-
-## Migration Strategy
-
-Phase 1 is additive. It introduces experiment config loading and default
-resolution without changing how eval execution applies targets or workspace
-settings.
-
-Phase 2 moves examples to committed `experiments/default.yaml` files while
-leaving existing `eval.yaml execution` fields in place.
-
-Phase 3 applies experiment runtime fields in the runner: target selection, eval
-filters, timeout, repeat/runs, early exit, sandbox/workspace mode, setup steps,
-and scripts.
-
-Phase 4 warns when new eval files use experiment-owned `execution` fields and
-documents the replacement.
-
-Phase 5 removes or hard-errors only for a future major or same-week unreleased
-surface where compatibility is not required.
-
-## Artifact Impact
-
-Existing artifact writers already accept an experiment label. Phase 1 should
-continue writing the resolved experiment name to `summary.json`, `index.jsonl`,
-trace envelopes, and results repository paths.
-
-Later artifact work should add:
-
-- `experiment_config_path` for the committed file.
-- `experiment_fingerprint` for cache and comparison invalidation.
-- Redacted `experiment_config` metadata for small safe fields.
-- `setup` and `scripts` provenance as references, not large inline payloads.
-
-`artifact_pointers` must remain reserved for detached large payload bytes. Normal
-experiment sidecars should use explicit path fields.
-
-## Implementation Units
-
-### U1. ADR and Implementation Plan
-
-Files:
-
-- `docs/adr/0006-separate-experiments-from-eval-definitions.md`
-- `docs/plans/2026-06-23-002-experiments-separation-plan.md`
-
-Approach:
-
-Capture the eval-versus-experiment decision, Vercel alignment, dependency
-boundary, compatibility strategy, and phased rollout.
-
-Verification:
-
-- Human review for vocabulary and product boundary.
-
-### U2. Experiment Contract and Loader
-
-Files:
-
-- `packages/core/src/evaluation/experiment.ts`
-- `packages/core/src/index.ts`
-- `packages/core/src/evaluation/config.ts`
-- `packages/core/src/evaluation/loaders/config-loader.ts`
-
-Approach:
-
-Add a narrow experiment wire contract, normalization to camelCase, YAML and
-TypeScript experiment loading, and default experiment config parsing. Keep the
-loader independent of runner behavior.
-
-Test Scenarios:
-
-- YAML experiment with `agent_options`, `early_exit`, `timeout_seconds`, setup,
-  `repeat`, and scripts normalizes to camelCase.
-- TypeScript experiment default export loads and normalizes.
-- Invalid `repeat.count`, `runs`, `timeout_seconds`, or `sandbox` fails with a
-  targeted error.
-- `.agentv/config.yaml` parses `experiments.default`.
-- `.agentv/config.yaml` accepts top-level `default_experiment`.
-
-### U3. CLI Default Experiment Resolution
-
-Files:
-
-- `apps/cli/src/commands/eval/run-eval.ts`
-- `apps/cli/src/commands/eval/result-layout.ts`
-
-Approach:
-
-Resolve the experiment before run directory creation. Explicit labels keep
-working. Path-like values load experiment files and derive the run label from
-`name` or basename. If no experiment is configured, the current `default` label
-is preserved.
-
-Test Scenarios:
-
-- `agentv eval evals/foo/eval.yaml` still writes under
-  `.agentv/results/default/<timestamp>/`.
-- Configured `experiments.default: experiments/default.yaml` with `name:
-  baseline` writes under `.agentv/results/baseline/<timestamp>/`.
-- `--experiment smoke` writes under `.agentv/results/smoke/<timestamp>/`.
-- `--experiment experiments/smoke.yaml` uses the file's `name` when present.
-- Missing path-like experiment values fail clearly.
-
-### U4. Runner Field Application
-
-Files:
-
-- `apps/cli/src/commands/eval/run-eval.ts`
-- `packages/core/src/evaluation/evaluate.ts`
-- `packages/core/src/evaluation/yaml-parser.ts`
-- `packages/core/src/evaluation/loaders/config-loader.ts`
-- `packages/core/src/evaluation/validation/experiment-file.schema.ts`
-
-Approach:
-
-Apply experiment fields in precedence order: CLI overrides, explicit experiment,
-legacy eval `execution` for still-supported fields, project config defaults.
-Reuse existing target resolution and workspace setup paths. Move setup-owned
-behavior out of eval docs before adding warnings. Do not retain
-`execution.trials`; experiments are the only public input path for run counts.
-
-Test Scenarios:
-
-- Experiment `target` selects an existing target from `targets.yaml`.
-- Experiment `targets` drives matrix evaluation.
-- Experiment `evals` selects case IDs; eval file discovery still comes from
-  positional paths, configured `eval_patterns`, or default eval patterns.
-- Experiment setup runs before agent execution and can inject an `AGENTS.md`
-  file.
-- Legacy `eval.yaml execution.target` still works when no experiment target is
-  configured.
-
-### U5. Artifact Provenance and Fingerprint
-
-Files:
-
-- `packages/core/src/evaluation/run-artifacts.ts`
-- `packages/core/src/evaluation/results-repo.ts`
-- `packages/core/src/evaluation/trace-envelope.ts`
-- `apps/cli/src/commands/eval/artifact-writer.ts`
-
-Approach:
-
-Extend artifact metadata with safe experiment provenance and a fingerprint. The
-fingerprint should cover experiment config, selected eval source content, and
-fields that affect execution.
-
-Test Scenarios:
-
-- Two experiments with different setup steps produce different fingerprints.
-- Changing an eval prompt changes the fingerprint.
-- Redacted config metadata excludes obvious secret fields.
-- Historical artifacts without fingerprints still read.
-
-### U6. Docs and Examples Migration
-
-Files:
-
-- `apps/web/src/content/docs/`
-- `examples/`
-- `CONCEPTS.md`
-
-Approach:
-
-Document eval versus experiment authoring. Add at least one example with
-`eval.yaml` plus `experiments/default.yaml`, and one A/B pair such as baseline
-versus with-skill.
-
-Test Scenarios:
-
-- Docs examples use `snake_case` wire fields.
-- Example default experiment runs without explicit `--experiment`.
-- Existing examples continue to run through the compatibility path except
-  prerelease `execution.trials`, which is migrated to experiments in this PR.
-
-## Phase 1 Scope
-
-The first PR originally targeted U1, U2, and U3. Owner review expanded this
-branch to include native U4/U5 support for experiment resolution, eval
-selection, run knobs, setup/scripts, target reuse, artifact provenance, and the
-pre-stable hard removal of `execution.trials`.
-
-The branch now applies `setup`, `scripts`, `repeat`, `runs`, `early_exit`,
-`timeout_seconds`, `target`, `targets`, and `evals` to execution behavior. Matrix
-execution via multiple experiment files remains a later phase.
-
-## Non-Goals
-
-- Do not replace AgentV's engine with `@vercel/agent-eval`.
-- Do not bulk-edit all examples in the first PR.
-- Do not remove or error on released legacy `eval.yaml execution` fields in this
-  branch. `execution.trials` is intentionally excluded because it is prerelease
-  and has moved to experiments.
-- Do not add a new provider schema inside experiments.
-- Do not implement experiment matrix execution in Phase 1.
-- Do not project AgentV experiments into Phoenix or another external store.
-
-## Verification Plan
-
-Phase 1 targeted checks:
-
-- `bun test packages/core/test/evaluation` for loader and config coverage once
-  tests exist.
-- CLI-level test for default experiment run layout.
-- A dry-run fixture using `experiments/default.yaml` to prove the resolved
-  experiment name reaches artifacts.
-- Existing eval-only dry-run fixture to prove fallback stays `default`.
-
-Before PR readiness:
-
-- Run the smallest targeted Bun test set covering changed loader and CLI
-  behavior.
-- Run TypeScript/package checks only if the touched packages require them or
-  targeted tests expose type errors.
-
-## Open Questions
-
-- Should the public field be only `experiments.default`, or should
-  `default_experiment` remain documented as a short alias?
-- Should `agent` be purely descriptive in AgentV, or should it become a harness
-  selector once non-target harness adapters exist?
-- Should experiment setup reuse workspace hooks exactly, or have a smaller setup
-  step schema that compiles into workspace hooks?
-- Should experiment fingerprints be written in Phase 2 with no cache behavior, or
-  wait until result reuse can consume them?
diff --git a/docs/solutions/architecture-patterns/separate-eval-tasks-from-experiment-runtime.md b/docs/solutions/architecture-patterns/separate-eval-tasks-from-experiment-runtime.md
deleted file mode 100644
index d4d66eb12..000000000
--- a/docs/solutions/architecture-patterns/separate-eval-tasks-from-experiment-runtime.md
+++ /dev/null
@@ -1,135 +0,0 @@
----
-title: "Separate eval tasks from experiment runtime"
-date: 2026-06-24
-category: architecture-patterns
-module: evaluation model
-problem_type: architecture_pattern
-component: tooling
-severity: medium
-applies_when:
-  - Designing eval, experiment, or artifact contracts for AgentV
-  - Deciding whether setup, target selection, repeat counts, or scripts belong in eval YAML
-  - Aligning AgentV with external eval conventions without copying their whole product model
-tags:
-  - experiments
-  - evals
-  - artifacts
-  - agent-eval
-  - dashboard
-  - repeat-runs
----
-
-# Separate eval tasks from experiment runtime
-
-## Context
-
-AgentV originally treated an experiment as a string label on a run while `eval.yaml` carried both the task definition and runtime setup. That made simple runs easy, but it blurred the boundary between what is being tested and how it is being tested. It also made A/B tests awkward: setup differences such as adding skill files, installing dependencies, or changing run counts had to be pushed into eval YAML or hidden behind environment variables.
-
-The experiment-separation work aligned AgentV with the useful part of Vercel `agent-eval`: an eval is the frozen task and assertion contract, while an experiment is the committed runtime variant that chooses targets, setup, scripts, repeat behavior, and filters. AgentV kept its own YAML-first authoring, target model, LLM graders, and dashboard artifact contracts instead of taking a hard runtime dependency on Vercel's package.
-
-## Guidance
-
-Keep eval definitions focused on task evidence:
-
-- prompts, datasets, and input files
-- assertions and LLM-grader criteria
-- task fixtures that represent the work being evaluated
-
-Put runtime variation in experiments:
-
-- target or target matrix selection
-- model and provider selection through existing AgentV targets
-- setup steps such as installing dependencies or injecting skill files
-- post-agent scripts
-- timeout, workers, budgets, repeat counts, and early-exit behavior
-- eval/test filters for a suite or A/B variant
-
-This keeps A/B experiments honest. A baseline and a "with skill" variant should point at the same eval task and differ only in experiment setup. If the task itself changes, the result is not an A/B comparison.
-
-Use external conventions as a lowest-common-denominator contract, not as a product takeover. The Vercel structure is useful for naming and layout:
-
-```text
-eval = what is tested
-experiment = how it is run
-run-N = one attempt inside a repeated case
-```
-
-AgentV should still preserve repo-native constraints that make it useful:
-
-- wire formats stay `snake_case`
-- YAML remains the canonical authoring path
-- existing target definitions are reused instead of introducing a parallel provider schema
-- dashboard and CI discovery stay anchored on root run manifests
-- LLM-judge assertions remain part of evals, not experiments
-
-## Why This Matters
-
-The split prevents configuration drift from becoming hidden test drift. When setup lives in an experiment, reviewers can see that two variants are testing the same task. When setup lives inside eval YAML, changing the setup can silently change the meaning of the eval suite.
-
-It also reduces future migration cost. A run can support Vercel-style experiment files, AgentV YAML experiments, repeat attempts, and dashboard browsing without forcing every consumer to understand every nested artifact. Root manifests remain the loading contract; nested files are evidence.
-
-## When to Apply
-
-- Adding a new run-level knob such as repeat count, timeout, workers, budget, sandbox, setup, or post-run scripts.
-- Designing an example that compares one agent/model/setup against another.
-- Moving a field out of eval YAML and deciding where backward compatibility should live.
-- Changing artifact layout for repeat runs or dashboard browsing.
-- Mapping an external eval convention into AgentV.
-
-## Examples
-
-**Prefer experiment setup for A/B variants:**
-
-```yaml
-name: copilot-with-skill
-target: copilot
-evals:
-  - bug-fix-*
-setup:
-  - script: cp skills/repo-debugging/AGENTS.md ./
-repeat:
-  count: 4
-  strategy: pass_at_k
-early_exit: false
-```
-
-**Keep the eval task independent of the runtime variant:**
-
-```yaml
-name: bug-fix-suite
-tests:
-  - id: bug-fix-001
-    input_files:
-      - PROMPT.md
-    assertions:
-      - type: llm-grader
-        target: grader
-```
-
-**Use root manifests for discovery and nested files for evidence:**
-
-```text
-.agentv/results/<experiment>/<timestamp>/
-  index.jsonl
-  summary.json
-  timing.json
-  <suite>/<case>/
-    task/PROMPT.md
-    summary.json
-    grading.json
-    run-1/
-      result.json
-      grading.json
-      transcript.json
-      transcript-raw.jsonl
-      outputs/answer.md
-```
-
-The dashboard should discover runs from root manifests and learn case locations from `index.jsonl` fields such as `artifact_dir`, `task_dir`, `summary_path`, and `grading_path`. It should not depend on optional per-attempt sidecars for discovery.
-
-## Related
-
-- `docs/adr/0006-separate-experiments-from-eval-definitions.md` - architecture decision for the split
-- `docs/plans/2026-06-23-002-experiments-separation-plan.md` - phased implementation plan
-- `docs/plans/2026-06-23-001-feat-repeat-runs-flaky-evals-plan.md` - repeat-run placement reconciled to experiments
-- `docs/solutions/best-practices/prefer-isolated-runtime-boundaries-for-agent-sdk-providers.md` - adjacent guidance on keeping provider runtime instability outside artifact finalization
diff --git a/examples/features/trials/README.md b/examples/features/trials/README.md
index 12241151f..35e0c81d4 100644
--- a/examples/features/trials/README.md
+++ b/examples/features/trials/README.md
@@ -1,33 +1,30 @@
 # Repeat Runs
 
-This example keeps the eval file focused on the task and puts repeat/run-count
-behavior in committed experiment files.
+This example keeps the runnable contract in one eval file. The inline
+`experiment:` block configures target selection and repeat/run-count behavior.
 
 ## Files
 
-- `evals/dataset.eval.yaml` defines the two task cases.
-- `experiments/default.yaml` runs the cases with `pass_at_k`.
-- `experiments/mean.yaml` aggregates repeated scores with `mean`.
-- `experiments/confidence-interval.yaml` aggregates repeated scores with a 95%
-  confidence interval lower bound.
+- `evals/dataset.eval.yaml` defines the task cases and inline runtime config.
 
 ## Run
 
 ```bash
-bun agentv eval --experiment examples/features/trials/experiments/default.yaml
+bun agentv eval examples/features/trials/evals/dataset.eval.yaml
 ```
 
-Swap the experiment path to try the other strategies.
+Edit `experiment.repeat.strategy` to try `mean` or `confidence_interval`.
 
 ## Migration from old `execution.trials`
 
-The repeat block now lives on the experiment, not in `eval.yaml`:
+The repeat block now lives under `experiment:` in `eval.yaml`:
 
 ```yaml
-repeat:
-  count: 2
-  strategy: pass_at_k
-  cost_limit_usd: 1.00
+experiment:
+  repeat:
+    count: 2
+    strategy: pass_at_k
+    cost_limit_usd: 1.00
 ```
 
 Field mapping:
diff --git a/examples/features/trials/evals/dataset.eval.yaml b/examples/features/trials/evals/dataset.eval.yaml
index d7592621a..791f1eade 100644
--- a/examples/features/trials/evals/dataset.eval.yaml
+++ b/examples/features/trials/evals/dataset.eval.yaml
@@ -1,8 +1,15 @@
 # AgentV Repeat Strategy Example
-# Demonstrates experiment-level repeat runs for handling LLM non-determinism
+# Demonstrates inline experiment repeat runs for handling LLM non-determinism
 
 name: trials
-description: Repeat strategy example - pass@k with 2 attempts configured by experiment
+description: Repeat strategy example - pass@k with 2 attempts configured inline
+
+experiment:
+  target: llm
+  repeat:
+    count: 2
+    strategy: pass_at_k
+    cost_limit_usd: 1.00
 
 tests:
   - id: math-basics
diff --git a/examples/features/trials/experiments/confidence-interval.yaml b/examples/features/trials/experiments/confidence-interval.yaml
deleted file mode 100644
index 50199244d..000000000
--- a/examples/features/trials/experiments/confidence-interval.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-name: trials-confidence-interval
-target: llm
-suites:
-  - ref: examples/features/trials/evals/dataset.eval.yaml
-    select:
-      test_ids:
-        - math-basics
-        - capital-knowledge
-repeat:
-  count: 5
-  strategy: confidence_interval
-  cost_limit_usd: 2.00
diff --git a/examples/features/trials/experiments/default.yaml b/examples/features/trials/experiments/default.yaml
deleted file mode 100644
index 41f173be1..000000000
--- a/examples/features/trials/experiments/default.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-name: trials
-target: llm
-suites:
-  - ref: examples/features/trials/evals/dataset.eval.yaml
-    select:
-      test_ids:
-        - math-basics
-        - capital-knowledge
-repeat:
-  count: 2
-  strategy: pass_at_k
-  cost_limit_usd: 1.00
diff --git a/examples/features/trials/experiments/mean.yaml b/examples/features/trials/experiments/mean.yaml
deleted file mode 100644
index fab967b8f..000000000
--- a/examples/features/trials/experiments/mean.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-name: trials-mean
-target: llm
-suites:
-  - ref: examples/features/trials/evals/dataset.eval.yaml
-    select:
-      test_ids:
-        - math-basics
-        - capital-knowledge
-repeat:
-  count: 3
-  strategy: mean
-  cost_limit_usd: 1.50
diff --git a/examples/showcase/multi-model-benchmark/README.md b/examples/showcase/multi-model-benchmark/README.md
index b22675a9c..899a453a4 100644
--- a/examples/showcase/multi-model-benchmark/README.md
+++ b/examples/showcase/multi-model-benchmark/README.md
@@ -17,9 +17,7 @@ Demonstrates a complete **multi-model × multi-metric × variability** evaluatio
 multi-model-benchmark/
 ├── README.md                        # This file
 ├── evals/
-│   └── benchmark.eval.yaml          # Eval definition (task cases + metrics)
-├── experiments/
-│   └── default.yaml                 # Targets, repeat policy, and run knobs
+│   └── benchmark.eval.yaml          # Eval definition, targets, repeat policy, and metrics
 └── prompts/
     ├── accuracy-rubric.md           # Factual correctness grader (weight 3.0)
     ├── completeness-rubric.md       # Coverage grader (weight 2.0)
@@ -37,19 +35,18 @@ From the repository root:
 
 ```bash
 # Run the full matrix (all targets × all tests × 2 repeat attempts)
-bun agentv eval --experiment examples/showcase/multi-model-benchmark/experiments/default.yaml
+bun agentv eval examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml
 ```
 
 ### Cost & Safety
 
-The eval uses **low-cost models by default** (the targets defined in `.agentv/targets.yaml` such as `gpt-5-mini`, `claude-haiku`, `gemini-flash`). With 5 tests × 3 targets × 2 repeat attempts × 3 grader calls each, expect roughly **90 LLM calls**. A `repeat.cost_limit_usd: 2.00` cap is set in the experiment file.
+The eval uses **low-cost models by default** (the targets defined in `.agentv/targets.yaml` such as `gpt-5-mini`, `claude-haiku`, `gemini-flash`). With 5 tests × 3 targets × 2 repeat attempts × 3 grader calls each, expect roughly **90 LLM calls**. An `experiment.repeat.cost_limit_usd: 2.00` cap is set in the eval file.
 
 To run against a single target first:
 
 ```bash
 # Test with just one model before running the full matrix
 bun agentv eval examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml \
-  --experiment examples/showcase/multi-model-benchmark/experiments/default.yaml \
   --target copilot
 ```
 
@@ -59,16 +56,16 @@ The eval produces a canonical run workspace with `target` in each `index.jsonl`
 
 ```bash
 # N-way matrix — see all models at once
-agentv compare .agentv/results/default/<timestamp>/index.jsonl
+agentv compare .agentv/results/multi-model-benchmark/<timestamp>/index.jsonl
 
 # Designate a baseline for CI regression gating
-agentv compare .agentv/results/default/<timestamp>/index.jsonl --baseline copilot
+agentv compare .agentv/results/multi-model-benchmark/<timestamp>/index.jsonl --baseline copilot
 
 # Pairwise: compare two specific targets
-agentv compare .agentv/results/default/<timestamp>/index.jsonl --baseline copilot --candidate claude
+agentv compare .agentv/results/multi-model-benchmark/<timestamp>/index.jsonl --baseline copilot --candidate claude
 
 # JSON output for CI integration
-agentv compare .agentv/results/default/<timestamp>/index.jsonl --json
+agentv compare .agentv/results/multi-model-benchmark/<timestamp>/index.jsonl --json
 ```
 
 ### Expected Output
@@ -96,13 +93,14 @@ Pairwise Summary:
 
 ### 1. Targets Matrix
 
-The experiment `targets` array runs every test against each listed model:
+The inline `experiment.targets` array runs every test against each listed model:
 
 ```yaml
-targets:
-  - copilot       # e.g., gpt-5-mini
-  - claude        # e.g., claude-haiku
-  - gemini-llm   # e.g., gemini-flash
+experiment:
+  targets:
+    - copilot       # e.g., gpt-5-mini
+    - claude        # e.g., claude-haiku
+    - gemini-llm   # e.g., gemini-flash
 ```
 
 ### 2. Weighted Graders
@@ -123,26 +121,19 @@ Weighted average formula: `(3×accuracy + 2×completeness + 1×clarity) / 6`
 
 ### 3. Experiment repeat
 
-Each test runs twice through the committed experiment. `pass_at_k` uses early-exit
-ergonomics by default: a case can stop once any attempt succeeds.
+Each test runs up to twice through the inline experiment block. `pass_at_k` uses
+early-exit ergonomics by default: a case can stop once any attempt succeeds.
 
 ```yaml
-targets:
-  - copilot
-  - claude
-  - gemini-llm
-suites:
-  - ref: examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml
-    select:
-      test_ids:
-        - factual-*
-        - analytical-comparison
-        - creative-explanation
-        - structured-list
-repeat:
-  count: 2
-  strategy: pass_at_k
-  cost_limit_usd: 2.00
+experiment:
+  targets:
+    - copilot
+    - claude
+    - gemini-llm
+  repeat:
+    count: 2
+    strategy: pass_at_k
+    cost_limit_usd: 2.00
 ```
 
 This surfaces non-determinism — if a model passes on run 1 but fails on run 2,
diff --git a/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml b/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml
index d3bffb350..f59d88555 100644
--- a/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml
+++ b/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml
@@ -3,18 +3,28 @@
 # Demonstrates multi-model × multi-metric × variability workflow:
 #   - Targets matrix: runs every test against multiple models
 #   - Weighted graders: accuracy (3×), completeness (2×), clarity (1×)
-#   - Experiment repeat: pass@k with 2 attempts to measure variability
+#   - Inline experiment repeat: pass@k with 2 attempts to measure variability
 #
 # Default targets use low-cost models for safe experimentation.
 # Override with: agentv eval ... --target <name>
 #
 # Usage:
-#   agentv eval examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml \
-#     --experiment examples/showcase/multi-model-benchmark/experiments/default.yaml
+#   agentv eval examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml
 
+name: multi-model-benchmark
 description: Multi-model benchmark — accuracy, completeness, and clarity across models
 tags: [multi-provider]
 
+experiment:
+  targets:
+    - copilot
+    - claude
+    - gemini-llm
+  repeat:
+    count: 2
+    strategy: pass_at_k
+    cost_limit_usd: 2.00
+
 assertions:
   - name: accuracy
     type: llm-grader
diff --git a/examples/showcase/multi-model-benchmark/experiments/default.yaml b/examples/showcase/multi-model-benchmark/experiments/default.yaml
deleted file mode 100644
index cb941192e..000000000
--- a/examples/showcase/multi-model-benchmark/experiments/default.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-name: multi-model-benchmark
-targets:
-  - copilot
-  - claude
-  - gemini-llm
-suites:
-  - ref: examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml
-    select:
-      test_ids:
-        - factual-geography
-        - factual-science
-        - analytical-comparison
-        - creative-explanation
-        - structured-list
-repeat:
-  count: 2
-  strategy: pass_at_k
-  cost_limit_usd: 2.00
diff --git a/packages/core/scripts/generate-eval-schema.ts b/packages/core/scripts/generate-eval-schema.ts
index fa1877444..71d323b01 100644
--- a/packages/core/scripts/generate-eval-schema.ts
+++ b/packages/core/scripts/generate-eval-schema.ts
@@ -8,7 +8,6 @@ import { writeFile } from 'node:fs/promises';
 import path from 'node:path';
 import { zodToJsonSchema } from 'zod-to-json-schema';
 import { EvalFileSchema } from '../src/evaluation/validation/eval-file.schema.js';
-import { ExperimentFileSchema } from '../src/evaluation/validation/experiment-file.schema.js';
 
 async function writeSchema(options: {
   readonly schema: Parameters<typeof zodToJsonSchema>[0];
@@ -47,11 +46,3 @@ await writeSchema({
   description: 'Schema for AgentV evaluation YAML files (.eval.yaml)',
   outputFile: 'eval-schema.json',
 });
-
-await writeSchema({
-  schema: ExperimentFileSchema,
-  name: 'ExperimentFile',
-  title: 'AgentV Experiment File',
-  description: 'Schema for AgentV experiment YAML files (experiments/*.yaml)',
-  outputFile: 'experiment-schema.json',
-});
diff --git a/packages/core/src/evaluation/config.ts b/packages/core/src/evaluation/config.ts
index 7d2aa6adb..a82708ed0 100644
--- a/packages/core/src/evaluation/config.ts
+++ b/packages/core/src/evaluation/config.ts
@@ -67,18 +67,6 @@ const AgentVConfigSchema = z.object({
     })
     .optional(),
 
-  /** Compatibility shorthand for experiments.default */
-  defaultExperiment: z.string().optional(),
-
-  /** Experiment defaults */
-  experiments: z
-    .object({
-      /** Default experiment label or path used when eval runs omit --experiment */
-      default: z.string().optional(),
-    })
-    .strict()
-    .optional(),
-
   /** Cost and duration limits */
   limits: z
     .object({
diff --git a/packages/core/src/evaluation/experiment.ts b/packages/core/src/evaluation/experiment.ts
index a572511b3..b0659c3f9 100644
--- a/packages/core/src/evaluation/experiment.ts
+++ b/packages/core/src/evaluation/experiment.ts
@@ -1,10 +1,6 @@
 import { createHash } from 'node:crypto';
-import { readFile } from 'node:fs/promises';
-import path from 'node:path';
-import { pathToFileURL } from 'node:url';
 
-import type { TrialStrategy } from './types.js';
-import { parseYamlValue } from './yaml-loader.js';
+import type { EvalRunOverride, TrialStrategy } from './types.js';
 
 export type ExperimentSandbox = 'auto' | 'docker' | 'vercel';
 
@@ -24,27 +20,6 @@ export type ExperimentTargetRef =
       readonly hooks?: Record<string, unknown>;
     };
 
-export type ExperimentScriptWire =
-  | string
-  | {
-      readonly command?: string | readonly string[];
-      readonly script?: string | readonly string[];
-      readonly timeout_seconds?: number;
-      readonly cwd?: string;
-      readonly env?: Record<string, string>;
-    };
-
-export type ExperimentScript = {
-  readonly command?: readonly string[];
-  readonly script?: string | readonly string[];
-  readonly timeoutSeconds?: number;
-  readonly cwd?: string;
-  readonly env?: Record<string, string>;
-};
-
-export type ExperimentSetupFn = (sandbox: unknown) => void | Promise<void>;
-export type ExperimentSetup = readonly ExperimentScript[] | ExperimentSetupFn;
-
 export type ExperimentRepeatWire = {
   readonly count?: number;
   readonly strategy?: TrialStrategy;
@@ -58,25 +33,6 @@ export type ExperimentRepeat = {
   readonly costLimitUsd?: number;
 };
 
-export type ExperimentSuiteSelectWire = {
-  readonly test_ids?: readonly string[];
-  readonly testIds?: readonly string[];
-};
-
-export type ExperimentSuiteSelect = {
-  readonly testIds: readonly string[];
-};
-
-export type ExperimentSuiteRefWire = {
-  readonly ref: string;
-  readonly select?: ExperimentSuiteSelectWire;
-};
-
-export type ExperimentSuiteRef = {
-  readonly ref: string;
-  readonly select?: ExperimentSuiteSelect;
-};
-
 export type ExperimentConfigWire = {
   readonly name?: string;
   readonly agent?: string;
@@ -84,17 +40,15 @@ export type ExperimentConfigWire = {
   readonly targets?: readonly ExperimentTargetRefWire[];
   readonly model?: string;
   readonly agent_options?: Record<string, unknown>;
-  readonly suites?: readonly ExperimentSuiteRefWire[];
-  readonly scripts?: readonly ExperimentScriptWire[];
   readonly repeat?: ExperimentRepeatWire;
   readonly runs?: number;
   readonly early_exit?: boolean;
   readonly timeout_seconds?: number;
   readonly workers?: number;
+  readonly threshold?: number;
   readonly budget_usd?: number;
   readonly sandbox?: ExperimentSandbox;
   readonly workspace?: Record<string, unknown>;
-  readonly setup?: readonly ExperimentScriptWire[] | ExperimentSetupFn;
 };
 
 export type ExperimentConfig = {
@@ -104,35 +58,25 @@ export type ExperimentConfig = {
   readonly targets?: readonly ExperimentTargetRef[];
   readonly model?: string;
   readonly agentOptions?: Record<string, unknown>;
-  readonly suites?: readonly ExperimentSuiteRef[];
-  readonly scripts?: readonly ExperimentScript[];
   readonly repeat?: ExperimentRepeat;
   readonly runs?: number;
   readonly earlyExit?: boolean;
   readonly timeoutSeconds?: number;
   readonly workers?: number;
+  readonly threshold?: number;
   readonly budgetUsd?: number;
   readonly sandbox?: ExperimentSandbox;
   readonly workspace?: Record<string, unknown>;
-  readonly setup?: ExperimentSetup;
-  readonly sourcePath?: string;
   readonly fingerprint?: string;
 };
 
 export type ExperimentArtifactMetadata = {
   readonly name?: string;
-  readonly source_path?: string;
   readonly fingerprint?: string;
   readonly agent?: string;
   readonly target?: string;
   readonly targets?: readonly string[];
   readonly model?: string;
-  readonly suites?: readonly {
-    readonly ref: string;
-    readonly select?: {
-      readonly test_ids: readonly string[];
-    };
-  }[];
   readonly repeat?: {
     readonly count: number;
     readonly strategy: TrialStrategy;
@@ -142,65 +86,29 @@ export type ExperimentArtifactMetadata = {
   readonly early_exit?: boolean;
   readonly timeout_seconds?: number;
   readonly workers?: number;
+  readonly threshold?: number;
   readonly budget_usd?: number;
   readonly sandbox?: ExperimentSandbox;
 };
 
-type NormalizeOptions = {
-  readonly sourcePath?: string;
-};
-
-const EXPERIMENT_FILE_EXTENSIONS = new Set(['.yaml', '.yml', '.ts', '.js', '.mts', '.mjs']);
 const VALID_SANDBOXES: ReadonlySet<string> = new Set(['auto', 'docker', 'vercel']);
 const VALID_REPEAT_STRATEGIES: ReadonlySet<string> = new Set([
   'pass_at_k',
+  'pass_all',
   'mean',
   'confidence_interval',
 ]);
 
-export function isExperimentFileReference(value: string): boolean {
-  const trimmed = value.trim();
-  if (!trimmed) {
-    return false;
-  }
-  return (
-    trimmed.includes('/') ||
-    trimmed.includes('\\') ||
-    EXPERIMENT_FILE_EXTENSIONS.has(path.extname(trimmed).toLowerCase())
-  );
-}
-
-export function deriveExperimentNameFromPath(filePath: string): string {
-  return path
-    .basename(filePath)
-    .replace(/\.experiment\.(ya?ml|[cm]?[jt]s)$/i, '')
-    .replace(/\.(ya?ml|[cm]?[jt]s)$/i, '');
-}
-
-export async function loadExperimentConfig(filePath: string): Promise<ExperimentConfig> {
-  const resolvedPath = path.resolve(filePath);
-  const ext = path.extname(resolvedPath).toLowerCase();
-  let rawConfig: unknown;
-
-  if (ext === '.yaml' || ext === '.yml') {
-    rawConfig = parseYamlValue(await readFile(resolvedPath, 'utf8'));
-  } else if (EXPERIMENT_FILE_EXTENSIONS.has(ext)) {
-    const moduleUrl = pathToFileURL(resolvedPath).href;
-    const mod = await import(moduleUrl);
-    rawConfig = mod.default ?? mod.config ?? mod;
-  } else {
-    throw new Error(
-      `Unsupported experiment file extension '${ext}'. Use .yaml, .yml, .ts, .js, .mts, or .mjs.`,
-    );
-  }
-
-  return normalizeExperimentConfig(rawConfig, { sourcePath: resolvedPath });
-}
+const RUN_OVERRIDE_FIELDS: ReadonlySet<string> = new Set([
+  'threshold',
+  'repeat',
+  'timeout_seconds',
+  'timeoutSeconds',
+  'budget_usd',
+  'budgetUsd',
+]);
 
-export function normalizeExperimentConfig(
-  rawConfig: unknown,
-  options: NormalizeOptions = {},
-): ExperimentConfig {
+export function normalizeExperimentConfig(rawConfig: unknown): ExperimentConfig {
   if (!isRecord(rawConfig)) {
     throw new Error('Experiment config must be an object.');
   }
@@ -211,8 +119,7 @@ export function normalizeExperimentConfig(
   const targets = readTargets(rawConfig.targets);
   const model = readOptionalString(rawConfig.model, 'model');
   const agentOptions = readOptionalRecord(rawConfig.agent_options ?? rawConfig.agentOptions);
-  const suites = readSuites(rawConfig.suites);
-  const scripts = readScriptArray(rawConfig.scripts, 'scripts');
+  rejectExperimentLifecycleCommands(rawConfig);
   const repeat = readRepeat(rawConfig.repeat);
   const runs = readOptionalPositiveInteger(rawConfig.runs, 'runs');
   if (repeat !== undefined && runs !== undefined) {
@@ -224,13 +131,13 @@ export function normalizeExperimentConfig(
     'timeout_seconds',
   );
   const workers = readOptionalPositiveInteger(rawConfig.workers, 'workers');
+  const threshold = readOptionalThreshold(rawConfig.threshold);
   const budgetUsd = readOptionalPositiveNumber(
     rawConfig.budget_usd ?? rawConfig.budgetUsd,
     'budget_usd',
   );
   const sandbox = readOptionalSandbox(rawConfig.sandbox);
   const workspace = readOptionalRecord(rawConfig.workspace);
-  const setup = readSetup(rawConfig.setup);
 
   const configWithoutFingerprint: Omit<ExperimentConfig, 'fingerprint'> = {
     ...(name !== undefined && { name }),
@@ -239,18 +146,15 @@ export function normalizeExperimentConfig(
     ...(targets !== undefined && { targets }),
     ...(model !== undefined && { model }),
     ...(agentOptions !== undefined && { agentOptions }),
-    ...(suites !== undefined && { suites }),
-    ...(scripts !== undefined && { scripts }),
     ...(repeat !== undefined && { repeat }),
     ...(runs !== undefined && { runs }),
     ...(earlyExit !== undefined && { earlyExit }),
     ...(timeoutSeconds !== undefined && { timeoutSeconds }),
     ...(workers !== undefined && { workers }),
+    ...(threshold !== undefined && { threshold }),
     ...(budgetUsd !== undefined && { budgetUsd }),
     ...(sandbox !== undefined && { sandbox }),
     ...(workspace !== undefined && { workspace }),
-    ...(setup !== undefined && { setup }),
-    ...(options.sourcePath !== undefined && { sourcePath: options.sourcePath }),
   };
 
   return {
@@ -259,6 +163,37 @@ export function normalizeExperimentConfig(
   };
 }
 
+export function normalizeExperimentRunOverride(rawConfig: unknown): EvalRunOverride {
+  if (!isRecord(rawConfig)) {
+    throw new Error('Run override must be an object.');
+  }
+  for (const key of Object.keys(rawConfig)) { // every key must be in RUN_OVERRIDE_FIELDS
+    if (!RUN_OVERRIDE_FIELDS.has(key)) {
+      throw new Error(
+        `Invalid run override field '${key}'. Scoped run overrides support only threshold, repeat, timeout_seconds, and budget_usd.`,
+      );
+    }
+  }
+
+  const threshold = readOptionalThreshold(rawConfig.threshold);
+  const repeat = readRepeat(rawConfig.repeat);
+  const timeoutSeconds = readOptionalPositiveNumber(
+    rawConfig.timeout_seconds ?? rawConfig.timeoutSeconds, // snake_case key is tried first
+    'timeout_seconds',
+  );
+  const budgetUsd = readOptionalPositiveNumber(
+    rawConfig.budget_usd ?? rawConfig.budgetUsd, // snake_case key is tried first
+    'budget_usd',
+  );
+
+  return { // fields that resolved to undefined are omitted from the result
+    ...(threshold !== undefined && { threshold }),
+    ...(repeat !== undefined && { repeat }),
+    ...(timeoutSeconds !== undefined && { timeoutSeconds }),
+    ...(budgetUsd !== undefined && { budgetUsd }),
+  };
+}
+
 export function fingerprintExperimentConfig(config: ExperimentConfig): string {
   const stablePayload = toStableJsonValue(config);
   return createHash('sha256').update(JSON.stringify(stablePayload)).digest('hex');
@@ -275,13 +210,11 @@ export function buildExperimentArtifactMetadata(
     .filter((target) => target.trim().length > 0);
   return {
     ...(config.name !== undefined && { name: config.name }),
-    ...(config.sourcePath !== undefined && { source_path: config.sourcePath }),
     ...(config.fingerprint !== undefined && { fingerprint: config.fingerprint }),
     ...(config.agent !== undefined && { agent: config.agent }),
     ...(config.target !== undefined && { target: config.target }),
     ...(targets && targets.length > 0 && { targets }),
     ...(config.model !== undefined && { model: config.model }),
-    ...(config.suites !== undefined && { suites: config.suites.map(toSuiteArtifactMetadata) }),
     ...(config.repeat !== undefined && {
       repeat: {
         count: config.repeat.count,
@@ -295,25 +228,12 @@ export function buildExperimentArtifactMetadata(
     ...(config.earlyExit !== undefined && { early_exit: config.earlyExit }),
     ...(config.timeoutSeconds !== undefined && { timeout_seconds: config.timeoutSeconds }),
     ...(config.workers !== undefined && { workers: config.workers }),
+    ...(config.threshold !== undefined && { threshold: config.threshold }),
     ...(config.budgetUsd !== undefined && { budget_usd: config.budgetUsd }),
     ...(config.sandbox !== undefined && { sandbox: config.sandbox }),
   };
 }
 
-function toSuiteArtifactMetadata(suite: ExperimentSuiteRef): {
-  readonly ref: string;
-  readonly select?: { readonly test_ids: readonly string[] };
-} {
-  return {
-    ref: suite.ref,
-    ...(suite.select !== undefined && {
-      select: {
-        test_ids: suite.select.testIds,
-      },
-    }),
-  };
-}
-
 function readRepeat(raw: unknown): ExperimentRepeat | undefined {
   if (raw === undefined) {
     return undefined;
@@ -363,156 +283,6 @@ function readTargets(raw: unknown): readonly ExperimentTargetRef[] | undefined {
   });
 }
 
-function readSuites(raw: unknown): readonly ExperimentSuiteRef[] | undefined {
-  if (raw === undefined) {
-    return undefined;
-  }
-  if (!Array.isArray(raw)) {
-    throw new Error('Experiment suites must be an array.');
-  }
-  if (raw.length === 0) {
-    throw new Error('Experiment suites must not be empty.');
-  }
-  return raw.map((entry, index): ExperimentSuiteRef => {
-    if (!isRecord(entry)) {
-      throw new Error(`Experiment suites[${index}] must be an object.`);
-    }
-    const ref = readRequiredString(entry.ref, `suites[${index}].ref`);
-    const select = readSuiteSelect(entry.select, `suites[${index}].select`);
-    return {
-      ref,
-      ...(select !== undefined && { select }),
-    };
-  });
-}
-
-function readSuiteSelect(raw: unknown, location: string): ExperimentSuiteSelect | undefined {
-  if (raw === undefined) {
-    return undefined;
-  }
-  if (!isRecord(raw)) {
-    throw new Error(`Experiment ${location} must be an object.`);
-  }
-  const testIds = readOptionalStringArray(raw.test_ids ?? raw.testIds, `${location}.test_ids`);
-  if (testIds === undefined) {
-    throw new Error(`Experiment ${location}.test_ids is required when select is set.`);
-  }
-  return { testIds };
-}
-
-function readScriptArray(raw: unknown, location: string): readonly ExperimentScript[] | undefined {
-  if (raw === undefined) {
-    return undefined;
-  }
-  if (!Array.isArray(raw)) {
-    throw new Error(`Experiment ${location} must be an array.`);
-  }
-  return raw.map((entry, index) => readScript(entry, `${location}[${index}]`));
-}
-
-function readSetup(raw: unknown): ExperimentSetup | undefined {
-  if (raw === undefined) {
-    return undefined;
-  }
-  if (typeof raw === 'function') {
-    return raw as ExperimentSetupFn;
-  }
-  return readScriptArray(raw, 'setup');
-}
-
-function readScript(raw: unknown, location: string): ExperimentScript {
-  if (typeof raw === 'string') {
-    const script = raw.trim();
-    if (!script) {
-      throw new Error(`Experiment ${location} must not be empty.`);
-    }
-    return { script };
-  }
-  if (!isRecord(raw)) {
-    throw new Error(`Experiment ${location} must be a string or object.`);
-  }
-
-  const command = readOptionalCommand(raw.command, `${location}.command`);
-  const script = readOptionalStringOrStringArray(raw.script, `${location}.script`);
-  if (command === undefined && script === undefined) {
-    throw new Error(`Experiment ${location} must define command or script.`);
-  }
-
-  const timeoutSeconds = readOptionalPositiveNumber(
-    raw.timeout_seconds ?? raw.timeoutSeconds,
-    `${location}.timeout_seconds`,
-  );
-  const cwd = readOptionalString(raw.cwd, `${location}.cwd`);
-  const env = readOptionalStringRecord(raw.env, `${location}.env`);
-
-  return {
-    ...(command !== undefined && { command }),
-    ...(script !== undefined && { script }),
-    ...(timeoutSeconds !== undefined && { timeoutSeconds }),
-    ...(cwd !== undefined && { cwd }),
-    ...(env !== undefined && { env }),
-  };
-}
-
-function readOptionalCommand(raw: unknown, location: string): readonly string[] | undefined {
-  if (raw === undefined) {
-    return undefined;
-  }
-  if (typeof raw === 'string') {
-    const command = raw.trim();
-    if (!command) {
-      throw new Error(`Experiment ${location} must not be empty.`);
-    }
-    return ['sh', '-c', command];
-  }
-  if (
-    Array.isArray(raw) &&
-    raw.length > 0 &&
-    raw.every((entry) => typeof entry === 'string' && entry.trim())
-  ) {
-    return raw.map((entry) => entry.trim());
-  }
-  throw new Error(`Experiment ${location} must be a string or string array.`);
-}
-
-function readOptionalStringOrStringArray(
-  raw: unknown,
-  location: string,
-): string | readonly string[] | undefined {
-  if (raw === undefined) {
-    return undefined;
-  }
-  if (typeof raw === 'string') {
-    const trimmed = raw.trim();
-    if (!trimmed) {
-      throw new Error(`Experiment ${location} must not be empty.`);
-    }
-    return trimmed;
-  }
-  if (
-    Array.isArray(raw) &&
-    raw.length > 0 &&
-    raw.every((entry) => typeof entry === 'string' && entry.trim())
-  ) {
-    return raw.map((entry) => entry.trim());
-  }
-  throw new Error(`Experiment ${location} must be a string or string array.`);
-}
-
-function readOptionalStringArray(raw: unknown, location: string): readonly string[] | undefined {
-  if (raw === undefined) {
-    return undefined;
-  }
-  if (
-    Array.isArray(raw) &&
-    raw.length > 0 &&
-    raw.every((entry) => typeof entry === 'string' && entry.trim())
-  ) {
-    return raw.map((entry) => entry.trim());
-  }
-  throw new Error(`Experiment ${location} must be a non-empty string array.`);
-}
-
 function readOptionalString(raw: unknown, location: string): string | undefined {
   if (raw === undefined) {
     return undefined;
@@ -581,6 +351,16 @@ function readOptionalNonNegativeNumber(raw: unknown, location: string): number |
   return raw;
 }
 
+function readOptionalThreshold(raw: unknown): number | undefined {
+  if (raw === undefined) { // an absent threshold is allowed and stays undefined
+    return undefined;
+  }
+  if (typeof raw !== 'number' || !(raw >= 0 && raw <= 1)) { // negated range also rejects NaN
+    throw new Error('Experiment threshold must be a number between 0 and 1.');
+  }
+  return raw;
+}
+
 function readOptionalPositiveNumber(raw: unknown, location: string): number | undefined {
   if (raw === undefined) {
     return undefined;
@@ -611,27 +391,23 @@ function readOptionalRecord(raw: unknown): Record<string, unknown> | undefined {
   return raw;
 }
 
-function readOptionalStringRecord(
-  raw: unknown,
-  location: string,
-): Record<string, string> | undefined {
-  if (raw === undefined) {
-    return undefined;
-  }
-  if (!isRecord(raw)) {
-    throw new Error(`Experiment ${location} must be an object.`);
-  }
-  const entries = Object.entries(raw);
-  if (!entries.every((entry): entry is [string, string] => typeof entry[1] === 'string')) {
-    throw new Error(`Experiment ${location} values must be strings.`);
-  }
-  return Object.fromEntries(entries);
-}
-
 function isRecord(value: unknown): value is Record<string, unknown> {
   return typeof value === 'object' && value !== null && !Array.isArray(value);
 }
 
+function rejectExperimentLifecycleCommands(rawConfig: Record<string, unknown>): void {
+  if (rawConfig.setup !== undefined) { // any defined value, including null, is rejected
+    throw new Error(
+      'Experiment setup is not supported. Use workspace.hooks for repo setup or targets[].hooks for runner setup.',
+    );
+  }
+  if (rawConfig.scripts !== undefined) { // same rule as setup above
+    throw new Error(
+      'Experiment scripts are not supported. Use workspace.hooks for repo setup or targets[].hooks for runner setup.',
+    );
+  }
+}
+
 function toStableJsonValue(value: unknown): unknown {
   if (typeof value === 'function') {
     return '[function]';
diff --git a/packages/core/src/evaluation/loaders/config-loader.ts b/packages/core/src/evaluation/loaders/config-loader.ts
index 5b2b759fd..74bbd7b40 100644
--- a/packages/core/src/evaluation/loaders/config-loader.ts
+++ b/packages/core/src/evaluation/loaders/config-loader.ts
@@ -70,17 +70,9 @@ export type HooksConfig = {
   readonly before_session?: string;
 };
 
-export type ExperimentsConfig = {
-  /** Default experiment label or path used when `agentv eval` omits --experiment. */
-  readonly default?: string;
-};
-
 export type AgentVConfig = {
   readonly required_version?: string;
   readonly eval_patterns?: readonly string[];
-  /** Compatibility shorthand for experiments.default. */
-  readonly default_experiment?: string;
-  readonly experiments?: ExperimentsConfig;
   readonly execution?: ExecutionDefaults;
   readonly results?: ResultsConfig;
   readonly hooks?: HooksConfig;
@@ -191,13 +183,15 @@ function parseConfigObject(
       (parsed as Record<string, unknown>).execution,
       configPath,
     );
-    const defaultExperiment = parseDefaultExperiment(
+    warnRemovedExperimentPointer(
       (parsed as Record<string, unknown>).default_experiment,
       configPath,
+      'default_experiment',
     );
-    const experiments = parseExperimentsConfig(
+    warnRemovedExperimentPointer(
       (parsed as Record<string, unknown>).experiments,
       configPath,
+      'experiments',
     );
     const results = parseResultsConfig((parsed as Record<string, unknown>).results, configPath);
     const hooks = parseHooksConfig((parsed as Record<string, unknown>).hooks, configPath);
@@ -205,8 +199,6 @@ function parseConfigObject(
     return {
       required_version: requiredVersion as string | undefined,
       eval_patterns: evalPatterns as readonly string[] | undefined,
-      ...(defaultExperiment && { default_experiment: defaultExperiment }),
-      ...(experiments && { experiments }),
       execution: executionDefaults,
       results,
       ...(hooks && { hooks }),
@@ -217,17 +209,26 @@ function parseConfigObject(
   }
 }
 
+function getSuiteRuntimeBlock(suite: JsonObject): Record<string, unknown> | undefined {
+  if (suite.experiment !== undefined && suite.execution !== undefined) {
+    throw new Error("Use either top-level 'experiment' or legacy 'execution', not both.");
+  }
+  const runtime = suite.experiment ?? suite.execution; // prefer experiment, else execution
+  if (!runtime || typeof runtime !== 'object' || Array.isArray(runtime)) {
+    return undefined; // block is absent or not a plain object
+  }
+  return runtime as Record<string, unknown>;
+}
+
 /**
  * Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
  */
 export function extractTargetFromSuite(suite: JsonObject): string | undefined {
-  // Check execution.target first (new location), fallback to root-level target (legacy)
-  const execution = suite.execution;
-  if (execution && typeof execution === 'object' && !Array.isArray(execution)) {
-    const executionTarget = (execution as Record<string, unknown>).target;
-    if (typeof executionTarget === 'string' && executionTarget.trim().length > 0) {
-      return executionTarget.trim();
-    }
+  // Check experiment.target first, then legacy execution.target, then root-level target.
+  const runtime = getSuiteRuntimeBlock(suite);
+  const runtimeTarget = runtime?.target;
+  if (typeof runtimeTarget === 'string' && runtimeTarget.trim().length > 0) {
+    return runtimeTarget.trim();
   }
 
   // Fallback to legacy root-level target
@@ -247,12 +248,12 @@ export function extractTargetFromSuite(suite: JsonObject): string | undefined {
 export function extractTargetRefsFromSuite(
   suite: JsonObject,
 ): readonly EvalTargetRef[] | undefined {
-  const execution = suite.execution;
-  if (!execution || typeof execution !== 'object' || Array.isArray(execution)) {
+  const runtime = getSuiteRuntimeBlock(suite);
+  if (!runtime) {
     return undefined;
   }
 
-  const targets = (execution as Record<string, unknown>).targets;
+  const targets = runtime.targets;
   if (!Array.isArray(targets)) {
     return undefined;
   }
@@ -353,12 +354,12 @@ function parseTargetHooks(raw: unknown): TargetHooksConfig | undefined {
  * Extract workers count from suite-level execution block.
  */
 export function extractWorkersFromSuite(suite: JsonObject): number | undefined {
-  const execution = suite.execution;
-  if (!execution || typeof execution !== 'object' || Array.isArray(execution)) {
+  const runtime = getSuiteRuntimeBlock(suite);
+  if (!runtime) {
     return undefined;
   }
 
-  const workers = (execution as Record<string, unknown>).workers;
+  const workers = runtime.workers;
   if (typeof workers === 'number' && Number.isInteger(workers) && workers >= 1 && workers <= 50) {
     return workers;
   }
@@ -397,12 +398,11 @@ export interface CacheConfig {
  * Returns undefined when no cache config is specified.
  */
 export function extractCacheConfig(suite: JsonObject): CacheConfig | undefined {
-  const execution = suite.execution;
-  if (!execution || typeof execution !== 'object' || Array.isArray(execution)) {
+  const executionObj = getSuiteRuntimeBlock(suite);
+  if (!executionObj) {
     return undefined;
   }
 
-  const executionObj = execution as Record<string, unknown>;
   const cache = executionObj.cache;
 
   if (cache === undefined || cache === null) {
@@ -430,13 +430,11 @@ export function extractCacheConfig(suite: JsonObject): CacheConfig | undefined {
  * Returns undefined when not specified.
  */
 export function extractBudgetUsd(suite: JsonObject): number | undefined {
-  const execution = suite.execution;
-  if (!execution || typeof execution !== 'object' || Array.isArray(execution)) {
+  const executionObj = getSuiteRuntimeBlock(suite);
+  if (!executionObj) {
     return undefined;
   }
 
-  const executionObj = execution as Record<string, unknown>;
-
   // Reject the old key with a clear error
   if ('total_budget_usd' in executionObj || 'totalBudgetUsd' in executionObj) {
     throw new Error(
@@ -464,12 +462,11 @@ export function extractBudgetUsd(suite: JsonObject): number | undefined {
  * Returns undefined when not specified.
  */
 export function extractFailOnError(suite: JsonObject): FailOnError | undefined {
-  const execution = suite.execution;
-  if (!execution || typeof execution !== 'object' || Array.isArray(execution)) {
+  const executionObj = getSuiteRuntimeBlock(suite);
+  if (!executionObj) {
     return undefined;
   }
 
-  const executionObj = execution as Record<string, unknown>;
   const raw = executionObj.fail_on_error ?? executionObj.failOnError;
 
   if (raw === undefined || raw === null) {
@@ -490,12 +487,11 @@ export function extractFailOnError(suite: JsonObject): FailOnError | undefined {
  * Returns undefined when not specified.
  */
 export function extractThreshold(suite: JsonObject): number | undefined {
-  const execution = suite.execution;
-  if (!execution || typeof execution !== 'object' || Array.isArray(execution)) {
+  const executionObj = getSuiteRuntimeBlock(suite);
+  if (!executionObj) {
     return undefined;
   }
 
-  const executionObj = execution as Record<string, unknown>;
   const raw = executionObj.threshold;
 
   if (raw === undefined || raw === null) {
@@ -586,44 +582,13 @@ export function parseExecutionDefaults(
   return Object.keys(result).length > 0 ? (result as ExecutionDefaults) : undefined;
 }
 
-export function resolveDefaultExperimentReference(
-  config: AgentVConfig | null | undefined,
-): string | undefined {
-  return config?.experiments?.default ?? config?.default_experiment;
-}
-
-function parseDefaultExperiment(raw: unknown, configPath: string): string | undefined {
+function warnRemovedExperimentPointer(raw: unknown, configPath: string, key: string): void {
   if (raw === undefined || raw === null) {
-    return undefined;
-  }
-  const value = readTrimmedString(raw);
-  if (!value) {
-    logWarning(`Invalid default_experiment in ${configPath}, expected non-empty string`);
-    return undefined;
-  }
-  return value;
-}
-
-export function parseExperimentsConfig(
-  raw: unknown,
-  configPath: string,
-): ExperimentsConfig | undefined {
-  if (raw === undefined || raw === null) {
-    return undefined;
+    return;
   }
-  if (typeof raw !== 'object' || Array.isArray(raw)) {
-    logWarning(`Invalid experiments in ${configPath}, expected object`);
-    return undefined;
-  }
-
-  const obj = raw as Record<string, unknown>;
-  const defaultExperiment = readTrimmedString(obj.default);
-  if (obj.default !== undefined && !defaultExperiment) {
-    logWarning(`Invalid experiments.default in ${configPath}, expected non-empty string`);
-    return undefined;
-  }
-
-  return defaultExperiment ? { default: defaultExperiment } : undefined;
+  logWarning(
+    `${key} in ${configPath} is ignored. Runtime configuration now belongs in eval.yaml under experiment:.`,
+  );
 }
 
 function isFilesystemPath(p: string): boolean {
diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts
index e4040aad1..3e3751dbf 100644
--- a/packages/core/src/evaluation/run-artifacts.ts
+++ b/packages/core/src/evaluation/run-artifacts.ts
@@ -192,6 +192,12 @@ export type TrialAggregationArtifact =
       readonly passed_attempts: number;
       readonly total_attempts: number;
     }
+  | {
+      readonly strategy: 'pass_all';
+      readonly passed_attempts: number;
+      readonly total_attempts: number;
+      readonly min: number;
+    }
   | {
       readonly strategy: 'mean';
       readonly mean: number;
@@ -565,6 +571,13 @@ function toTrialAggregationArtifact(
         passed_attempts: aggregation.passedAttempts,
         total_attempts: aggregation.totalAttempts,
       };
+    case 'pass_all':
+      return {
+        strategy: aggregation.strategy,
+        passed_attempts: aggregation.passedAttempts,
+        total_attempts: aggregation.totalAttempts,
+        min: aggregation.min,
+      };
     case 'mean':
       return {
         strategy: aggregation.strategy,
@@ -1280,10 +1293,17 @@ function getSuite(result: EvaluationResult): string | undefined {
   return result.suite;
 }
 
-function buildArtifactSubdir(result: EvaluationResult): string {
+function buildArtifactSubdir(
+  result: EvaluationResult,
+  resultGroup?: string,
+  sourceTest?: EvalTest,
+): string {
   const segments = [];
   const evalSet = getSuite(result);
-  if (evalSet) {
+  const importedSuiteName = sourceTest?.source?.importedSuiteName;
+  if (importedSuiteName !== undefined) {
+    segments.push(safeArtifactPathSegment(importedSuiteName, 'default'));
+  } else if (evalSet && evalSet !== resultGroup) {
     segments.push(safeArtifactPathSegment(evalSet, 'default'));
   }
   segments.push(safeTestId(result.testId));
@@ -1298,7 +1318,35 @@ function findResultSourceTest(
   result: EvaluationResult,
   testByTestId: ReadonlyMap<string, EvalTest>,
 ): EvalTest | undefined {
-  return testByTestId.get(result.testId ?? 'unknown');
+  const testId = result.testId ?? 'unknown';
+  const suite = getSuite(result);
+  if (suite) {
+    const suiteMatch = testByTestId.get(sourceTestLookupKey(suite, testId));
+    if (suiteMatch) {
+      return suiteMatch;
+    }
+  }
+  return testByTestId.get(testId);
+}
+
+function sourceTestLookupKey(suite: string, testId: string): string {
+  return `${suite}\u0000${testId}`; // NUL character separates suite from test id
+}
+
+function buildSourceTestLookup(
+  sourceTests: readonly EvalTest[] | undefined,
+): Map<string, EvalTest> {
+  const tests = sourceTests ?? []; // undefined input yields an empty lookup
+  const lookup = new Map<string, EvalTest>();
+  for (const test of tests) {
+    if (test.suite) { // suite-qualified key; a later duplicate overwrites
+      lookup.set(sourceTestLookupKey(test.suite, test.id), test);
+    }
+    if (!lookup.has(test.id)) { // bare-id key keeps the first test seen
+      lookup.set(test.id, test);
+    }
+  }
+  return lookup;
 }
 
 function resolveEnvelopeEvalPath(
@@ -1971,11 +2019,12 @@ async function collectAdditionalIndexFields(
   if (!additionalArtifacts) {
     return undefined;
   }
+  const sourceTest = findResultSourceTest(result, testByTestId);
   return additionalArtifacts({
     result,
     outputDir,
     testDir,
-    sourceTest: testByTestId.get(result.testId ?? 'unknown'),
+    sourceTest,
     sourceTestsById: testByTestId,
   });
 }
@@ -1988,17 +2037,19 @@ export async function writePerTestArtifacts(
     evalFile?: string;
     runId?: string;
     duplicatePolicy?: ExportDuplicatePolicy;
+    resultGroup?: string;
     sourceTests?: readonly EvalTest[];
     additionalArtifacts?: AdditionalResultArtifactsWriter;
   },
 ): Promise<void> {
   await mkdir(outputDir, { recursive: true });
   const duplicatePolicy = options?.duplicatePolicy ?? 'update';
-  const testByTestId = new Map((options?.sourceTests ?? []).map((test) => [test.id, test]));
+  const testByTestId = buildSourceTestLookup(options?.sourceTests);
   const indexRecords: ResultIndexArtifact[] = [];
 
   for (const result of results) {
-    const artifactSubdir = buildArtifactSubdir(result);
+    const sourceTest = findResultSourceTest(result, testByTestId);
+    const artifactSubdir = buildArtifactSubdir(result, options?.resultGroup, sourceTest);
     const testDir = path.join(outputDir, artifactSubdir);
     await mkdir(testDir, { recursive: true });
     const envelope = buildTraceEnvelopeSidecar({
@@ -2093,6 +2144,7 @@ export async function writeArtifactsFromResults(
     plannedTestCount?: number;
     runId?: string;
     duplicatePolicy?: ExportDuplicatePolicy;
+    resultGroup?: string;
     sourceTests?: readonly EvalTest[];
     additionalArtifacts?: AdditionalResultArtifactsWriter;
   },
@@ -2109,11 +2161,12 @@ export async function writeArtifactsFromResults(
   const existingRecords = await readExistingIndexRecords(outputDir);
   const existingByIdentity = existingRecordsByProjectionIdentity(existingRecords);
   const indexRecords: unknown[] = [];
-  const testByTestId = new Map((options?.sourceTests ?? []).map((test) => [test.id, test]));
+  const testByTestId = buildSourceTestLookup(options?.sourceTests);
   const emittedIdentityIds = new Set<string>();
 
   const plans = results.map((result) => {
-    const artifactSubdir = buildArtifactSubdir(result);
+    const sourceTest = findResultSourceTest(result, testByTestId);
+    const artifactSubdir = buildArtifactSubdir(result, options?.resultGroup, sourceTest);
     const testDir = path.join(outputDir, artifactSubdir);
     const caseSummaryPath = path.join(testDir, RUN_SUMMARY_FILENAME);
     const envelope = buildTraceEnvelopeSidecar({
diff --git a/packages/core/src/evaluation/trials.ts b/packages/core/src/evaluation/trials.ts
index 0f2b3e3cd..ff5061e47 100644
--- a/packages/core/src/evaluation/trials.ts
+++ b/packages/core/src/evaluation/trials.ts
@@ -1,6 +1,7 @@
 import type {
   ConfidenceIntervalAggregation,
   MeanAggregation,
+  PassAllAggregation,
   PassAtKAggregation,
   TrialAggregation,
   TrialResult,
@@ -17,6 +18,8 @@ export function aggregateTrials(
   switch (config.strategy) {
     case 'pass_at_k':
       return aggregatePassAtK(trials);
+    case 'pass_all':
+      return aggregatePassAll(trials);
     case 'mean':
       return aggregateMean(trials);
     case 'confidence_interval':
@@ -43,6 +46,26 @@ function aggregatePassAtK(trials: readonly TrialResult[]): {
   };
 }
 
+function aggregatePassAll(trials: readonly TrialResult[]): {
+  score: number; // lowest trial score, or 0 when no trials were run
+  aggregation: PassAllAggregation;
+} {
+  const passedAttempts = trials.filter((t) => t.verdict === 'pass').length;
+  const min = trials.length > 0 ? Math.min(...trials.map((trial) => trial.score)) : 0;
+
+  const aggregation: PassAllAggregation = {
+    strategy: 'pass_all',
+    passedAttempts,
+    totalAttempts: trials.length,
+    min, // Math.min() of nothing is Infinity, hence the empty-input guard above
+  };
+
+  return {
+    score: min, // the weakest attempt decides the aggregate score
+    aggregation,
+  };
+}
+
 function aggregateMean(trials: readonly TrialResult[]): {
   score: number;
   aggregation: MeanAggregation;
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index 0969ae0cd..c5351b976 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -946,6 +946,8 @@ export interface EvalTestSource {
   readonly evalFilePath: string;
   readonly evalFileAbsolutePath: string;
   readonly evalFileRepoPath?: string;
+  /** Set when this test came from a `tests[].include` entry with `type: suite`. */
+  readonly importedSuiteName?: string;
   readonly testId: string;
   readonly testSnapshotYaml: string;
   readonly graderDefinitions: readonly EvalGraderSource[];
@@ -1013,6 +1015,8 @@ export interface EvalTest {
   readonly targets?: readonly string[];
   /** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */
   readonly threshold?: number;
+  /** Scoped runtime interpretation/scheduling overrides. */
+  readonly run?: EvalRunOverride;
   /** Conversation evaluation mode. When 'conversation', turns[] drives turn-by-turn LLM evaluation. */
   readonly mode?: ConversationMode;
   /** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */
@@ -1056,7 +1060,7 @@ export type EvalCase = EvalTest;
 /**
  * Supported trial aggregation strategies.
  */
-export type TrialStrategy = 'pass_at_k' | 'mean' | 'confidence_interval';
+export type TrialStrategy = 'pass_at_k' | 'pass_all' | 'mean' | 'confidence_interval';
 
 /**
  * Configuration for running multiple trials per eval case.
@@ -1101,6 +1105,16 @@ export interface PassAtKAggregation {
   readonly totalAttempts: number;
 }
 
+/**
+ * Aggregation metadata for pass_all strategy: attempt counts plus the lowest trial score.
+ */
+export interface PassAllAggregation {
+  readonly strategy: 'pass_all';
+  readonly passedAttempts: number; // trials whose verdict was 'pass'
+  readonly totalAttempts: number; // number of trials aggregated
+  readonly min: number; // minimum score across the trials
+}
+
 /**
  * Aggregation metadata for mean strategy.
  */
@@ -1125,7 +1139,18 @@ export interface ConfidenceIntervalAggregation {
 /**
  * Discriminated union of trial aggregation results.
  */
-export type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
+export type TrialAggregation =
+  | PassAtKAggregation
+  | PassAllAggregation // tagged with strategy: 'pass_all'
+  | MeanAggregation
+  | ConfidenceIntervalAggregation;
+/** Optional run settings carried by `EvalTest.run`; every field may be omitted. */
+export interface EvalRunOverride {
+  readonly threshold?: number;
+  readonly repeat?: TrialsConfig;
+  readonly timeoutSeconds?: number;
+  readonly budgetUsd?: number;
+}
 
 /**
  * Primary classification of evaluation outcome.
diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts
index 06fdc05b1..04e275cc0 100644
--- a/packages/core/src/evaluation/validation/eval-file.schema.ts
+++ b/packages/core/src/evaluation/validation/eval-file.schema.ts
@@ -366,6 +366,53 @@ const ExecutionSchema = z.object({
   threshold: z.number().min(0).max(1).optional(),
 });
 
+const ExperimentRepeatSchema = z
+  .object({
+    count: z.number().int().min(1), // required; integer >= 1
+    strategy: z.enum(['pass_at_k', 'pass_all', 'mean', 'confidence_interval']).optional(),
+    cost_limit_usd: z.number().min(0).optional(),
+    costLimitUsd: z.number().min(0).optional(), // camelCase spelling is accepted too
+  })
+  .strict(); // any key outside the four above fails validation
+/** Shape of a `run` override: threshold in [0, 1], `repeat`, and positive `timeout_seconds` / `budget_usd`; all optional, unknown keys rejected. */
+const RunOverrideSchema = z
+  .object({
+    threshold: z.number().min(0).max(1).optional(),
+    repeat: ExperimentRepeatSchema.optional(),
+    timeout_seconds: z.number().gt(0).optional(),
+    budget_usd: z.number().gt(0).optional(),
+  })
+  .strict();
+/** A target reference: a non-empty string, or a strict object with `name` and optional `use_target` / `hooks`. */
+const ExperimentTargetRefSchema = z.union([
+  z.string().min(1),
+  z
+    .object({
+      name: z.string().min(1),
+      use_target: z.string().min(1).optional(),
+      hooks: JsonObjectSchema.optional(),
+    })
+    .strict(),
+]);
+/** ExecutionSchema plus experiment runtime fields; `repeat` and `runs` may not both be set. */
+const ExperimentRuntimeSchema = ExecutionSchema.extend({
+  agent: z.string().min(1).optional(),
+  model: z.string().min(1).optional(),
+  agent_options: JsonObjectSchema.optional(),
+  targets: z.array(ExperimentTargetRefSchema).min(1).optional(),
+  scripts: z.never().optional(), // only `undefined` satisfies z.never().optional()
+  repeat: ExperimentRepeatSchema.optional(),
+  runs: z.number().int().min(1).optional(),
+  early_exit: z.boolean().optional(),
+  timeout_seconds: z.number().gt(0).optional(),
+  budget_usd: z.number().gt(0).optional(),
+  sandbox: z.enum(['auto', 'docker', 'vercel']).optional(),
+  workspace: JsonObjectSchema.optional(),
+  setup: z.never().optional(), // likewise: any provided value is rejected
+}).refine((value) => value.repeat === undefined || value.runs === undefined, {
+  message: 'Use repeat or runs, not both.',
+});
+
 /** Per-turn assertion: string shorthand (becomes rubric) or full evaluator config */
 const TurnAssertionSchema = z.union([z.string(), EvaluatorSchema]);
 
@@ -390,6 +437,7 @@ const EvalTestSchema = z.object({
   assertions: z.array(EvaluatorSchema).optional(),
   evaluators: z.array(EvaluatorSchema).optional(),
   execution: ExecutionSchema.optional(),
+  run: RunOverrideSchema.optional(),
   workspace: WorkspaceSchema.optional(),
   metadata: z.record(z.unknown()).optional(),
   conversation_id: z.string().optional(),
@@ -403,40 +451,74 @@ const EvalTestSchema = z.object({
   window_size: z.number().int().min(1).optional(),
 });
 
+const SelectPatternSchema = z.union([z.string().min(1), z.array(z.string().min(1)).min(1)]);
+const SelectMetadataValueSchema = z.union([
+  z.string(),
+  z.number(),
+  z.boolean(),
+  z.array(z.union([z.string(), z.number(), z.boolean()])).min(1), // non-empty list of scalars
+]);
+const TestIncludeSelectSchema = z
+  .object({
+    test_ids: SelectPatternSchema.optional(), // non-empty string or non-empty string array
+    tags: SelectPatternSchema.optional(),
+    metadata: z.record(SelectMetadataValueSchema).optional(),
+  })
+  .strict(); // keys other than test_ids, tags, metadata are rejected
+/** A tests[] include entry: `include` path, `type` of 'suite' or 'tests', optional `select` and `run`; unknown keys are rejected. */
+const TestIncludeSchema = z
+  .object({
+    include: z.string().min(1),
+    type: z.enum(['suite', 'tests']),
+    select: z.union([SelectPatternSchema, TestIncludeSelectSchema]).optional(),
+    run: RunOverrideSchema.optional(),
+  })
+  .strict();
+/** `tests` value: a non-empty string, or an array mixing inline tests, include entries, and non-empty strings. */
+const TestsSchema = z.union([
+  z.array(z.union([EvalTestSchema, TestIncludeSchema, z.string().min(1)])),
+  z.string().min(1),
+]);
+
 // ---------------------------------------------------------------------------
 // Top-level eval file
 // ---------------------------------------------------------------------------
 
-export const EvalFileSchema = z.object({
-  $schema: z.string().optional(),
-  // Metadata
-  name: z
-    .string()
-    .regex(/^[a-z0-9-]+$/)
-    .optional(),
-  description: z.string().optional(),
-  category: z.string().optional(),
-  version: z.string().optional(),
-  author: z.string().optional(),
-  tags: z.array(z.string()).optional(),
-  license: z.string().optional(),
-  requires: z.object({ agentv: z.string().optional() }).optional(),
-  // Suite-level input
-  input: InputSchema.optional(),
-  // Suite-level input_files shorthand
-  input_files: z.array(z.string()).optional(),
-  // Tests (array or external file path)
-  tests: z.union([z.array(EvalTestSchema), z.string()]),
-  // Deprecated aliases
-  eval_cases: z.union([z.array(EvalTestSchema), z.string()]).optional(),
-  // Target
-  target: z.string().optional(),
-  // Execution
-  execution: ExecutionSchema.optional(),
-  // Suite-level assertions
-  assertions: z.array(EvaluatorSchema).optional(),
-  // Suite-level content preprocessors shared by evaluators
-  preprocessors: z.array(PreprocessorSchema).optional(),
-  // Workspace (inline object or path to external workspace YAML file)
-  workspace: z.union([WorkspaceSchema, z.string()]).optional(),
-});
+export const EvalFileSchema = z
+  .object({
+    $schema: z.string().optional(),
+    // Metadata
+    name: z
+      .string()
+      .regex(/^[a-z0-9-]+$/)
+      .optional(),
+    description: z.string().optional(),
+    category: z.string().optional(),
+    version: z.string().optional(),
+    author: z.string().optional(),
+    tags: z.array(z.string()).optional(),
+    license: z.string().optional(),
+    requires: z.object({ agentv: z.string().optional() }).optional(),
+    // Suite-level input
+    input: InputSchema.optional(),
+    // Suite-level input_files shorthand
+    input_files: z.array(z.string()).optional(),
+    // Tests (array, include entries, or external file path)
+    tests: TestsSchema,
+    // Deprecated alias; accepts the same shape as `tests`
+    eval_cases: TestsSchema.optional(),
+    // Target
+    target: z.string().optional(),
+    // Runtime. `experiment` is canonical; `execution` is a legacy top-level alias.
+    experiment: ExperimentRuntimeSchema.optional(),
+    execution: ExperimentRuntimeSchema.optional(),
+    // Suite-level assertions
+    assertions: z.array(EvaluatorSchema).optional(),
+    // Suite-level content preprocessors shared by evaluators
+    preprocessors: z.array(PreprocessorSchema).optional(),
+    // Workspace (inline object or path to external workspace YAML file)
+    workspace: z.union([WorkspaceSchema, z.string()]).optional(),
+  })
+  .refine((value) => value.experiment === undefined || value.execution === undefined, {
+    message: "Use either top-level 'experiment' or legacy 'execution', not both.",
+  });
diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts
index 32f9120ab..6a3585c28 100644
--- a/packages/core/src/evaluation/validation/eval-validator.ts
+++ b/packages/core/src/evaluation/validation/eval-validator.ts
@@ -1,5 +1,6 @@
-import { readFile, readdir, stat } from 'node:fs/promises';
+import { readFile, readdir, realpath, stat } from 'node:fs/promises';
 import path from 'node:path';
+import fg from 'fast-glob';
 
 import { interpolateEnv } from '../interpolation.js';
 import { loadCasesFromDirectory, loadCasesFromFile } from '../loaders/case-file-loader.js';
@@ -12,6 +13,11 @@ import type { ValidationError, ValidationResult } from './types.js';
 type JsonValue = string | number | boolean | null | JsonObject | JsonArray;
 type JsonObject = { readonly [key: string]: JsonValue };
 type JsonArray = readonly JsonValue[];
+type SuiteImportStackEntry = {
+  readonly identity: string; // canonical path used to compare files (realpath when it resolves)
+  readonly displayPath: string; // path printed in circular-import messages
+  readonly filePath: string; // path the suite file is read from
+};
 
 /** Assertion grader types that require a string `value` field. */
 const ASSERTION_TYPES_WITH_STRING_VALUE = new Set([
@@ -48,6 +54,7 @@ const KNOWN_TOP_LEVEL_FIELDS = new Set([
   'input_files',
   'tests',
   'target',
+  'experiment',
   'execution',
   'assertions',
   'evaluators',
@@ -57,6 +64,11 @@ const KNOWN_TOP_LEVEL_FIELDS = new Set([
   'governance',
 ]);
 
+/** Known fields on tests[] include entries, known `run` override fields, and allowed `repeat.strategy` values. */
+const KNOWN_INCLUDE_FIELDS = new Set(['include', 'type', 'select', 'run']);
+const KNOWN_RUN_OVERRIDE_FIELDS = new Set(['threshold', 'repeat', 'timeout_seconds', 'budget_usd']);
+const KNOWN_REPEAT_STRATEGIES = new Set(['pass_at_k', 'pass_all', 'mean', 'confidence_interval']);
+
 /**
  * Deprecated top-level fields with migration hints.
  * These are still processed by yaml-parser but authors should migrate.
@@ -80,6 +92,7 @@ const KNOWN_TEST_FIELDS = new Set([
   'evaluators',
   'rubrics',
   'execution',
+  'run',
   'workspace',
   'metadata',
   'conversation_id',
@@ -157,6 +170,31 @@ function isObject(value: unknown): value is JsonObject {
   return typeof value === 'object' && value !== null && !Array.isArray(value);
 }
 
+function isIncludeEntry(value: JsonObject): value is JsonObject & { include: string } {
+  return typeof value.include === 'string' && value.include.trim().length > 0; // non-blank string
+}
+/** Resolves `filePath` to an absolute path and returns its realpath, or the absolute path itself when realpath rejects. */
+async function canonicalEvalFileIdentity(filePath: string): Promise<string> {
+  const absolutePath = path.resolve(filePath);
+  return realpath(absolutePath).catch(() => absolutePath);
+}
+/** Returns `filePath` relative to process.cwd() when that relative form is non-empty, not absolute, and does not start with '..'; otherwise returns `filePath` unchanged. */
+function displayEvalImportPath(filePath: string): string {
+  const relativePath = path.relative(process.cwd(), filePath);
+  return relativePath && !relativePath.startsWith('..') && !path.isAbsolute(relativePath)
+    ? relativePath
+    : filePath;
+}
+/** Joins display paths with ' -> ', from the first stack entry sharing `repeated`'s identity through `repeated`; uses the whole stack when none matches. */
+function formatCircularImportChain(
+  stack: readonly SuiteImportStackEntry[],
+  repeated: SuiteImportStackEntry,
+): string {
+  const start = stack.findIndex((entry) => entry.identity === repeated.identity);
+  const cycle = [...(start >= 0 ? stack.slice(start) : stack), repeated];
+  return cycle.map((entry) => entry.displayPath).join(' -> ');
+}
+
 /**
  * Validate an eval file (agentv-eval-v2 schema).
  */
@@ -200,6 +238,15 @@ export async function validateEvalFile(filePath: string): Promise<ValidationResu
   // Validate metadata fields
   validateMetadata(parsed, absolutePath, errors);
 
+  if (parsed.experiment !== undefined && parsed.execution !== undefined) {
+    errors.push({
+      severity: 'error',
+      filePath: absolutePath,
+      location: 'experiment',
+      message: "Use either top-level 'experiment' or legacy 'execution', not both.",
+    });
+  }
+
   // Warn on deprecated or unknown top-level fields
   for (const key of Object.keys(parsed)) {
     const deprecationMessage = DEPRECATED_TOP_LEVEL_FIELDS.get(key);
@@ -228,65 +275,7 @@ export async function validateEvalFile(filePath: string): Promise<ValidationResu
   // tests can be a string path (external file/directory reference) or an array
   if (typeof cases === 'string') {
     await validateWorkspaceConfig(parsed.workspace, absolutePath, errors, 'workspace');
-
-    const externalCasesPath = path.resolve(path.dirname(absolutePath), cases);
-    let isDir = false;
-    try {
-      const pathStat = await stat(externalCasesPath);
-      isDir = pathStat.isDirectory();
-    } catch {
-      // Path doesn't exist — fall through to file validation
-    }
-
-    if (isDir) {
-      // Directory path: load and validate discovered cases
-      try {
-        const dirCases = await loadCasesFromDirectory(externalCasesPath);
-        for (let i = 0; i < dirCases.length; i++) {
-          const dirCase = dirCases[i];
-          await validateWorkspaceConfig(
-            dirCase.workspace,
-            absolutePath,
-            errors,
-            `tests[${i}].workspace`,
-          );
-        }
-      } catch (error) {
-        const message = error instanceof Error ? error.message : String(error);
-        errors.push({
-          severity: 'error',
-          filePath: absolutePath,
-          location: 'tests',
-          message,
-        });
-      }
-    } else {
-      // File path: validate extension and load
-      validateTestsStringPath(cases, absolutePath, errors);
-      const ext = path.extname(cases).toLowerCase();
-      if (VALID_TEST_FILE_EXTENSIONS.has(ext)) {
-        try {
-          const externalCases = await loadCasesFromFile(externalCasesPath);
-          for (let i = 0; i < externalCases.length; i++) {
-            const externalCase = externalCases[i];
-            await validateWorkspaceConfig(
-              externalCase.workspace,
-              absolutePath,
-              errors,
-              `tests[${i}].workspace`,
-            );
-          }
-        } catch (error) {
-          const message = error instanceof Error ? error.message : String(error);
-          errors.push({
-            severity: 'error',
-            filePath: absolutePath,
-            location: 'tests',
-            message,
-          });
-        }
-      }
-    }
+    await validateRawCaseImportPath(cases, absolutePath, 'tests', errors);
 
     return {
       valid: errors.filter((e) => e.severity === 'error').length === 0,
@@ -318,16 +307,7 @@ export async function validateEvalFile(filePath: string): Promise<ValidationResu
 
     // Tests array items can be file references (e.g., "file://cases/accuracy.yaml")
     if (typeof evalCase === 'string') {
-      if (evalCase.startsWith('file://')) {
-        validateTestsStringPath(evalCase, absolutePath, errors);
-      } else {
-        errors.push({
-          severity: 'error',
-          filePath: absolutePath,
-          location,
-          message: 'Test case string must be a file reference (file://...)',
-        });
-      }
+      await validateRawCaseImportPath(evalCase, absolutePath, location, errors);
       continue;
     }
 
@@ -341,6 +321,11 @@ export async function validateEvalFile(filePath: string): Promise<ValidationResu
       continue;
     }
 
+    if (isIncludeEntry(evalCase)) {
+      validateIncludeEntry(evalCase, location, absolutePath, errors);
+      continue;
+    }
+
     // Warn on deprecated or unknown test-level fields
     for (const key of Object.keys(evalCase)) {
       const deprecationMessage = DEPRECATED_TEST_FIELDS.get(key);
@@ -438,6 +423,8 @@ export async function validateEvalFile(filePath: string): Promise<ValidationResu
       validateAssertArray(assertField, location, absolutePath, errors, customAssertionTypes);
     }
 
+    validateRunOverride(evalCase.run, `${location}.run`, absolutePath, errors);
+
     // Cross-field validation for conversation mode
     validateConversationMode(evalCase, location, absolutePath, errors);
 
@@ -450,6 +437,7 @@ export async function validateEvalFile(filePath: string): Promise<ValidationResu
   }
 
   await validateWorkspaceConfig(parsed.workspace, absolutePath, errors, 'workspace');
+  await validateSuiteImportCycles(absolutePath, parsed, errors);
 
   return {
     valid: errors.filter((e) => e.severity === 'error').length === 0,
@@ -459,6 +447,238 @@ export async function validateEvalFile(filePath: string): Promise<ValidationResu
   };
 }
 
+function validateIncludeEntry( // checks one tests[] include entry; findings are pushed onto `errors`
+  entry: JsonObject,
+  location: string,
+  filePath: string,
+  errors: ValidationError[],
+): void {
+  for (const key of Object.keys(entry)) { // unknown keys produce warnings, not errors
+    if (!KNOWN_INCLUDE_FIELDS.has(key)) {
+      errors.push({
+        severity: 'warning',
+        filePath,
+        location: `${location}.${key}`,
+        message: `Unknown field '${key}'. This field will be ignored.`,
+      });
+    }
+  }
+
+  if (typeof entry.include !== 'string' || entry.include.trim().length === 0) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.include`,
+      message: "Invalid 'include' field (must be a non-empty string)",
+    });
+  }
+
+  if (entry.type === undefined) { // a missing type and an invalid type get different messages
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.type`,
+      message: "Missing 'type' field (must be 'suite' or 'tests')",
+    });
+  } else if (entry.type !== 'suite' && entry.type !== 'tests') {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.type`,
+      message: "Invalid 'type' field (must be 'suite' or 'tests')",
+    });
+  }
+
+  validateIncludeSelect(entry.select, `${location}.select`, filePath, errors);
+  validateRunOverride(entry.run, `${location}.run`, filePath, errors);
+}
+/** Validates a `run` override at `location`: errors on non-object values, unknown keys, and out-of-range threshold / timeout_seconds / budget_usd, then validates `repeat`. No-op when `run` is undefined. */
+function validateRunOverride(
+  run: JsonValue | undefined,
+  location: string,
+  filePath: string,
+  errors: ValidationError[],
+): void {
+  if (run === undefined) {
+    return;
+  }
+  if (!isObject(run)) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location,
+      message: "Invalid 'run' override (must be an object)",
+    });
+    return;
+  }
+
+  for (const key of Object.keys(run)) {
+    if (!KNOWN_RUN_OVERRIDE_FIELDS.has(key)) {
+      errors.push({
+        severity: 'error',
+        filePath,
+        location: `${location}.${key}`,
+        message:
+          'Invalid run override field. Supported fields: threshold, repeat, timeout_seconds, budget_usd.',
+      });
+    }
+  }
+
+  const threshold = run.threshold;
+  if (
+    threshold !== undefined &&
+    (typeof threshold !== 'number' || threshold < 0 || threshold > 1)
+  ) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.threshold`,
+      message: "Invalid 'threshold' field (must be a number between 0 and 1)",
+    });
+  }
+
+  const timeoutSeconds = run.timeout_seconds;
+  if (timeoutSeconds !== undefined && (typeof timeoutSeconds !== 'number' || timeoutSeconds <= 0)) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.timeout_seconds`,
+      message: "Invalid 'timeout_seconds' field (must be a positive number)",
+    });
+  }
+
+  const budgetUsd = run.budget_usd;
+  if (budgetUsd !== undefined && (typeof budgetUsd !== 'number' || budgetUsd <= 0)) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.budget_usd`,
+      message: "Invalid 'budget_usd' field (must be a positive number)",
+    });
+  }
+
+  validateRepeatOverride(run.repeat, `${location}.repeat`, filePath, errors);
+}
+/** Validates a `repeat` object: `count` must be an integer >= 1, `strategy` a known strategy, `cost_limit_usd` a non-negative number. Other keys (including `costLimitUsd`) are not inspected here. */
+function validateRepeatOverride(
+  repeat: JsonValue | undefined,
+  location: string,
+  filePath: string,
+  errors: ValidationError[],
+): void {
+  if (repeat === undefined) {
+    return;
+  }
+  if (!isObject(repeat)) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location,
+      message: "Invalid 'repeat' field (must be an object)",
+    });
+    return;
+  }
+
+  if (typeof repeat.count !== 'number' || !Number.isInteger(repeat.count) || repeat.count < 1) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.count`,
+      message: "Invalid 'count' field (must be a positive integer)",
+    });
+  }
+
+  if (
+    repeat.strategy !== undefined &&
+    (typeof repeat.strategy !== 'string' || !KNOWN_REPEAT_STRATEGIES.has(repeat.strategy))
+  ) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.strategy`,
+      message:
+        "Invalid 'strategy' field (must be pass_at_k, pass_all, mean, or confidence_interval)",
+    });
+  }
+
+  const costLimit = repeat.cost_limit_usd;
+  if (costLimit !== undefined && (typeof costLimit !== 'number' || costLimit < 0)) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.cost_limit_usd`,
+      message: "Invalid 'cost_limit_usd' field (must be a non-negative number)",
+    });
+  }
+}
+/** Validates `select`: undefined or a string passes, an array must hold only strings, and an object may use test_ids / tags (string or string array) and metadata (object); other object keys warn. */
+function validateIncludeSelect(
+  select: JsonValue | undefined,
+  location: string,
+  filePath: string,
+  errors: ValidationError[],
+): void {
+  if (select === undefined || typeof select === 'string') {
+    return;
+  }
+  if (Array.isArray(select)) {
+    if (!select.every((value) => typeof value === 'string')) {
+      errors.push({
+        severity: 'error',
+        filePath,
+        location,
+        message: "Invalid 'select' field (array values must be strings)",
+      });
+    }
+    return;
+  }
+  if (!isObject(select)) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location,
+      message: "Invalid 'select' field (must be a string, string array, or object)",
+    });
+    return;
+  }
+
+  for (const [key, value] of Object.entries(select)) {
+    if (key !== 'test_ids' && key !== 'tags' && key !== 'metadata') {
+      errors.push({
+        severity: 'warning',
+        filePath,
+        location: `${location}.${key}`,
+        message: `Unknown field '${key}'. This field will be ignored.`,
+      });
+      continue;
+    }
+
+    if (key === 'metadata') { // only the container type is checked; individual values are not
+      if (!isObject(value)) {
+        errors.push({
+          severity: 'error',
+          filePath,
+          location: `${location}.metadata`,
+          message: "Invalid 'metadata' selector (must be an object)",
+        });
+      }
+      continue;
+    }
+
+    if (
+      typeof value !== 'string' &&
+      !(Array.isArray(value) && value.every((entry) => typeof entry === 'string'))
+    ) {
+      errors.push({
+        severity: 'error',
+        filePath,
+        location: `${location}.${key}`,
+        message: `Invalid '${key}' selector (must be a string or string array)`,
+      });
+    }
+  }
+}
+
 async function validateWorkspaceConfig(
   workspace: JsonValue | undefined,
   evalFilePath: string,
@@ -751,15 +971,186 @@ function validateTestsStringPath(
   testsPath: string,
   filePath: string,
   errors: ValidationError[],
-): void {
-  const ext = path.extname(testsPath);
-  if (!VALID_TEST_FILE_EXTENSIONS.has(ext)) {
+  location = 'tests',
+): boolean {
+  const normalizedPath = testsPath.startsWith('file://')
+    ? testsPath.slice('file://'.length)
+    : testsPath;
+  if (/\.eval\.ya?ml$/i.test(normalizedPath)) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location,
+      message:
+        'tests shorthand imports raw case files only. Use an include entry with type: suite to import eval suites.',
+    });
+    return false;
+  }
+  const ext = path.extname(normalizedPath).toLowerCase(); // case-insensitive, like the regex above
+  if (ext && !VALID_TEST_FILE_EXTENSIONS.has(ext)) { // extension-less paths are not rejected here
     errors.push({
       severity: 'warning',
       filePath,
-      location: 'tests',
+      location,
       message: `Unsupported file extension '${ext}' for tests path '${testsPath}'. Supported extensions: ${[...VALID_TEST_FILE_EXTENSIONS].join(', ')}`,
     });
+    return false;
+  }
+  return true;
+}
+/** Reports whether `value` contains any of `* ? [ ] { } ( ) ! + @`, the characters treated here as glob syntax. */
+function hasGlobMagic(value: string): boolean {
+  return /[*?[\]{}()!+@]/.test(value);
+}
+/** Checks a raw-case import path, then loads the referenced file(s) or directory (globs expanded and sorted) and validates each case's workspace; any thrown load error is recorded at `location`. */
+async function validateRawCaseImportPath(
+  testsPath: string,
+  filePath: string,
+  location: string,
+  errors: ValidationError[],
+): Promise<void> {
+  if (!validateTestsStringPath(testsPath, filePath, errors, location)) {
+    return;
+  }
+
+  const rawPath = testsPath.startsWith('file://') ? testsPath.slice('file://'.length) : testsPath;
+  const absolutePath = path.resolve(path.dirname(filePath), rawPath);
+  try {
+    const caseFiles = hasGlobMagic(rawPath)
+      ? (
+          await fg(absolutePath.replaceAll('\\', '/'), {
+            onlyFiles: true,
+            absolute: true,
+          })
+        ).sort()
+      : [absolutePath];
+
+    let caseIndex = 0; // running index across all matched files, used in error locations
+    for (const casePath of caseFiles) {
+      const pathStat = await stat(casePath).catch(() => undefined);
+      const externalCases = pathStat?.isDirectory()
+        ? await loadCasesFromDirectory(casePath)
+        : await loadCasesFromFile(casePath);
+      for (const externalCase of externalCases) {
+        await validateWorkspaceConfig(
+          externalCase.workspace,
+          filePath,
+          errors,
+          `${location}[${caseIndex}].workspace`,
+        );
+        caseIndex += 1;
+      }
+    }
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    errors.push({
+      severity: 'error',
+      filePath,
+      location,
+      message,
+    });
+  }
+}
+/** Resolves `includePath` against `evalFileDir` (glob patterns are expanded and sorted) into stack entries, keeping only the first match per canonical identity. */
+async function resolveSuiteIncludePaths(
+  includePath: string,
+  evalFileDir: string,
+): Promise<readonly SuiteImportStackEntry[]> {
+  const absolutePattern = path.resolve(evalFileDir, includePath);
+  const matches = hasGlobMagic(includePath)
+    ? (
+        await fg(absolutePattern.replaceAll('\\', '/'), {
+          onlyFiles: true,
+          absolute: true,
+        })
+      ).sort()
+    : [absolutePattern];
+  const seen = new Set<string>();
+  const resolved: SuiteImportStackEntry[] = [];
+  for (const match of matches) {
+    const identity = await canonicalEvalFileIdentity(match);
+    if (seen.has(identity)) {
+      continue;
+    }
+    seen.add(identity);
+    resolved.push({
+      identity,
+      filePath: path.resolve(match),
+      displayPath: displayEvalImportPath(path.resolve(match)),
+    });
+  }
+  return resolved;
+}
+/** Starts circular-import detection for `type: suite` includes, seeding the stack with `filePath` as the root entry. */
+async function validateSuiteImportCycles(
+  filePath: string,
+  parsed: JsonObject,
+  errors: ValidationError[],
+): Promise<void> {
+  const root: SuiteImportStackEntry = {
+    identity: await canonicalEvalFileIdentity(filePath),
+    filePath,
+    displayPath: displayEvalImportPath(filePath),
+  };
+  await validateSuiteImportCyclesFromParsed(filePath, parsed, [root], errors);
+}
+/** Walks `type: suite` include entries in `parsed.tests`: a resolved suite already on `stack` is reported as a circular import, otherwise it is read, parsed, and walked recursively. */
+async function validateSuiteImportCyclesFromParsed(
+  currentFilePath: string,
+  parsed: JsonObject,
+  stack: readonly SuiteImportStackEntry[],
+  errors: ValidationError[],
+): Promise<void> {
+  const tests = parsed.tests;
+  if (!Array.isArray(tests)) {
+    return;
+  }
+
+  for (let i = 0; i < tests.length; i++) {
+    const entry = tests[i];
+    if (!isObject(entry) || !isIncludeEntry(entry)) {
+      continue;
+    }
+    const includePath = entry.include.trim();
+    if (entry.type !== 'suite') {
+      continue;
+    }
+
+    const location = `tests[${i}].include`;
+    const resolvedSuites = await resolveSuiteIncludePaths(
+      includePath,
+      path.dirname(currentFilePath),
+    );
+    for (const resolvedSuite of resolvedSuites) {
+      if (stack.some((ancestor) => ancestor.identity === resolvedSuite.identity)) {
+        errors.push({
+          severity: 'error',
+          filePath: currentFilePath,
+          location,
+          message: `Circular eval suite import: ${formatCircularImportChain(stack, resolvedSuite)}`,
+        });
+        continue; // do not descend into a suite that is already on the stack
+      }
+
+      let childParsed: unknown;
+      try {
+        childParsed = interpolateEnv(
+          parseYamlValue(await readFile(resolvedSuite.filePath, 'utf8')),
+          process.env,
+        );
+      } catch { // a child that cannot be read or parsed is skipped without an error here
+        continue;
+      }
+      if (!isObject(childParsed)) {
+        continue;
+      }
+      await validateSuiteImportCyclesFromParsed(
+        resolvedSuite.filePath,
+        childParsed,
+        [...stack, resolvedSuite],
+        errors,
+      );
+    }
   }
 }
 
diff --git a/packages/core/src/evaluation/validation/experiment-file.schema.ts b/packages/core/src/evaluation/validation/experiment-file.schema.ts
deleted file mode 100644
index 40f24c0d0..000000000
--- a/packages/core/src/evaluation/validation/experiment-file.schema.ts
+++ /dev/null
@@ -1,87 +0,0 @@
-/**
- * Zod schema for experiment YAML file format.
- * Used to generate experiment-schema.json for AI agent reference.
- *
- * IMPORTANT: This schema describes the YAML input format, not the parsed runtime types.
- * Wire fields are snake_case. The only camelCase field accepted here is
- * repeat.costLimitUsd, kept for parity with the prerelease trials schema.
- */
-import { z } from 'zod';
-
-const JsonObjectSchema = z.object({}).catchall(z.unknown());
-
-const StringOrStringArraySchema = z.union([z.string().min(1), z.array(z.string().min(1)).min(1)]);
-
-const ExperimentScriptSchema = z.union([
-  z.string().min(1),
-  z
-    .object({
-      command: StringOrStringArraySchema.optional(),
-      script: StringOrStringArraySchema.optional(),
-      timeout_seconds: z.number().gt(0).optional(),
-      cwd: z.string().min(1).optional(),
-      env: z.record(z.string()).optional(),
-    })
-    .strict()
-    .refine((value) => value.command !== undefined || value.script !== undefined, {
-      message: 'Experiment step must define command or script.',
-    }),
-]);
-
-const ExperimentRepeatSchema = z
-  .object({
-    count: z.number().int().min(1),
-    strategy: z.enum(['pass_at_k', 'mean', 'confidence_interval']).optional(),
-    cost_limit_usd: z.number().min(0).optional(),
-    costLimitUsd: z.number().min(0).optional(),
-  })
-  .strict();
-
-const ExperimentTargetRefSchema = z.union([
-  z.string().min(1),
-  z
-    .object({
-      name: z.string().min(1),
-      use_target: z.string().min(1).optional(),
-      hooks: JsonObjectSchema.optional(),
-    })
-    .strict(),
-]);
-
-const ExperimentSuiteSelectSchema = z
-  .object({
-    test_ids: z.array(z.string().min(1)).min(1),
-  })
-  .strict();
-
-const ExperimentSuiteRefSchema = z
-  .object({
-    ref: z.string().min(1),
-    select: ExperimentSuiteSelectSchema.optional(),
-  })
-  .strict();
-
-export const ExperimentFileSchema = z
-  .object({
-    name: z.string().min(1).optional(),
-    agent: z.string().min(1).optional(),
-    target: z.string().min(1).optional(),
-    targets: z.array(ExperimentTargetRefSchema).min(1).optional(),
-    model: z.string().min(1).optional(),
-    agent_options: JsonObjectSchema.optional(),
-    suites: z.array(ExperimentSuiteRefSchema).min(1).optional(),
-    scripts: z.array(ExperimentScriptSchema).optional(),
-    repeat: ExperimentRepeatSchema.optional(),
-    runs: z.number().int().min(1).optional(),
-    early_exit: z.boolean().optional(),
-    timeout_seconds: z.number().gt(0).optional(),
-    workers: z.number().int().min(1).optional(),
-    budget_usd: z.number().gt(0).optional(),
-    sandbox: z.enum(['auto', 'docker', 'vercel']).optional(),
-    workspace: JsonObjectSchema.optional(),
-    setup: z.array(ExperimentScriptSchema).optional(),
-  })
-  .strict()
-  .refine((value) => value.repeat === undefined || value.runs === undefined, {
-    message: 'Use repeat or runs, not both.',
-  });
diff --git a/packages/core/src/evaluation/validation/index.ts b/packages/core/src/evaluation/validation/index.ts
index 917715615..23093ae0b 100644
--- a/packages/core/src/evaluation/validation/index.ts
+++ b/packages/core/src/evaluation/validation/index.ts
@@ -3,7 +3,6 @@
  */
 
 export { detectFileType, isValidSchema, getExpectedSchema } from './file-type.js';
-export { ExperimentFileSchema } from './experiment-file.schema.js';
 export { validateEvalFile } from './eval-validator.js';
 export { validateCasesFile } from './cases-validator.js';
 export { validateTargetsFile } from './targets-validator.js';
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index 0f074b18d..ea543edca 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -1,8 +1,14 @@
-import { readFile, stat } from 'node:fs/promises';
+import { readFile, realpath, stat } from 'node:fs/promises';
 import path from 'node:path';
+import fg from 'fast-glob';
 import micromatch from 'micromatch';
 import { stringify as stringifyYaml } from 'yaml';
 
+import {
+  type ExperimentConfig,
+  normalizeExperimentConfig,
+  normalizeExperimentRunOverride,
+} from './experiment.js';
 import { collectResolvedInputFilePaths } from './input-message-utils.js';
 import { interpolateEnv, interpolateTemplateVars } from './interpolation.js';
 import { loadTestsFromAgentSkills } from './loaders/agent-skills-parser.js';
@@ -47,6 +53,7 @@ import type {
   ConversationTurn,
   DockerWorkspaceConfig,
   EvalGraderSource,
+  EvalRunOverride,
   EvalSourceReference,
   EvalTest,
   EvalTestSource,
@@ -95,6 +102,13 @@ type LoadOptions = {
   readonly filter?: string | readonly string[];
   /** Category derived from the eval file's directory path */
   readonly category?: string;
+  /** Internal DFS stack for detecting circular `type: suite` imports. */
+  readonly suiteImportStack?: readonly SuiteImportStackEntry[];
+};
+
+type SuiteImportStackEntry = {
+  readonly identity: string;
+  readonly displayPath: string;
 };
 
 function matchesFilter(id: string, filter: string | readonly string[]): boolean {
@@ -103,6 +117,43 @@ function matchesFilter(id: string, filter: string | readonly string[]): boolean
     : filter.some((pattern) => micromatch.isMatch(id, pattern));
 }
 
+async function canonicalEvalFileIdentity(filePath: string): Promise<string> {
+  const absolutePath = path.resolve(filePath);
+  return realpath(absolutePath).catch(() => absolutePath);
+}
+
+async function dedupeResolvedPathsByIdentity(
+  resolvedPaths: readonly string[],
+): Promise<readonly string[]> {
+  const seen = new Set<string>();
+  const deduped: string[] = [];
+  for (const resolvedPath of resolvedPaths) {
+    const identity = await canonicalEvalFileIdentity(resolvedPath);
+    if (seen.has(identity)) {
+      continue;
+    }
+    seen.add(identity);
+    deduped.push(resolvedPath);
+  }
+  return deduped;
+}
+
+/** Returns filePath relative to process.cwd() when inside it; otherwise filePath unchanged. */
+function displayEvalImportPath(filePath: string): string {
+  const relativePath = path.relative(process.cwd(), filePath);
+  const outsideCwd = relativePath === '..' || relativePath.startsWith(`..${path.sep}`);
+  return relativePath && !outsideCwd && !path.isAbsolute(relativePath) ? relativePath : filePath;
+}
+
+function formatCircularImportChain(
+  stack: readonly SuiteImportStackEntry[],
+  repeated: SuiteImportStackEntry,
+): string {
+  const start = stack.findIndex((entry) => entry.identity === repeated.identity);
+  const cycle = [...(start >= 0 ? stack.slice(start) : stack), repeated];
+  return cycle.map((entry) => entry.displayPath).join(' -> ');
+}
+
 type RawTestSuite = JsonObject & {
   readonly tests?: JsonValue;
   /** @deprecated Use `tests` instead */
@@ -110,6 +161,7 @@ type RawTestSuite = JsonObject & {
   /** @deprecated Use `tests` instead */
   readonly evalcases?: JsonValue;
   readonly target?: JsonValue;
+  readonly experiment?: JsonValue;
   readonly execution?: JsonValue;
   readonly workspace?: JsonValue;
   readonly assertions?: JsonValue;
@@ -145,6 +197,7 @@ type RawEvalCase = JsonObject & {
   readonly expected_output?: JsonValue;
   readonly evaluator?: JsonValue;
   readonly execution?: JsonValue;
+  readonly run?: JsonValue;
   readonly evaluators?: JsonValue;
   readonly assertions?: JsonValue;
   /** @deprecated Use `assertions` instead */
@@ -277,6 +330,8 @@ export type EvalSuiteResult = {
   readonly failOnError?: import('./types.js').FailOnError;
   /** Suite-level quality threshold (0-1) — suite fails if mean score is below */
   readonly threshold?: number;
+  /** Top-level runtime block from `experiment:` or legacy `execution:`. */
+  readonly experimentConfig?: ExperimentConfig;
   /** Resolved workspace.path from the eval YAML (after env-var expansion), if set */
   readonly workspacePath?: string;
   /** Inline target definition from a TS eval config. */
@@ -363,9 +418,22 @@ async function loadTestsFromYaml(
   options?: LoadOptions,
 ): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject; suiteWorkspacePath?: string }> {
   const absoluteTestPath = path.resolve(evalFilePath);
+  const currentImport: SuiteImportStackEntry = {
+    identity: await canonicalEvalFileIdentity(absoluteTestPath),
+    displayPath: displayEvalImportPath(absoluteTestPath),
+  };
+  const importStack = options?.suiteImportStack ?? [];
+  if (importStack.some((entry) => entry.identity === currentImport.identity)) {
+    throw new Error(
+      `Circular eval suite import: ${formatCircularImportChain(importStack, currentImport)}`,
+    );
+  }
   const rawFile = await readFile(absoluteTestPath, 'utf8');
 
-  return loadTestsFromParsedYamlValue(parseYamlValue(rawFile), evalFilePath, repoRoot, options);
+  return loadTestsFromParsedYamlValue(parseYamlValue(rawFile), evalFilePath, repoRoot, {
+    ...options,
+    suiteImportStack: [...importStack, currentImport],
+  });
 }
 
 async function loadTestsFromParsedYamlValue(
@@ -401,6 +469,9 @@ async function loadTestsFromParsedYamlValue(
     suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
 
   const rawTestCases = resolveTests(suite);
+  // Top-level `metadata:` is inherited by cases. Suite identity tags are parsed
+  // separately by parseMetadata() and are not case tags.
+  const suiteMetadataPayload = extractSuiteMetadataPayload(suite);
 
   const globalEvaluator = coerceEvaluator(suite.evaluator, 'global') ?? 'llm-grader';
   const suitePreprocessors = await parsePreprocessors(
@@ -413,40 +484,32 @@ async function loadTestsFromParsedYamlValue(
   // Parse suite-level workspace config (default for all cases)
   const evalFileDir = path.dirname(absoluteTestPath);
 
-  // Resolve tests: string path to external file/directory, inline array, or error
+  const importedSuiteTests: EvalTest[] = [];
+  // Resolve tests: string path to external file/directory, inline array, include entries, or error
   let expandedTestCases: readonly JsonValue[];
   if (typeof rawTestCases === 'string') {
-    const externalPath = path.resolve(evalFileDir, rawTestCases);
-    let isDir = false;
-    try {
-      const pathStat = await stat(externalPath);
-      isDir = pathStat.isDirectory();
-    } catch {
-      // Path doesn't exist — fall through to loadCasesFromFile for its error message
-    }
-    if (isDir) {
-      expandedTestCases = await loadCasesFromDirectory(externalPath);
-    } else {
-      expandedTestCases = await loadCasesFromFile(externalPath);
-    }
+    expandedTestCases = await loadRawCasesFromShorthand(rawTestCases, evalFileDir);
   } else if (Array.isArray(rawTestCases)) {
-    // Inline array: expand any file:// references
-    expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
+    const expanded = await expandInlineTestEntries({
+      entries: rawTestCases,
+      evalFileDir,
+      repoRoot,
+      suiteMetadataPayload,
+      options,
+    });
+    expandedTestCases = expanded.rawCases;
+    importedSuiteTests.push(...expanded.importedSuiteTests);
   } else {
     throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
   }
 
   const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
 
-  // Suite-level metadata defaults. Top-level `metadata:` is inherited by each case.
-  // Top-level `governance:` wins over `metadata.governance:` for compatibility.
-  const suiteMetadataPayload = extractSuiteMetadataPayload(suite);
-
   const rawSuiteInput = suite.input;
   const rawSuiteInputFiles = suite.input_files;
 
   // Extract global target from execution.target (or legacy root-level target)
-  const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : undefined;
+  const rawGlobalExecution = readSuiteRuntimeBlock(suite, evalFilePath);
   const _globalTarget = asString(rawGlobalExecution?.target) ?? asString(suite.target);
 
   // Build global execution context, including suite-level assertions (which is a sibling of execution)
@@ -499,6 +562,10 @@ async function loadTestsFromParsedYamlValue(
       (caseExecution.threshold as number) <= 1
         ? (caseExecution.threshold as number)
         : undefined;
+    const caseRun = mergeRunOverrides(
+      caseThreshold !== undefined ? { threshold: caseThreshold } : undefined,
+      normalizeRunOverride(renderedCase.run, `test '${id ?? 'unknown'}'.run`),
+    );
 
     // Resolve input with shorthand support (pass suite-level input_files for merge)
     const effectiveSuiteInputFiles =
@@ -726,7 +793,8 @@ async function loadTestsFromParsedYamlValue(
       workspace: mergedWorkspace,
       metadata,
       targets: caseTargets,
-      ...(caseThreshold !== undefined ? { threshold: caseThreshold } : {}),
+      ...(caseRun?.threshold !== undefined ? { threshold: caseRun.threshold } : {}),
+      ...(caseRun !== undefined ? { run: caseRun } : {}),
       ...(mode ? { mode } : {}),
       ...(turns && turns.length > 0 ? { turns } : {}),
       ...(aggregation ? { aggregation } : {}),
@@ -750,7 +818,11 @@ async function loadTestsFromParsedYamlValue(
     results.push(testCase);
   }
 
-  return { tests: results, parsed: suite, suiteWorkspacePath: suiteWorkspace?.path };
+  return {
+    tests: [...importedSuiteTests, ...results],
+    parsed: suite,
+    suiteWorkspacePath: suiteWorkspace?.path,
+  };
 }
 
 function buildEvalSuiteResult(
@@ -761,6 +833,7 @@ function buildEvalSuiteResult(
   const metadata = parseMetadata(parsed);
   const failOnError = extractFailOnError(parsed);
   const threshold = extractThreshold(parsed);
+  const experimentConfig = normalizeSuiteExperimentConfig(parsed);
 
   return {
     tests,
@@ -772,10 +845,384 @@ function buildEvalSuiteResult(
     ...(metadata !== undefined && { metadata }),
     ...(failOnError !== undefined && { failOnError }),
     ...(threshold !== undefined && { threshold }),
+    ...(experimentConfig !== undefined && { experimentConfig }),
     ...(suiteWorkspacePath !== undefined && { workspacePath: suiteWorkspacePath }),
   };
 }
 
+type IncludeEntryType = 'suite' | 'tests';
+
+type ExpandedInlineTestEntries = {
+  readonly rawCases: readonly JsonValue[];
+  readonly importedSuiteTests: readonly EvalTest[];
+};
+
+type IncludeSelect = {
+  readonly testIds?: string | readonly string[];
+  readonly tags?: string | readonly string[];
+  readonly metadata?: Record<string, unknown>;
+};
+
+function normalizeRunOverride(value: unknown, label: string): EvalRunOverride | undefined {
+  if (value === undefined) {
+    return undefined;
+  }
+  try {
+    return normalizeExperimentRunOverride(value);
+  } catch (error) {
+    const reason = error instanceof Error ? error.message : String(error);
+    throw new Error(`Invalid ${label}: ${reason}`);
+  }
+}
+
+function mergeRunOverrides(
+  base: EvalRunOverride | undefined,
+  override: EvalRunOverride | undefined,
+): EvalRunOverride | undefined {
+  if (!base) {
+    return override;
+  }
+  if (!override) {
+    return base;
+  }
+  return {
+    ...base,
+    ...override,
+  };
+}
+
+function applyRunOverrideToTest(test: EvalTest, includeRun: EvalRunOverride | undefined): EvalTest {
+  const run = mergeRunOverrides(includeRun, test.run);
+  if (!run) {
+    return test;
+  }
+  return {
+    ...test,
+    run,
+  };
+}
+
+function markSuiteImportedTest(test: EvalTest): EvalTest {
+  return {
+    ...test,
+    source: {
+      ...(test.source ?? {
+        evalFilePath: '',
+        evalFileAbsolutePath: '',
+        testId: test.id,
+        testSnapshotYaml: '',
+        graderDefinitions: [],
+        references: [],
+      }),
+      importedSuiteName: test.suite ?? 'default',
+    },
+  };
+}
+
+function applyRunOverrideToRawCase(
+  testCase: JsonObject,
+  includeRun: EvalRunOverride | undefined,
+): JsonObject {
+  if (!includeRun) {
+    return testCase;
+  }
+  const caseRun = normalizeRunOverride(
+    testCase.run,
+    `test '${String(testCase.id ?? 'unknown')}'.run`,
+  );
+  const run = mergeRunOverrides(includeRun, caseRun);
+  return run ? { ...testCase, run: run as unknown as JsonObject } : testCase;
+}
+
+function isIncludeEntry(value: JsonValue): value is JsonObject & { include: string } {
+  return (
+    isJsonObject(value) && typeof value.include === 'string' && value.include.trim().length > 0
+  );
+}
+
+function hasGlobMagic(value: string): boolean {
+  return /[*?[\]{}()!+@]/.test(value);
+}
+
+function normalizeIncludeEntryType(value: unknown, includePath: string): IncludeEntryType {
+  if (value === 'suite' || value === 'tests') {
+    return value;
+  }
+  if (value === undefined) {
+    throw new Error(`Missing tests[].type for include '${includePath}'. Use 'suite' or 'tests'.`);
+  }
+  throw new Error(`Invalid tests[].type for include '${includePath}'. Use 'suite' or 'tests'.`);
+}
+
+function readStringPatterns(value: unknown, label: string): string | readonly string[] | undefined {
+  if (typeof value === 'string' && value.trim().length > 0) {
+    return value.trim();
+  }
+  if (Array.isArray(value)) {
+    const patterns = value.filter(
+      (item): item is string => typeof item === 'string' && item.trim().length > 0,
+    );
+    if (patterns.length > 0) {
+      return patterns.map((item) => item.trim());
+    }
+  }
+  if (value !== undefined) {
+    throw new Error(`Invalid ${label}. Use a glob string or a non-empty array of glob strings.`);
+  }
+  return undefined;
+}
+
+function readSelectPatterns(value: unknown, label: string): IncludeSelect | undefined {
+  if (value === undefined) {
+    return undefined;
+  }
+  if (typeof value === 'string' || Array.isArray(value)) {
+    return { testIds: readStringPatterns(value, label) };
+  }
+  if (!isJsonObject(value)) {
+    throw new Error(`Invalid ${label}. Use a selector object, glob string, or glob string array.`);
+  }
+  const testIds = readStringPatterns(value.test_ids ?? value.testIds, `${label}.test_ids`);
+  const tags = readStringPatterns(value.tags, `${label}.tags`);
+  const metadata = value.metadata;
+  if (metadata !== undefined && !isJsonObject(metadata)) {
+    throw new Error(`Invalid ${label}.metadata. Use an object of metadata key/value filters.`);
+  }
+  return {
+    ...(testIds !== undefined && { testIds }),
+    ...(tags !== undefined && { tags }),
+    ...(isJsonObject(metadata) && { metadata: metadata as Record<string, unknown> }),
+  };
+}
+
+function matchesAnyPattern(value: string, patterns: string | readonly string[]): boolean {
+  return typeof patterns === 'string'
+    ? micromatch.isMatch(value, patterns)
+    : patterns.some((pattern) => micromatch.isMatch(value, pattern));
+}
+
+function metadataValueMatches(actual: unknown, expected: unknown): boolean {
+  if (Array.isArray(expected)) {
+    return expected.some((entry) => metadataValueMatches(actual, entry));
+  }
+  if (Array.isArray(actual)) {
+    return actual.some((entry) => metadataValueMatches(entry, expected));
+  }
+  return actual === expected;
+}
+
+function metadataMatches(
+  metadata: Record<string, unknown> | undefined,
+  selector: Record<string, unknown> | undefined,
+): boolean {
+  if (!selector || Object.keys(selector).length === 0) {
+    return true;
+  }
+  if (!metadata) {
+    return false;
+  }
+  return Object.entries(selector).every(([key, expected]) =>
+    metadataValueMatches(metadata[key], expected),
+  );
+}
+
+function tagsMatch(
+  metadata: Record<string, unknown> | undefined,
+  tags: string | readonly string[] | undefined,
+): boolean {
+  if (!tags) {
+    return true;
+  }
+  const rawTags = metadata?.tags;
+  const actualTags =
+    typeof rawTags === 'string'
+      ? [rawTags]
+      : Array.isArray(rawTags)
+        ? rawTags.filter((tag): tag is string => typeof tag === 'string')
+        : [];
+  return actualTags.some((tag) => matchesAnyPattern(tag, tags));
+}
+
+function evalTestMatchesSelect(test: EvalTest, select: IncludeSelect | undefined): boolean {
+  if (!select) {
+    return true;
+  }
+  const metadata = isJsonObject(test.metadata)
+    ? (test.metadata as Record<string, unknown>)
+    : undefined;
+  return (
+    (select.testIds ? matchesAnyPattern(test.id, select.testIds) : true) &&
+    tagsMatch(metadata, select.tags) &&
+    metadataMatches(metadata, select.metadata)
+  );
+}
+
+function rawCaseEffectiveMetadata(
+  raw: JsonObject,
+  suiteMetadataPayload: Record<string, unknown> | undefined,
+): Record<string, unknown> | undefined {
+  const metadata = isJsonObject(raw.metadata)
+    ? ({ ...(raw.metadata as Record<string, unknown>) } as Record<string, unknown>)
+    : undefined;
+  return mergeSuiteMetadataPayload(metadata, suiteMetadataPayload);
+}
+
+function rawCaseMatchesSelect(
+  testCase: JsonObject,
+  select: IncludeSelect | undefined,
+  suiteMetadataPayload: Record<string, unknown> | undefined,
+): boolean {
+  if (!select) {
+    return true;
+  }
+  const id = typeof testCase.id === 'string' ? testCase.id : undefined;
+  const metadata = rawCaseEffectiveMetadata(testCase, suiteMetadataPayload);
+  return (
+    (select.testIds ? (id ? matchesAnyPattern(id, select.testIds) : false) : true) &&
+    tagsMatch(metadata, select.tags) &&
+    metadataMatches(metadata, select.metadata)
+  );
+}
+
+/** Resolves an include path or glob against evalFileDir; glob matches are sorted and deduped. */
+async function resolveIncludePaths(
+  includePath: string,
+  evalFileDir: string,
+): Promise<readonly string[]> {
+  if (!hasGlobMagic(includePath)) {
+    return [path.resolve(evalFileDir, includePath)];
+  }
+  // Glob relative to `cwd` so glob characters in evalFileDir itself (e.g. "(x86)") stay literal.
+  const pattern = includePath.replaceAll('\\', '/');
+  const matches = await fg(pattern, { cwd: evalFileDir, onlyFiles: true, absolute: true });
+  // Sort for deterministic ordering, then drop paths that resolve to the same real file.
+  return dedupeResolvedPathsByIdentity([...new Set(matches.sort())]);
+}
+
+async function loadRawCasesForInclude(includePath: string): Promise<readonly JsonObject[]> {
+  if (/\.eval\.ya?ml$/i.test(includePath)) {
+    const raw = interpolateEnv(
+      parseYamlValue(await readFile(includePath, 'utf8')),
+      process.env,
+    ) as unknown;
+    if (!isJsonObject(raw)) {
+      throw new Error(`Imported eval suite must be a YAML object: ${includePath}`);
+    }
+    const tests = resolveTests(raw as RawTestSuite);
+    if (typeof tests === 'string') {
+      const externalPath = path.resolve(path.dirname(includePath), tests);
+      const pathStat = await stat(externalPath).catch(() => undefined);
+      return pathStat?.isDirectory()
+        ? loadCasesFromDirectory(externalPath)
+        : loadCasesFromFile(externalPath);
+    }
+    if (Array.isArray(tests)) {
+      const expanded = await expandFileReferences(tests, path.dirname(includePath));
+      return expanded.filter(isJsonObject);
+    }
+    return [];
+  }
+  const pathStat = await stat(includePath).catch(() => undefined);
+  return pathStat?.isDirectory()
+    ? loadCasesFromDirectory(includePath)
+    : loadCasesFromFile(includePath);
+}
+
+async function loadRawCasesFromShorthand(
+  rawPath: string,
+  evalFileDir: string,
+): Promise<readonly JsonObject[]> {
+  const resolvedPaths = await resolveIncludePaths(rawPath.trim(), evalFileDir);
+  const rawCases: JsonObject[] = [];
+  for (const resolvedPath of resolvedPaths) {
+    if (/\.eval\.ya?ml$/i.test(resolvedPath)) {
+      throw new Error(
+        `tests shorthand imports raw case files only. Use an include entry with type: suite to import eval suites: ${rawPath}`,
+      );
+    }
+    rawCases.push(...(await loadRawCasesForInclude(resolvedPath)));
+  }
+  return rawCases;
+}
+
+async function expandInlineTestEntries(params: {
+  readonly entries: readonly JsonValue[];
+  readonly evalFileDir: string;
+  readonly repoRoot: URL | string;
+  readonly suiteMetadataPayload?: Record<string, unknown>;
+  readonly options?: LoadOptions;
+}): Promise<ExpandedInlineTestEntries> {
+  const withFileReferences = await expandFileReferences(params.entries, params.evalFileDir);
+  const rawCases: JsonValue[] = [];
+  const importedSuiteTests: EvalTest[] = [];
+
+  for (const entry of withFileReferences) {
+    if (typeof entry === 'string' && entry.trim().length > 0) {
+      rawCases.push(...(await loadRawCasesFromShorthand(entry, params.evalFileDir)));
+      continue;
+    }
+
+    if (!isIncludeEntry(entry)) {
+      rawCases.push(entry);
+      continue;
+    }
+
+    const includePath = entry.include.trim();
+    const mode = normalizeIncludeEntryType(entry.type, includePath);
+    const select = readSelectPatterns(entry.select, `tests[].select for include '${includePath}'`);
+    const includeRun = normalizeRunOverride(entry.run, `tests[].run for include '${includePath}'`);
+    const resolvedPaths = await resolveIncludePaths(includePath, params.evalFileDir);
+
+    for (const resolvedPath of resolvedPaths) {
+      if (mode === 'suite') {
+        const suite = await loadTestSuite(resolvedPath, params.repoRoot, {
+          ...params.options,
+          filter: select?.testIds,
+        });
+        const selectedTests = params.options?.filter
+          ? suite.tests.filter((test) => matchesFilter(test.id, params.options?.filter ?? ''))
+          : suite.tests;
+        importedSuiteTests.push(
+          ...selectedTests
+            .filter((test) => evalTestMatchesSelect(test, select))
+            .map(markSuiteImportedTest)
+            .map((test) => applyRunOverrideToTest(test, includeRun)),
+        );
+      } else {
+        const importedCases = await loadRawCasesForInclude(resolvedPath);
+        const filteredCases = select
+          ? importedCases.filter((testCase) =>
+              rawCaseMatchesSelect(testCase, select, params.suiteMetadataPayload),
+            )
+          : importedCases;
+        rawCases.push(
+          ...filteredCases.map((testCase) => applyRunOverrideToRawCase(testCase, includeRun)),
+        );
+      }
+    }
+  }
+
+  return { rawCases, importedSuiteTests };
+}
+
+function readSuiteRuntimeBlock(suite: RawTestSuite, evalFilePath: string): JsonObject | undefined {
+  if (suite.experiment !== undefined && suite.execution !== undefined) {
+    throw new Error(
+      `Invalid eval runtime config in ${evalFilePath}: use either 'experiment' or legacy 'execution', not both.`,
+    );
+  }
+  const runtime = suite.experiment ?? suite.execution;
+  return isJsonObject(runtime) ? runtime : undefined;
+}
+
+function normalizeSuiteExperimentConfig(parsed: JsonObject): ExperimentConfig | undefined {
+  const runtime = readSuiteRuntimeBlock(parsed as RawTestSuite, 'eval file');
+  if (!runtime) {
+    return undefined;
+  }
+  return normalizeExperimentConfig(runtime);
+}
+
 const SOURCE_SECRET_KEY_PATTERN =
   /(api[_-]?key|authorization|bearer|credential|password|private[_-]?key|secret|token)/i;
 const REDACTED_SOURCE_VALUE = '[redacted]';
@@ -1397,6 +1844,12 @@ function extractSuiteMetadataPayload(suite: RawTestSuite): Record<string, unknow
     ? ({ ...(suite.metadata as Record<string, unknown>) } as Record<string, unknown>)
     : {};
 
+  const suiteTags = readMetadataTags(suite.tags);
+  const metadataTags = readMetadataTags(payload.tags);
+  if (suiteTags.length > 0 || metadataTags.length > 0) {
+    payload.tags = dedupeMetadataArray([...suiteTags, ...metadataTags]);
+  }
+
   const top = (suite as JsonObject).governance;
   if (isJsonObject(top)) {
     payload.governance = top as Record<string, unknown>;
@@ -1410,6 +1863,29 @@ function extractSuiteMetadataPayload(suite: RawTestSuite): Record<string, unknow
   return Object.keys(payload).length > 0 ? payload : undefined;
 }
 
+/** Normalizes a tag string or tag array into trimmed, non-blank tag strings. */
+function readMetadataTags(value: unknown): readonly string[] {
+  // A bare string is a single tag; non-string array entries are ignored.
+  const candidates: readonly unknown[] =
+    typeof value === 'string' ? [value] : Array.isArray(value) ? value : [];
+  return candidates
+    .flatMap((entry) => (typeof entry === 'string' ? [entry.trim()] : []))
+    .filter((tag) => tag.length > 0);
+}
+
+/** Removes duplicates in first-seen order; a string never collides with a non-string value. */
+function dedupeMetadataArray(values: readonly unknown[]): readonly unknown[] {
+  const seen = new Set<string>();
+  return values.filter((value) => {
+    const key = typeof value === 'string' ? `s:${value}` : `j:${JSON.stringify(value)}`;
+    if (seen.has(key)) {
+      return false;
+    }
+    seen.add(key);
+    return true;
+  });
+}
+
 /**
  * Merge a suite-level metadata payload into a case's metadata map. The same rules apply to
  * every key in the payload: arrays concatenate suite-first and deduplicate; nested objects
@@ -1425,16 +1901,7 @@ function mergeSuiteMetadataPayload(
   for (const [key, suiteVal] of Object.entries(suitePayload)) {
     const caseVal = result[key];
     if (Array.isArray(suiteVal) && Array.isArray(caseVal)) {
-      const seen = new Set<string>();
-      const out: unknown[] = [];
-      for (const v of [...suiteVal, ...caseVal]) {
-        const k = typeof v === 'string' ? v : JSON.stringify(v);
-        if (!seen.has(k)) {
-          seen.add(k);
-          out.push(v);
-        }
-      }
-      result[key] = out;
+      result[key] = dedupeMetadataArray([...suiteVal, ...caseVal]);
     } else if (isJsonObject(suiteVal) && isJsonObject(caseVal)) {
       result[key] = mergeSuiteMetadataPayload(
         caseVal as Record<string, unknown>,
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 07e8bd821..3d8d7e8aa 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -23,10 +23,8 @@ export {
 } from './evaluation/loaders/agent-skills-parser.js';
 export {
   loadConfig,
-  resolveDefaultExperimentReference,
   resolveResultsConfigForProject,
   type AgentVConfig as AgentVYamlConfig,
-  type ExperimentsConfig,
   type ResultsConfig,
 } from './evaluation/loaders/config-loader.js';
 export {
diff --git a/packages/core/test/evaluation/config.test.ts b/packages/core/test/evaluation/config.test.ts
index af0b7a8f0..818debae1 100644
--- a/packages/core/test/evaluation/config.test.ts
+++ b/packages/core/test/evaluation/config.test.ts
@@ -41,15 +41,6 @@ describe('defineConfig execution defaults', () => {
     });
   });
 
-  it('accepts typed experiment defaults', () => {
-    const config = defineConfig({
-      defaultExperiment: 'smoke',
-      experiments: { default: 'experiments/default.yaml' },
-    });
-    expect(config.defaultExperiment).toBe('smoke');
-    expect(config.experiments?.default).toBe('experiments/default.yaml');
-  });
-
   it('rejects non-boolean verbose', () => {
     expect(() => defineConfig({ execution: { verbose: 'yes' } } as never)).toThrow();
   });
diff --git a/packages/core/test/evaluation/eval-inline-experiment.test.ts b/packages/core/test/evaluation/eval-inline-experiment.test.ts
new file mode 100644
index 000000000..08234ba71
--- /dev/null
+++ b/packages/core/test/evaluation/eval-inline-experiment.test.ts
@@ -0,0 +1,548 @@
+import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
+import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
+import os from 'node:os';
+import path from 'node:path';
+
+import { loadTestSuite } from '../../src/evaluation/yaml-parser.js';
+
+describe('eval.yaml inline experiment and tests imports', () => {
+  let tempDir: string;
+
+  beforeEach(async () => {
+    tempDir = await mkdtemp(path.join(os.tmpdir(), 'agentv-inline-experiment-'));
+  });
+
+  afterEach(async () => {
+    await rm(tempDir, { recursive: true, force: true });
+  });
+
+  it('parses top-level experiment as the canonical runtime block', async () => {
+    const evalPath = path.join(tempDir, 'runtime.eval.yaml');
+    await writeFile(
+      evalPath,
+      [
+        'experiment:',
+        '  targets: [codex, claude]',
+        '  workers: 2',
+        '  threshold: 0.7',
+        '  repeat:',
+        '    count: 2',
+        '    strategy: mean',
+        '  timeout_seconds: 30',
+        '  budget_usd: 1.5',
+        'tests:',
+        '  - id: one',
+        '    input: hello',
+        '    criteria: ok',
+        '',
+      ].join('\n'),
+    );
+
+    const suite = await loadTestSuite(evalPath, tempDir);
+
+    expect(suite.experimentConfig).toMatchObject({
+      targets: ['codex', 'claude'],
+      workers: 2,
+      threshold: 0.7,
+      repeat: { count: 2, strategy: 'mean' },
+      timeoutSeconds: 30,
+      budgetUsd: 1.5,
+    });
+    expect(suite.targets).toEqual(['codex', 'claude']);
+    expect(suite.workers).toBe(2);
+  });
+
+  it('accepts top-level execution as a legacy runtime alias but rejects both blocks', async () => {
+    const legacyPath = path.join(tempDir, 'legacy.eval.yaml');
+    await writeFile(
+      legacyPath,
+      [
+        'execution:',
+        '  target: mock',
+        'tests:',
+        '  - id: one',
+        '    input: hello',
+        '    criteria: ok',
+        '',
+      ].join('\n'),
+    );
+
+    const legacy = await loadTestSuite(legacyPath, tempDir);
+    expect(legacy.experimentConfig?.target).toBe('mock');
+    expect(legacy.targets).toBeUndefined();
+
+    const conflictPath = path.join(tempDir, 'conflict.eval.yaml');
+    await writeFile(
+      conflictPath,
+      [
+        'experiment:',
+        '  target: codex',
+        'execution:',
+        '  target: claude',
+        'tests:',
+        '  - id: one',
+        '    input: hello',
+        '    criteria: ok',
+        '',
+      ].join('\n'),
+    );
+
+    await expect(loadTestSuite(conflictPath, tempDir)).rejects.toThrow(/experiment.*execution/);
+  });
+
+  it('globs raw case files through tests[].include with deterministic ordering and select filters', async () => {
+    const casesDir = path.join(tempDir, 'cases');
+    await mkdir(casesDir, { recursive: true });
+    await writeFile(
+      path.join(casesDir, 'b.cases.yaml'),
+      [
+        '- id: b-2',
+        '  input: b2',
+        '  criteria: ok',
+        '- id: b-1',
+        '  input: b1',
+        '  criteria: ok',
+      ].join('\n'),
+    );
+    await writeFile(
+      path.join(casesDir, 'a.cases.yaml'),
+      ['- id: a-1', '  input: a1', '  criteria: ok'].join('\n'),
+    );
+    await writeFile(path.join(casesDir, 'c.jsonl'), '{"id":"c-1","input":"c1","criteria":"ok"}\n');
+    const evalPath = path.join(tempDir, 'parent.eval.yaml');
+    await writeFile(
+      evalPath,
+      [
+        'tests:',
+        '  - include: cases/*.cases.yaml',
+        '    type: tests',
+        '    select:',
+        '      test_ids: ["a-*", "b-1"]',
+        '  - include: cases/*.jsonl',
+        '    type: tests',
+        '',
+      ].join('\n'),
+    );
+
+    const suite = await loadTestSuite(evalPath, tempDir);
+
+    expect(suite.tests.map((test) => test.id)).toEqual(['a-1', 'b-1', 'c-1']);
+  });
+
+  it('keeps raw-case shorthand imports for tests strings and list entries', async () => {
+    const casesDir = path.join(tempDir, 'cases');
+    const suitesDir = path.join(tempDir, 'suites');
+    await mkdir(casesDir, { recursive: true });
+    await mkdir(suitesDir, { recursive: true });
+    await writeFile(
+      path.join(casesDir, 'a.cases.yaml'),
+      '- id: a-1\n  input: a1\n  criteria: ok\n',
+    );
+    await writeFile(
+      path.join(casesDir, 'b.cases.yaml'),
+      '- id: b-1\n  input: b1\n  criteria: ok\n',
+    );
+    await writeFile(path.join(casesDir, 'c.jsonl'), '{"id":"c-1","input":"c1","criteria":"ok"}\n');
+    await writeFile(
+      path.join(suitesDir, 'child.eval.yaml'),
+      [
+        'name: child-suite',
+        'tests:',
+        '  - id: suite-1',
+        '    input: suite',
+        '    criteria: ok',
+        '',
+      ].join('\n'),
+    );
+
+    const topLevelPath = path.join(tempDir, 'top-level.eval.yaml');
+    await writeFile(topLevelPath, 'tests: cases/*.cases.yaml\n');
+    const topLevelSuite = await loadTestSuite(topLevelPath, tempDir);
+    expect(topLevelSuite.tests.map((test) => test.id)).toEqual(['a-1', 'b-1']);
+
+    const mixedPath = path.join(tempDir, 'mixed.eval.yaml');
+    await writeFile(
+      mixedPath,
+      [
+        'tests:',
+        '  - cases/*.jsonl',
+        '  - include: suites/*.eval.yaml',
+        '    type: suite',
+        '',
+      ].join('\n'),
+    );
+    const mixedSuite = await loadTestSuite(mixedPath, tempDir);
+    expect(mixedSuite.tests.map((test) => test.id)).toEqual(['suite-1', 'c-1']);
+    expect(mixedSuite.tests[0]?.suite).toBe('child-suite');
+    expect(mixedSuite.tests[0]?.source?.importedSuiteName).toBe('child-suite');
+
+    const invalidPath = path.join(tempDir, 'invalid.eval.yaml');
+    await writeFile(invalidPath, 'tests: suites/*.eval.yaml\n');
+    await expect(loadTestSuite(invalidPath, tempDir)).rejects.toThrow(
+      /shorthand imports raw case files only/,
+    );
+  });
+
+  it('rejects direct circular suite imports', async () => {
+    const evalPath = path.join(tempDir, 'self.eval.yaml');
+    await writeFile(
+      evalPath,
+      ['name: self', 'tests:', '  - include: self.eval.yaml', '    type: suite', ''].join('\n'),
+    );
+
+    await expect(loadTestSuite(evalPath, tempDir)).rejects.toThrow(
+      /Circular eval suite import: .*self\.eval\.yaml -> .*self\.eval\.yaml/,
+    );
+  });
+
+  it('rejects indirect circular suite imports with the import chain', async () => {
+    const aPath = path.join(tempDir, 'a.eval.yaml');
+    const bPath = path.join(tempDir, 'b.eval.yaml');
+    await writeFile(
+      aPath,
+      ['name: a', 'tests:', '  - include: b.eval.yaml', '    type: suite', ''].join('\n'),
+    );
+    await writeFile(
+      bPath,
+      ['name: b', 'tests:', '  - include: a.eval.yaml', '    type: suite', ''].join('\n'),
+    );
+
+    await expect(loadTestSuite(aPath, tempDir)).rejects.toThrow(
+      /Circular eval suite import: .*a\.eval\.yaml -> .*b\.eval\.yaml -> .*a\.eval\.yaml/,
+    );
+  });
+
+  it('allows sibling re-imports of the same suite', async () => {
+    const childPath = path.join(tempDir, 'child.eval.yaml');
+    await writeFile(
+      childPath,
+      [
+        'name: child',
+        'tests:',
+        '  - id: child-case',
+        '    input: child',
+        '    criteria: ok',
+        '',
+      ].join('\n'),
+    );
+    const parentPath = path.join(tempDir, 'parent.eval.yaml');
+    await writeFile(
+      parentPath,
+      [
+        'name: parent',
+        'tests:',
+        '  - include: child.eval.yaml',
+        '    type: suite',
+        '  - include: child.eval.yaml',
+        '    type: suite',
+        '',
+      ].join('\n'),
+    );
+
+    const suite = await loadTestSuite(parentPath, tempDir);
+
+    expect(suite.tests.map((test) => test.id)).toEqual(['child-case', 'child-case']);
+  });
+
+  it('loads deep non-cyclic suite import chains', async () => {
+    const aPath = path.join(tempDir, 'chain-a.eval.yaml');
+    const bPath = path.join(tempDir, 'chain-b.eval.yaml');
+    const cPath = path.join(tempDir, 'chain-c.eval.yaml');
+    await writeFile(
+      aPath,
+      ['name: chain-a', 'tests:', '  - include: chain-b.eval.yaml', '    type: suite', ''].join(
+        '\n',
+      ),
+    );
+    await writeFile(
+      bPath,
+      ['name: chain-b', 'tests:', '  - include: chain-c.eval.yaml', '    type: suite', ''].join(
+        '\n',
+      ),
+    );
+    await writeFile(
+      cPath,
+      [
+        'name: chain-c',
+        'tests:',
+        '  - id: c-case',
+        '    input: deepest',
+        '    criteria: ok',
+        '',
+      ].join('\n'),
+    );
+
+    const suite = await loadTestSuite(aPath, tempDir);
+
+    expect(suite.tests.map((test) => test.id)).toEqual(['c-case']);
+    expect(suite.tests[0]?.suite).toBe('chain-c');
+    expect(suite.tests[0]?.source?.importedSuiteName).toBe('chain-c');
+    expect(suite.tests[0]?.source?.evalFileAbsolutePath).toBe(cPath);
+  });
+
+  it('filters include entries by tags and metadata selectors', async () => {
+    const casesDir = path.join(tempDir, 'cases');
+    await mkdir(casesDir, { recursive: true });
+    await writeFile(
+      path.join(casesDir, 'selected.cases.yaml'),
+      [
+        '- id: selected',
+        '  input: selected',
+        '  criteria: ok',
+        '  metadata:',
+        '    tags: [sql-migration, review]',
+        '    type: e2e',
+        '    priority: high',
+        '- id: wrong-priority',
+        '  input: wrong',
+        '  criteria: ok',
+        '  metadata:',
+        '    tags: [sql-migration]',
+        '    type: e2e',
+        '    priority: low',
+      ].join('\n'),
+    );
+    const evalPath = path.join(tempDir, 'parent.eval.yaml');
+    await writeFile(
+      evalPath,
+      [
+        'tests:',
+        '  - include: cases/*.cases.yaml',
+        '    type: tests',
+        '    select:',
+        '      tags: sql-*',
+        '      metadata:',
+        '        type: [e2e, regression]',
+        '        priority: high',
+        '',
+      ].join('\n'),
+    );
+
+    const suite = await loadTestSuite(evalPath, tempDir);
+
+    expect(suite.tests.map((test) => test.id)).toEqual(['selected']);
+  });
+
+  it('select.tags filters effective case metadata tags including suite identity tags', async () => {
+    const casesDir = path.join(tempDir, 'cases');
+    await mkdir(casesDir, { recursive: true });
+    await writeFile(
+      path.join(casesDir, 'cases.cases.yaml'),
+      [
+        '- id: inherited-tag',
+        '  input: inherited',
+        '  criteria: ok',
+        '- id: case-tag',
+        '  input: case',
+        '  criteria: ok',
+        '  metadata:',
+        '    tags: [review]',
+      ].join('\n'),
+    );
+    const inheritedPath = path.join(tempDir, 'inherited.eval.yaml');
+    await writeFile(
+      inheritedPath,
+      [
+        'tags: [suite-identity]',
+        'metadata:',
+        '  tags: [sql-migration]',
+        'tests:',
+        '  - include: cases/*.cases.yaml',
+        '    type: tests',
+        '    select:',
+        '      tags: sql-*',
+        '',
+      ].join('\n'),
+    );
+
+    const inheritedSuite = await loadTestSuite(inheritedPath, tempDir);
+    expect(inheritedSuite.tests.map((test) => test.id)).toEqual(['inherited-tag', 'case-tag']);
+    expect(inheritedSuite.tests[1]?.metadata?.tags).toEqual([
+      'suite-identity',
+      'sql-migration',
+      'review',
+    ]);
+
+    const identityPath = path.join(tempDir, 'identity.eval.yaml');
+    await writeFile(
+      identityPath,
+      [
+        'tags: [suite-identity]',
+        'tests:',
+        '  - include: cases/*.cases.yaml',
+        '    type: tests',
+        '    select:',
+        '      tags: suite-identity',
+        '',
+      ].join('\n'),
+    );
+
+    const identitySuite = await loadTestSuite(identityPath, tempDir);
+    expect(identitySuite.tests.map((test) => test.id)).toEqual(['inherited-tag', 'case-tag']);
+    expect(identitySuite.tests[0]?.metadata?.tags).toEqual(['suite-identity']);
+  });
+
+  it('type: suite preserves child suite context and ignores child runtime config', async () => {
+    await writeFile(
+      path.join(tempDir, 'child.eval.yaml'),
+      [
+        'name: child-suite',
+        'experiment:',
+        '  target: child-target',
+        'workspace:',
+        '  path: ./child-workspace',
+        'input: child shared input',
+        'assertions:',
+        '  - type: contains',
+        '    value: child',
+        'tests:',
+        '  - id: child-case',
+        '    input: child case input',
+        '    criteria: ok',
+        '',
+      ].join('\n'),
+    );
+    const parentPath = path.join(tempDir, 'parent.eval.yaml');
+    await writeFile(
+      parentPath,
+      [
+        'name: parent-suite',
+        'experiment:',
+        '  target: parent-target',
+        'workspace:',
+        '  path: ./parent-workspace',
+        'input: parent shared input',
+        'assertions:',
+        '  - type: contains',
+        '    value: parent',
+        'tests:',
+        '  - include: child.eval.yaml',
+        '    type: suite',
+        '',
+      ].join('\n'),
+    );
+
+    const suite = await loadTestSuite(parentPath, tempDir);
+    const test = suite.tests[0];
+
+    expect(suite.experimentConfig?.target).toBe('parent-target');
+    expect(test.suite).toBe('child-suite');
+    expect(test.workspace?.path).toBe('./child-workspace');
+    expect(test.input.map((message) => message.content)).toEqual([
+      'child shared input',
+      'child case input',
+    ]);
+    expect(test.assertions?.[0]?.type).toBe('contains');
+    expect(test.assertions?.[0]).toMatchObject({ value: 'child' });
+  });
+
+  it('applies scoped run overrides with test.run taking precedence over tests[].run', async () => {
+    await writeFile(
+      path.join(tempDir, 'child.eval.yaml'),
+      [
+        'name: child-suite',
+        'experiment:',
+        '  threshold: 0.2',
+        '  repeat:',
+        '    count: 5',
+        'tests:',
+        '  - id: child-default',
+        '    input: default',
+        '    criteria: ok',
+        '  - id: child-critical',
+        '    input: critical',
+        '    criteria: ok',
+        '    run:',
+        '      threshold: 1.0',
+        '      repeat:',
+        '        count: 1',
+        '',
+      ].join('\n'),
+    );
+    const parentPath = path.join(tempDir, 'parent.eval.yaml');
+    await writeFile(
+      parentPath,
+      [
+        'name: parent-suite',
+        'experiment:',
+        '  threshold: 0.8',
+        '  repeat:',
+        '    count: 3',
+        '    strategy: pass_at_k',
+        'tests:',
+        '  - include: child.eval.yaml',
+        '    type: suite',
+        '    run:',
+        '      threshold: 0.9',
+        '      repeat:',
+        '        count: 2',
+        '        strategy: pass_all',
+        '      timeout_seconds: 30',
+        '      budget_usd: 1.25',
+        '',
+      ].join('\n'),
+    );
+
+    const suite = await loadTestSuite(parentPath, tempDir);
+    const byId = new Map(suite.tests.map((test) => [test.id, test]));
+
+    expect(suite.experimentConfig?.threshold).toBe(0.8);
+    expect(suite.experimentConfig?.repeat).toMatchObject({ count: 3, strategy: 'pass_at_k' });
+    expect(byId.get('child-default')?.run).toMatchObject({
+      threshold: 0.9,
+      repeat: { count: 2, strategy: 'pass_all' },
+      timeoutSeconds: 30,
+      budgetUsd: 1.25,
+    });
+    expect(byId.get('child-critical')?.run).toMatchObject({
+      threshold: 1.0,
+      repeat: { count: 1 },
+      timeoutSeconds: 30,
+      budgetUsd: 1.25,
+    });
+    expect(byId.get('child-critical')?.threshold).toBe(1.0);
+  });
+
+  it('type: tests imports only raw cases and applies parent suite context', async () => {
+    await writeFile(
+      path.join(tempDir, 'child.eval.yaml'),
+      [
+        'name: child-suite',
+        'input: child shared input',
+        'assertions:',
+        '  - type: contains',
+        '    value: child',
+        'tests:',
+        '  - id: raw-case',
+        '    input: raw case input',
+        '    criteria: ok',
+        '',
+      ].join('\n'),
+    );
+    const parentPath = path.join(tempDir, 'parent.eval.yaml');
+    await writeFile(
+      parentPath,
+      [
+        'name: parent-suite',
+        'input: parent shared input',
+        'assertions:',
+        '  - type: contains',
+        '    value: parent',
+        'tests:',
+        '  - include: child.eval.yaml',
+        '    type: tests',
+        '',
+      ].join('\n'),
+    );
+
+    const suite = await loadTestSuite(parentPath, tempDir);
+    const test = suite.tests[0];
+
+    expect(test.suite).toBe('parent-suite');
+    expect(test.input.map((message) => message.content)).toEqual([
+      'parent shared input',
+      'raw case input',
+    ]);
+    expect(test.assertions?.[0]).toMatchObject({ type: 'contains', value: 'parent' });
+  });
+});
diff --git a/packages/core/test/evaluation/experiment.test.ts b/packages/core/test/evaluation/experiment.test.ts
index e9cecddf9..7524edec1 100644
--- a/packages/core/test/evaluation/experiment.test.ts
+++ b/packages/core/test/evaluation/experiment.test.ts
@@ -1,65 +1,48 @@
 import { describe, expect, it } from 'bun:test';
-import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
-import os from 'node:os';
-import path from 'node:path';
 
 import {
   buildExperimentArtifactMetadata,
-  deriveExperimentNameFromPath,
-  isExperimentFileReference,
-  loadExperimentConfig,
   normalizeExperimentConfig,
 } from '../../src/evaluation/experiment.js';
 
-describe('experiment config', () => {
+describe('inline experiment config', () => {
   it('normalizes snake_case wire fields to camelCase runtime fields', () => {
     const config = normalizeExperimentConfig({
       name: 'baseline',
       target: 'codex-gpt5',
+      targets: [{ name: 'codex-gpt5', use_target: 'codex' }],
       agent: 'codex',
       model: 'openai/gpt-5.5',
       agent_options: { reasoning_effort: 'high' },
-      suites: [
-        {
-          ref: 'evals/support.eval.yaml',
-          select: { test_ids: ['refund-eligibility', 'missing-order-date'] },
-        },
-      ],
-      scripts: ['build', { script: 'bun test', timeout_seconds: 120 }],
       runs: 3,
       early_exit: false,
       timeout_seconds: 900,
       workers: 4,
+      threshold: 0.8,
       budget_usd: 1.25,
       sandbox: 'auto',
-      setup: [{ script: 'bun install' }],
+      workspace: { mode: 'static', path: './workspace' },
     });
 
     expect(config).toMatchObject({
       name: 'baseline',
       target: 'codex-gpt5',
+      targets: [{ name: 'codex-gpt5', useTarget: 'codex' }],
       agent: 'codex',
       model: 'openai/gpt-5.5',
       agentOptions: { reasoning_effort: 'high' },
-      suites: [
-        {
-          ref: 'evals/support.eval.yaml',
-          select: { testIds: ['refund-eligibility', 'missing-order-date'] },
-        },
-      ],
-      scripts: [{ script: 'build' }, { script: 'bun test', timeoutSeconds: 120 }],
       runs: 3,
       earlyExit: false,
       timeoutSeconds: 900,
       workers: 4,
       budgetUsd: 1.25,
       sandbox: 'auto',
-      setup: [{ script: 'bun install' }],
+      workspace: { mode: 'static', path: './workspace' },
     });
     expect(config.fingerprint).toMatch(/^[a-f0-9]{64}$/);
   });
 
-  it('normalizes experiment-level repeat config with legacy trial strategy parity', () => {
+  it('normalizes repeat config with legacy trial strategy parity', () => {
     const config = normalizeExperimentConfig({
       repeat: {
         count: 4,
@@ -75,34 +58,6 @@ describe('experiment config', () => {
     });
   });
 
-  it('normalizes suite references with suite-local test id selectors', () => {
-    const config = normalizeExperimentConfig({
-      suites: [
-        {
-          ref: 'evals/support-regression.eval.yaml',
-          select: {
-            test_ids: ['refund-eligibility', 'missing-order-date'],
-          },
-        },
-        {
-          ref: 'evals/billing-*.eval.yaml',
-        },
-      ],
-    });
-
-    expect(config.suites).toEqual([
-      {
-        ref: 'evals/support-regression.eval.yaml',
-        select: {
-          testIds: ['refund-eligibility', 'missing-order-date'],
-        },
-      },
-      {
-        ref: 'evals/billing-*.eval.yaml',
-      },
-    ]);
-  });
-
   it('accepts the prerelease trials costLimitUsd spelling only inside repeat', () => {
     const config = normalizeExperimentConfig({
       repeat: {
@@ -118,36 +73,6 @@ describe('experiment config', () => {
     });
   });
 
-  it('loads a YAML experiment file', async () => {
-    const tempDir = mkdtempSync(path.join(os.tmpdir(), 'agentv-experiment-'));
-    try {
-      const experimentPath = path.join(tempDir, 'default.yaml');
-      writeFileSync(
-        experimentPath,
-        [
-          'name: with-skill',
-          'target: copilot',
-          'agent_options:',
-          '  cli_package: "@github/copilot@latest"',
-          'runs: 2',
-          'setup:',
-          '  - script: cp skills/AGENTS.md AGENTS.md',
-          '',
-        ].join('\n'),
-      );
-
-      const config = await loadExperimentConfig(experimentPath);
-
-      expect(config.name).toBe('with-skill');
-      expect(config.target).toBe('copilot');
-      expect(config.agentOptions).toEqual({ cli_package: '@github/copilot@latest' });
-      expect(config.runs).toBe(2);
-      expect(config.sourcePath).toBe(experimentPath);
-    } finally {
-      rmSync(tempDir, { recursive: true, force: true });
-    }
-  });
-
   it('rejects invalid run counts and sandbox values', () => {
     expect(() => normalizeExperimentConfig({ runs: 0 })).toThrow(/runs/);
     expect(() => normalizeExperimentConfig({ repeat: {} })).toThrow(/repeat.count/);
@@ -161,27 +86,19 @@ describe('experiment config', () => {
       /repeat and runs/,
     );
     expect(() => normalizeExperimentConfig({ sandbox: 'host' })).toThrow(/sandbox/);
-    expect(() => normalizeExperimentConfig({ suites: [] })).toThrow(/suites/);
-    expect(() =>
-      normalizeExperimentConfig({
-        suites: [{ ref: 'evals/support.eval.yaml', select: { test_ids: [] } }],
-      }),
-    ).toThrow(/suites\[0\]\.select\.test_ids/);
+    expect(() => normalizeExperimentConfig({ setup: [{ script: 'bun install' }] })).toThrow(
+      /setup is not supported/,
+    );
+    expect(() => normalizeExperimentConfig({ scripts: ['bun test'] })).toThrow(
+      /scripts are not supported/,
+    );
   });
 
-  it('builds safe snake_case artifact metadata', () => {
+  it('builds safe snake_case artifact metadata without agent options', () => {
     const config = normalizeExperimentConfig({
       name: 'baseline',
       target: 'codex',
       agent_options: { secret: 'not persisted' },
-      setup: [{ script: 'bun install' }],
-      scripts: [{ script: 'bun test' }],
-      suites: [
-        {
-          ref: 'evals/support.eval.yaml',
-          select: { test_ids: ['refund-*'] },
-        },
-      ],
       repeat: { count: 2, strategy: 'mean', cost_limit_usd: 0.5 },
       early_exit: true,
       timeout_seconds: 120,
@@ -193,12 +110,6 @@ describe('experiment config', () => {
     expect(metadata).toMatchObject({
       name: 'baseline',
       target: 'codex',
-      suites: [
-        {
-          ref: 'evals/support.eval.yaml',
-          select: { test_ids: ['refund-*'] },
-        },
-      ],
       repeat: {
         count: 2,
         strategy: 'mean',
@@ -211,18 +122,6 @@ describe('experiment config', () => {
     expect(metadata).not.toHaveProperty('agent_options');
     expect(metadata).not.toHaveProperty('setup');
     expect(metadata).not.toHaveProperty('scripts');
-  });
-
-  it('detects experiment file references separately from labels', () => {
-    expect(isExperimentFileReference('experiments/default.yaml')).toBe(true);
-    expect(isExperimentFileReference('default.yaml')).toBe(true);
-    expect(isExperimentFileReference('baseline')).toBe(false);
-  });
-
-  it('derives experiment names from file paths', () => {
-    expect(deriveExperimentNameFromPath('/repo/experiments/baseline.experiment.ts')).toBe(
-      'baseline',
-    );
-    expect(deriveExperimentNameFromPath('/repo/experiments/with-skill.yaml')).toBe('with-skill');
+    expect(metadata).not.toHaveProperty('source_path');
   });
 });
diff --git a/packages/core/test/evaluation/loaders/config-loader.test.ts b/packages/core/test/evaluation/loaders/config-loader.test.ts
index b34329da1..adb5a0c62 100644
--- a/packages/core/test/evaluation/loaders/config-loader.test.ts
+++ b/packages/core/test/evaluation/loaders/config-loader.test.ts
@@ -13,9 +13,7 @@ import {
   extractThreshold,
   loadConfig,
   parseExecutionDefaults,
-  parseExperimentsConfig,
   parseResultsConfig,
-  resolveDefaultExperimentReference,
   resolveResultsConfigForProject,
 } from '../../../src/evaluation/loaders/config-loader.js';
 import type { JsonObject } from '../../../src/evaluation/types.js';
@@ -241,30 +239,30 @@ describe('loadConfig', () => {
     }
   });
 
-  it('loads configured default experiment references', async () => {
+  it('ignores removed configured experiment defaults', async () => {
     const tempDir = mkdtempSync(path.join(os.tmpdir(), 'agentv-default-experiment-'));
+    const warnSpy = spyOn(console, 'warn').mockImplementation(() => {});
     try {
       const projectDir = path.join(tempDir, 'project');
       const evalDir = path.join(projectDir, 'evals');
       const localConfigDir = path.join(projectDir, '.agentv');
       mkdirSync(evalDir, { recursive: true });
       mkdirSync(localConfigDir, { recursive: true });
-      writeFileSync(
-        path.join(localConfigDir, 'config.yaml'),
-        'experiments:\n  default: experiments/default.yaml\n',
-      );
+      writeFileSync(path.join(localConfigDir, 'config.yaml'), 'experiments:\n  default: smoke\n');
 
       const config = await loadConfig(path.join(evalDir, 'suite.eval.yaml'), projectDir);
 
-      expect(config?.experiments?.default).toBe('experiments/default.yaml');
-      expect(resolveDefaultExperimentReference(config)).toBe('experiments/default.yaml');
+      expect(config).not.toHaveProperty('experiments');
+      expect(warnSpy.mock.calls.some((call) => String(call[0]).includes('experiments'))).toBe(true);
     } finally {
+      warnSpy.mockRestore();
       rmSync(tempDir, { recursive: true, force: true });
     }
   });
 
-  it('supports top-level default_experiment as a compatibility shorthand', async () => {
+  it('ignores removed top-level default_experiment shorthand', async () => {
     const tempDir = mkdtempSync(path.join(os.tmpdir(), 'agentv-default-experiment-alias-'));
+    const warnSpy = spyOn(console, 'warn').mockImplementation(() => {});
     try {
       const projectDir = path.join(tempDir, 'project');
       const evalDir = path.join(projectDir, 'evals');
@@ -275,22 +273,17 @@ describe('loadConfig', () => {
 
       const config = await loadConfig(path.join(evalDir, 'suite.eval.yaml'), projectDir);
 
-      expect(config?.default_experiment).toBe('smoke');
-      expect(resolveDefaultExperimentReference(config)).toBe('smoke');
+      expect(config).not.toHaveProperty('default_experiment');
+      expect(
+        warnSpy.mock.calls.some((call) => String(call[0]).includes('default_experiment')),
+      ).toBe(true);
     } finally {
+      warnSpy.mockRestore();
       rmSync(tempDir, { recursive: true, force: true });
     }
   });
 });
 
-describe('parseExperimentsConfig', () => {
-  it('parses experiments.default', () => {
-    expect(parseExperimentsConfig({ default: 'experiments/default.yaml' }, 'config.yaml')).toEqual({
-      default: 'experiments/default.yaml',
-    });
-  });
-});
-
 describe('parseResultsConfig', () => {
   it('parses valid results config with explicit path', () => {
     const result = parseResultsConfig(
diff --git a/packages/core/test/evaluation/trials.test.ts b/packages/core/test/evaluation/trials.test.ts
index a5889c9ed..b3489e475 100644
--- a/packages/core/test/evaluation/trials.test.ts
+++ b/packages/core/test/evaluation/trials.test.ts
@@ -49,6 +49,27 @@ describe('aggregateTrials', () => {
     });
   });
 
+  describe('pass_all strategy', () => {
+    it('uses the weakest attempt score so every attempt must pass', () => {
+      const config: TrialsConfig = { count: 3, strategy: 'pass_all' };
+      const trials: TrialResult[] = [
+        { attempt: 1, score: 1, verdict: 'pass' },
+        { attempt: 2, score: 0.7, verdict: 'fail' },
+        { attempt: 3, score: 0.9, verdict: 'pass' },
+      ];
+
+      const result = aggregateTrials(trials, config);
+
+      expect(result.score).toBe(0.7);
+      expect(result.aggregation.strategy).toBe('pass_all');
+      if (result.aggregation.strategy === 'pass_all') {
+        expect(result.aggregation.passedAttempts).toBe(2);
+        expect(result.aggregation.totalAttempts).toBe(3);
+        expect(result.aggregation.min).toBe(0.7);
+      }
+    });
+  });
+
   describe('mean strategy', () => {
     it('averages scores correctly', () => {
       const trials: TrialResult[] = [
diff --git a/packages/core/test/evaluation/validation/eval-file-schema.test.ts b/packages/core/test/evaluation/validation/eval-file-schema.test.ts
index 5b15429cf..06093ddda 100644
--- a/packages/core/test/evaluation/validation/eval-file-schema.test.ts
+++ b/packages/core/test/evaluation/validation/eval-file-schema.test.ts
@@ -49,4 +49,82 @@ describe('EvalFileSchema input shorthand', () => {
 
     expect(result.success).toBe(false);
   });
+
+  it('accepts inline experiment runtime and include selection entries', () => {
+    const result = EvalFileSchema.safeParse({
+      name: 'wrapper',
+      experiment: {
+        targets: ['codex', 'claude'],
+        workers: 2,
+        threshold: 0.8,
+        repeat: { count: 2, strategy: 'mean' },
+      },
+      tests: [
+        {
+          include: './evals/**/*.eval.yaml',
+          type: 'suite',
+          select: {
+            test_ids: ['pr50857-*'],
+            tags: ['sql-migration'],
+            metadata: {
+              type: ['e2e', 'regression'],
+              priority: 'high',
+            },
+          },
+          run: {
+            threshold: 1,
+            repeat: { count: 2, strategy: 'pass_all' },
+            timeout_seconds: 120,
+            budget_usd: 2,
+          },
+        },
+        {
+          include: './cases/**/*.cases.yaml',
+          type: 'tests',
+        },
+      ],
+    });
+
+    expect(result.success).toBe(true);
+  });
+
+  it('rejects eval files that set both experiment and legacy execution', () => {
+    const result = EvalFileSchema.safeParse({
+      experiment: { target: 'codex' },
+      execution: { target: 'claude' },
+      tests: [baseTest],
+    });
+
+    expect(result.success).toBe(false);
+  });
+
+  it('rejects experiment lifecycle commands', () => {
+    const result = EvalFileSchema.safeParse({
+      experiment: {
+        setup: [{ script: 'bun install' }],
+        scripts: ['bun test'],
+      },
+      tests: [baseTest],
+    });
+
+    expect(result.success).toBe(false);
+  });
+
+  it('rejects scoped run overrides that change the target or setup', () => {
+    const result = EvalFileSchema.safeParse({
+      tests: [
+        {
+          id: 'case-1',
+          input: 'Question',
+          criteria: 'Goal',
+          run: {
+            target: 'other-agent',
+            setup: [{ script: 'bun install' }],
+          },
+        },
+      ],
+    });
+
+    expect(result.success).toBe(false);
+  });
 });
diff --git a/packages/core/test/evaluation/validation/eval-schema-sync.test.ts b/packages/core/test/evaluation/validation/eval-schema-sync.test.ts
index 7db6c29a9..89e410861 100644
--- a/packages/core/test/evaluation/validation/eval-schema-sync.test.ts
+++ b/packages/core/test/evaluation/validation/eval-schema-sync.test.ts
@@ -3,7 +3,6 @@ import { readFile } from 'node:fs/promises';
 import path from 'node:path';
 import { zodToJsonSchema } from 'zod-to-json-schema';
 import { EvalFileSchema } from '../../../src/evaluation/validation/eval-file.schema.js';
-import { ExperimentFileSchema } from '../../../src/evaluation/validation/experiment-file.schema.js';
 
 describe('generated schema sync', () => {
   it('keeps eval-schema.json synced with the Zod schema', async () => {
@@ -33,28 +32,4 @@ describe('generated schema sync', () => {
     // Compare (ignoring formatting differences)
     expect(JSON.parse(JSON.stringify(committed))).toEqual(JSON.parse(JSON.stringify(expected)));
   });
-
-  it('keeps experiment-schema.json synced with the Zod schema', async () => {
-    const repoRoot = path.resolve(import.meta.dirname, '../../../../..');
-    const schemaPath = path.join(
-      repoRoot,
-      'skills-data/agentv-eval-writer/references/experiment-schema.json',
-    );
-
-    const committed = JSON.parse(await readFile(schemaPath, 'utf8'));
-    const generated = zodToJsonSchema(ExperimentFileSchema, {
-      name: 'ExperimentFile',
-      $refStrategy: 'none',
-      target: 'jsonSchema2019-09',
-    });
-
-    const expected = {
-      $schema: 'https://json-schema.org/draft/2019-09/schema',
-      title: 'AgentV Experiment File',
-      description: 'Schema for AgentV experiment YAML files (experiments/*.yaml)',
-      ...generated,
-    };
-
-    expect(JSON.parse(JSON.stringify(committed))).toEqual(JSON.parse(JSON.stringify(expected)));
-  });
 });
diff --git a/packages/core/test/evaluation/validation/eval-validator.test.ts b/packages/core/test/evaluation/validation/eval-validator.test.ts
index dd285c692..f23a05e59 100644
--- a/packages/core/test/evaluation/validation/eval-validator.test.ts
+++ b/packages/core/test/evaluation/validation/eval-validator.test.ts
@@ -34,6 +34,148 @@ describe('validateEvalFile', () => {
     expect(result.errors).toHaveLength(0);
   });
 
+  it('validates inline experiment runtime and tests include entries', async () => {
+    const filePath = path.join(tempDir, 'inline-experiment-include.yaml');
+    await writeFile(
+      filePath,
+      `name: wrapper
+experiment:
+  targets: [codex, claude]
+  workers: 2
+tests:
+  - include: ./evals/**/*.eval.yaml
+    type: suite
+    select:
+      test_ids: [pr50857-*]
+      tags: [sql-migration]
+      metadata:
+        type: [e2e, regression]
+        priority: high
+    run:
+      threshold: 1.0
+      repeat:
+        count: 2
+        strategy: pass_all
+      timeout_seconds: 120
+      budget_usd: 2
+  - include: ./cases/**/*.cases.yaml
+    type: tests
+`,
+    );
+
+    const result = await validateEvalFile(filePath);
+
+    expect(result.valid).toBe(true);
+    expect(result.errors).toHaveLength(0);
+  });
+
+  it('rejects include entries without type', async () => {
+    const filePath = path.join(tempDir, 'include-missing-type.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - include: ./cases/**/*.cases.yaml
+`,
+    );
+
+    const result = await validateEvalFile(filePath);
+
+    expect(result.valid).toBe(false);
+    expect(result.errors.some((error) => error.message.includes("Missing 'type'"))).toBe(true);
+  });
+
+  it('rejects eval files with both experiment and legacy execution', async () => {
+    const filePath = path.join(tempDir, 'runtime-conflict.yaml');
+    await writeFile(
+      filePath,
+      `experiment:
+  target: codex
+execution:
+  target: claude
+tests:
+  - id: test-1
+    criteria: Goal
+    input: Query
+`,
+    );
+
+    const result = await validateEvalFile(filePath);
+
+    expect(result.valid).toBe(false);
+    expect(result.errors.some((error) => error.message.includes('experiment'))).toBe(true);
+  });
+
+  it('rejects scoped run overrides that include target-changing fields', async () => {
+    const filePath = path.join(tempDir, 'invalid-run-override.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - id: test-1
+    criteria: Goal
+    input: Query
+    run:
+      threshold: 1.0
+      target: other-agent
+`,
+    );
+
+    const result = await validateEvalFile(filePath);
+
+    expect(result.valid).toBe(false);
+    expect(result.errors.some((error) => error.location === 'tests[0].run.target')).toBe(true);
+  });
+
+  it('rejects direct circular suite imports', async () => {
+    const filePath = path.join(tempDir, 'validator-self-cycle.eval.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - include: validator-self-cycle.eval.yaml
+    type: suite
+`,
+    );
+
+    const result = await validateEvalFile(filePath);
+
+    expect(result.valid).toBe(false);
+    expect(
+      result.errors.some((error) => error.message.includes('Circular eval suite import')),
+    ).toBe(true);
+    expect(
+      result.errors.some((error) => /validator-self-cycle\.eval\.yaml/.test(error.message)),
+    ).toBe(true);
+  });
+
+  it('rejects indirect circular suite imports', async () => {
+    const aPath = path.join(tempDir, 'validator-a.eval.yaml');
+    const bPath = path.join(tempDir, 'validator-b.eval.yaml');
+    await writeFile(
+      aPath,
+      `tests:
+  - include: validator-b.eval.yaml
+    type: suite
+`,
+    );
+    await writeFile(
+      bPath,
+      `tests:
+  - include: validator-a.eval.yaml
+    type: suite
+`,
+    );
+
+    const result = await validateEvalFile(aPath);
+
+    expect(result.valid).toBe(false);
+    expect(
+      result.errors.some((error) =>
+        /validator-a\.eval\.yaml.*validator-b\.eval\.yaml.*validator-a\.eval\.yaml/.test(
+          error.message,
+        ),
+      ),
+    ).toBe(true);
+  });
+
   it('validates eval file that omits input when sibling PROMPT.md exists', async () => {
     const evalDir = path.join(tempDir, 'prompt-md-fallback');
     await mkdir(evalDir, { recursive: true });
diff --git a/packages/core/test/evaluation/yaml-parser-metadata.test.ts b/packages/core/test/evaluation/yaml-parser-metadata.test.ts
index 0d46e7867..e9653f61d 100644
--- a/packages/core/test/evaluation/yaml-parser-metadata.test.ts
+++ b/packages/core/test/evaluation/yaml-parser-metadata.test.ts
@@ -176,18 +176,19 @@ tests:
 
   it('merges arbitrary suite metadata into each case and lets case scalars override', async () => {
     const { filePath, dir } = createTempYaml(`
+tags: [cargowise, database]
 metadata:
   source_repo: https://github.com/virattt/dexter
   source_commit: 8d9419829f443f84b804d033bb2c3b1fbd788629
   source_file: src/evals/dataset/finance_agent.csv
-  tags: [suite]
+  tags: [sql, database]
 tests:
   - id: case-1
     criteria: "Answer"
     input: "Query"
     metadata:
       source_file: override.csv
-      tags: [case]
+      tags: [review, sql]
 `);
 
     const suite = await loadTestSuite(filePath, dir);
@@ -195,7 +196,7 @@ tests:
       source_repo: 'https://github.com/virattt/dexter',
       source_commit: '8d9419829f443f84b804d033bb2c3b1fbd788629',
       source_file: 'override.csv',
-      tags: ['suite', 'case'],
+      tags: ['cargowise', 'database', 'sql', 'review'],
     });
   });
 
diff --git a/skills-data/agentv-eval-writer/SKILL.md b/skills-data/agentv-eval-writer/SKILL.md
index 86427366d..eaa2a19b8 100644
--- a/skills-data/agentv-eval-writer/SKILL.md
+++ b/skills-data/agentv-eval-writer/SKILL.md
@@ -18,12 +18,15 @@ Comprehensive docs: https://agentv.dev
 
 Treat YAML as the canonical portable model. Prefer authoring `.eval.yaml` / `EVAL.yaml` first, then use TypeScript helpers, Python scripts, or executable graders only when they lower to the same fields or when the evaluation logic must actually run code.
 
-Eval files define what is tested: prompts, datasets, assertions, and task fixtures.
-Experiment files define how those evals run: targets, setup, scripts, timeout,
-sandbox, suite selection, and repeat-run policy. Use `experiments/*.yaml` for
-committed run configurations. In eval YAML, keep `tests[]` as the atomic eval
-definition. In experiment YAML, reference eval suites with `suites[]` and select
-suite-local tests with `select.test_ids[]`.
+Eval files define what is tested and how it runs: prompts, datasets, assertions,
+task fixtures, and the inline `experiment:` runtime block. Use `tests[]` include
+entries for composition. `type: suite` preserves imported suite context;
+`type: tests` imports raw cases only. String-valued `tests` and string entries
+inside `tests[]` are raw-case import shorthand for direct paths, directories, and
+globs; suite imports must use `include:` with `type: suite`. Use scoped `run:`
+on include entries or individual tests only for `threshold`, `repeat`,
+`timeout_seconds`, and `budget_usd`; keep target selection, setup, and workspace
+mutation under the parent `experiment:`.
 
 Use `@agentv/sdk` for TypeScript helper imports. Do not use `@agentv/eval` for new evals, examples, scaffolds, or skill guidance; it was a deprecated compatibility package and has been removed from this repository.
 
@@ -89,9 +92,7 @@ tests:
         content: "What's my name?"
     expected_output: "Your name is Alice."
     assertions:
-      - type: rubrics
-        criteria:
-          - Correctly recalls the user's name from earlier in the conversation
+      - Correctly recalls the user's name from earlier in the conversation
 ```
 
 **Guidelines:** preserve exact wording in `expected_output`; aim for 5–15 tests per transcript; pick exchanges that test different capabilities.
@@ -100,7 +101,7 @@ tests:
 
 ```yaml
 description: Example eval
-execution:
+experiment:
   target: default
 
 tests:
@@ -109,16 +110,14 @@ tests:
     input: "Say hello"
     expected_output: "Hello! How can I help you?"
     assertions:
-      - type: rubrics
-        criteria:
-          - Greeting is friendly and warm
-          - Offers to help
+      - Greeting is friendly and warm
+      - Offers to help
 ```
 
 ## Eval File Structure
 
-**Required:** `tests` (array or string path)
-**Optional:** `name`, `description`, `version`, `author`, `tags`, `license`, `requires`, `execution`, `suite`, `workspace`, `assertions`, `input`
+**Required:** `tests` (array or string raw-case path)
+**Optional:** `name`, `description`, `version`, `author`, `tags`, `license`, `requires`, `experiment`, `suite`, `workspace`, `assertions`, `input`
 
 **Test fields:**
 
@@ -202,23 +201,30 @@ The external file can be YAML (array of test objects) or JSONL.
 `assertions` defines graders at the suite level or per-test level. It is the canonical field for all graders:
 
 ```yaml
-# Suite-level (appended to every test)
+# Suite-level: mix exact checks with rubric shorthand when both matter.
 assertions:
   - type: is-json
     required: true
   - type: contains
     value: "status"
+  - Correctly answers the user's question
+  - Explains the reasoning clearly
 
 tests:
   - id: test-1
-    criteria: Returns JSON
+    criteria: Returns a useful status payload
     input: Get status
-    # Per-test assertions (runs before suite-level)
     assertions:
       - type: equals
         value: '{"status": "ok"}'
+      - Explains what the status means
 ```
 
+Plain strings in `assertions` are rubric criteria and are the preferred shape for
+qualitative agent behavior. Use deterministic assertions (`contains`, `regex`,
+`is-json`, `equals`) only for exact machine-verifiable outputs, and code graders
+when the check must inspect files, run commands, or validate structured state.
+
 ## How `criteria` and `assertions` Interact
 
 `criteria` is a **data field** — it describes what the response should accomplish. It is **not** a grader. How it gets evaluated depends on whether `assertions` is present:
@@ -227,7 +233,7 @@ tests:
 |----------|-------------|----------|
 | `criteria` + **no `assertions`** | Implicit `llm-grader` runs automatically against `criteria` | No |
 | `criteria` + **`assertions` with only deterministic graders** (contains, regex, etc.) | Only declared graders run. `criteria` is **not evaluated**. | Yes — warns that no grader will consume criteria |
-| `criteria` + **`assertions` with a grader** (`llm-grader`, `code-grader`, `rubrics`) | Declared graders run. Graders receive `criteria` as input. | No |
+| `criteria` + **`assertions` with rubric shorthand or a grader** (plain strings, `llm-grader`, `code-grader`, `rubrics`) | Declared graders run. Graders receive `criteria` as input. | No |
 
 ### No assertions → implicit llm-grader
 
@@ -243,7 +249,9 @@ tests:
 
 ### assertions present → no implicit grader
 
-When `assertions` is defined, **only the declared graders run**. If you want an LLM grader alongside deterministic checks, declare it explicitly:
+When `assertions` is defined, **only the declared graders run**. If you need a
+custom LLM prompt or grader target, declare `llm-grader` explicitly. For
+semantic checks alongside deterministic ones, add plain rubric strings:
 
 ```yaml
 tests:
@@ -251,7 +259,7 @@ tests:
     criteria: Response is helpful and mentions the fix
     input: "Debug this function..."
     assertions:
-      - type: llm-grader       # must be explicit when assertions is present
+      - Explains why the bug happens
       - type: contains
         value: "fix"
 ```
@@ -506,16 +514,13 @@ Binary check: is the output valid JSON?
 
 ### rubrics
 ```yaml
-- type: rubrics
-  criteria:
-    - id: accuracy
-      outcome: Correctly identifies the denied party
-      weight: 5.0
-    - id: reasoning
-      outcome: Provides clear reasoning
-      weight: 3.0
-```
-LLM-judged structured evaluation with weighted criteria. Criteria items support `id`, `outcome`, `weight`, and `required` fields.
+- Correctly identifies the denied party
+- Provides clear reasoning
+```
+LLM-judged structured evaluation. Plain strings are the preferred shorthand.
+Use `type: rubrics` only when you need weighted criteria, `required: false`,
+`min_score`, or score ranges. In that form, each `criteria` item supports the
+`id`, `outcome`, `weight`, and `required` fields.
 Use optional `operator: correctness` for positive support checks or `operator: contradiction` for guard criteria where omission is acceptable but incompatible claims fail.
 
 See `references/rubric-grader.md` for score-range mode and scoring formula.
@@ -766,7 +771,6 @@ Do not invent a separate Opik-specific eval surface. Keep the eval definition in
 ## Schemas
 
 - Eval file: `references/eval-schema.json`
-- Experiment file: `references/experiment-schema.json`
 - Config: `references/config-schema.json`
 
 ## Accessing reference files
@@ -775,14 +779,12 @@ To load a specific reference without pulling the entire skill into context:
 
 ```bash
 agentv skills get agentv-eval-writer --ref eval-schema.json
-agentv skills get agentv-eval-writer --ref experiment-schema.json
 ```
 
 Or resolve the skill directory and read files directly:
 
 ```bash
 cat $(agentv skills path agentv-eval-writer)/references/eval-schema.json
-cat $(agentv skills path agentv-eval-writer)/references/experiment-schema.json
 ```
 
 Use `--full` to retrieve every file in the skill at once.
diff --git a/skills-data/agentv-eval-writer/references/eval-schema.json b/skills-data/agentv-eval-writer/references/eval-schema.json
index 2abcdd98d..21514a2bc 100644
--- a/skills-data/agentv-eval-writer/references/eval-schema.json
+++ b/skills-data/agentv-eval-writer/references/eval-schema.json
@@ -154,2699 +154,2080 @@
             {
               "type": "array",
               "items": {
-                "type": "object",
-                "properties": {
-                  "id": {
-                    "type": "string",
-                    "minLength": 1
-                  },
-                  "vars": {
+                "anyOf": [
+                  {
                     "type": "object",
-                    "properties": {},
-                    "additionalProperties": {}
-                  },
-                  "criteria": {
-                    "type": "string"
-                  },
-                  "input": {
-                    "anyOf": [
-                      {
-                        "type": "string"
+                    "properties": {
+                      "id": {
+                        "type": "string",
+                        "minLength": 1
                       },
-                      {
+                      "vars": {
                         "type": "object",
-                        "properties": {
-                          "role": {
-                            "type": "string",
-                            "enum": ["system", "user", "assistant", "tool"]
+                        "properties": {},
+                        "additionalProperties": {}
+                      },
+                      "criteria": {
+                        "type": "string"
+                      },
+                      "input": {
+                        "anyOf": [
+                          {
+                            "type": "string"
                           },
-                          "content": {
-                            "anyOf": [
-                              {
-                                "type": "string"
-                              },
-                              {
-                                "type": "object",
-                                "properties": {},
-                                "additionalProperties": {}
+                          {
+                            "type": "object",
+                            "properties": {
+                              "role": {
+                                "type": "string",
+                                "enum": ["system", "user", "assistant", "tool"]
                               },
-                              {
-                                "type": "array",
-                                "items": {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "enum": ["text", "file", "image"]
-                                    },
-                                    "value": {
+                              "content": {
+                                "anyOf": [
+                                  {
+                                    "type": "string"
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {},
+                                    "additionalProperties": {}
+                                  },
+                                  {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "enum": ["text", "file", "image"]
+                                        },
+                                        "value": {
+                                          "type": "string"
+                                        }
+                                      },
+                                      "required": ["type", "value"],
+                                      "additionalProperties": false
+                                    }
+                                  }
+                                ]
+                              }
+                            },
+                            "required": ["role", "content"],
+                            "additionalProperties": false
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "role": {
+                                "not": {}
+                              }
+                            },
+                            "additionalProperties": {}
+                          },
+                          {
+                            "type": "array",
+                            "items": {
+                              "type": "object",
+                              "properties": {
+                                "role": {
+                                  "type": "string",
+                                  "enum": ["system", "user", "assistant", "tool"]
+                                },
+                                "content": {
+                                  "anyOf": [
+                                    {
                                       "type": "string"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {},
+                                      "additionalProperties": {}
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["text", "file", "image"]
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      }
                                     }
-                                  },
-                                  "required": ["type", "value"],
-                                  "additionalProperties": false
+                                  ]
                                 }
-                              }
-                            ]
-                          }
-                        },
-                        "required": ["role", "content"],
-                        "additionalProperties": false
-                      },
-                      {
-                        "type": "object",
-                        "properties": {
-                          "role": {
-                            "not": {}
+                              },
+                              "required": ["role", "content"],
+                              "additionalProperties": false
+                            }
                           }
-                        },
-                        "additionalProperties": {}
+                        ]
                       },
-                      {
+                      "input_files": {
                         "type": "array",
                         "items": {
-                          "type": "object",
-                          "properties": {
-                            "role": {
-                              "type": "string",
-                              "enum": ["system", "user", "assistant", "tool"]
-                            },
-                            "content": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {},
-                                  "additionalProperties": {}
+                          "type": "string"
+                        }
+                      },
+                      "expected_output": {
+                        "anyOf": [
+                          {
+                            "type": "string"
+                          },
+                          {
+                            "type": "object",
+                            "properties": {},
+                            "additionalProperties": {}
+                          },
+                          {
+                            "type": "array",
+                            "items": {
+                              "type": "object",
+                              "properties": {
+                                "role": {
+                                  "type": "string",
+                                  "enum": ["system", "user", "assistant", "tool"]
                                 },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["text", "file", "image"]
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
+                                "content": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
                                     },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
-                                  }
+                                    {
+                                      "type": "object",
+                                      "properties": {},
+                                      "additionalProperties": {}
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["text", "file", "image"]
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      }
+                                    }
+                                  ]
                                 }
-                              ]
+                              },
+                              "required": ["role", "content"],
+                              "additionalProperties": false
                             }
-                          },
-                          "required": ["role", "content"],
-                          "additionalProperties": false
-                        }
-                      }
-                    ]
-                  },
-                  "input_files": {
-                    "type": "array",
-                    "items": {
-                      "type": "string"
-                    }
-                  },
-                  "expected_output": {
-                    "anyOf": [
-                      {
-                        "type": "string"
-                      },
-                      {
-                        "type": "object",
-                        "properties": {},
-                        "additionalProperties": {}
+                          }
+                        ]
                       },
-                      {
+                      "assertions": {
                         "type": "array",
                         "items": {
-                          "type": "object",
-                          "properties": {
-                            "role": {
-                              "type": "string",
-                              "enum": ["system", "user", "assistant", "tool"]
-                            },
-                            "content": {
-                              "anyOf": [
-                                {
+                          "anyOf": [
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
                                   "type": "string"
                                 },
-                                {
-                                  "type": "object",
-                                  "properties": {},
-                                  "additionalProperties": {}
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["text", "file", "image"]
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
                                     },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
-                                  }
-                                }
-                              ]
-                            }
-                          },
-                          "required": ["role", "content"],
-                          "additionalProperties": false
-                        }
-                      }
-                    ]
-                  },
-                  "assertions": {
-                    "type": "array",
-                    "items": {
-                      "anyOf": [
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
                                 },
-                                {
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["code-grader", "code_grader"]
-                            },
-                            "command": {
-                              "anyOf": [
-                                {
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["code-grader", "code_grader"]
+                                },
+                                "command": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
+                                    }
+                                  ]
+                                },
+                                "script": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
+                                    }
+                                  ]
+                                },
+                                "cwd": {
                                   "type": "string"
                                 },
-                                {
+                                "target": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "max_calls": {
+                                          "type": "number"
+                                        }
+                                      },
+                                      "additionalProperties": false
+                                    }
+                                  ]
+                                },
+                                "config": {
+                                  "type": "object",
+                                  "additionalProperties": {}
+                                },
+                                "preprocessors": {
                                   "type": "array",
                                   "items": {
-                                    "type": "string"
+                                    "type": "object",
+                                    "properties": {
+                                      "type": {
+                                        "type": "string",
+                                        "minLength": 1
+                                      },
+                                      "command": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string"
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      }
+                                    },
+                                    "required": ["type", "command"],
+                                    "additionalProperties": false
                                   }
                                 }
-                              ]
+                              },
+                              "required": ["type", "command"],
+                              "additionalProperties": false
                             },
-                            "script": {
-                              "anyOf": [
-                                {
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
                                   "type": "string"
                                 },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            },
-                            "cwd": {
-                              "type": "string"
-                            },
-                            "target": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "max_calls": {
-                                      "type": "number"
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
                                     }
-                                  },
-                                  "additionalProperties": false
-                                }
-                              ]
-                            },
-                            "config": {
-                              "type": "object",
-                              "additionalProperties": {}
-                            },
-                            "preprocessors": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "type": {
-                                    "type": "string",
-                                    "minLength": 1
-                                  },
-                                  "command": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string"
-                                      },
-                                      {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    ]
-                                  }
-                                },
-                                "required": ["type", "command"],
-                                "additionalProperties": false
-                              }
-                            }
-                          },
-                          "required": ["type", "command"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                  ]
                                 },
-                                {
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["llm-grader", "llm_grader"]
-                            },
-                            "prompt": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
                                 },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "command": {
-                                      "anyOf": [
-                                        {
-                                          "type": "string"
-                                        },
-                                        {
-                                          "type": "array",
-                                          "items": {
-                                            "type": "string"
-                                          }
-                                        }
-                                      ]
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["llm-grader", "llm_grader"]
+                                },
+                                "prompt": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
                                     },
-                                    "script": {
-                                      "anyOf": [
-                                        {
-                                          "type": "string"
-                                        },
-                                        {
-                                          "type": "array",
-                                          "items": {
-                                            "type": "string"
-                                          }
-                                        }
-                                      ]
-                                    },
-                                    "config": {
-                                      "type": "object",
-                                      "additionalProperties": {}
-                                    }
-                                  },
-                                  "additionalProperties": false
-                                }
-                              ]
-                            },
-                            "rubrics": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "id": {
-                                    "type": "string"
-                                  },
-                                  "outcome": {
-                                    "type": "string"
-                                  },
-                                  "operator": {
-                                    "type": "string",
-                                    "enum": ["correctness", "contradiction"]
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "min_score": {
-                                    "type": "number",
-                                    "exclusiveMinimum": true,
-                                    "minimum": 0,
-                                    "maximum": 1
-                                  },
-                                  "score_ranges": {
-                                    "type": "array",
-                                    "items": {
+                                    {
                                       "type": "object",
                                       "properties": {
-                                        "score_range": {
-                                          "type": "array",
-                                          "minItems": 2,
-                                          "maxItems": 2,
-                                          "items": [
+                                        "command": {
+                                          "anyOf": [
                                             {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
+                                              "type": "string"
                                             },
                                             {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
+                                              "type": "array",
+                                              "items": {
+                                                "type": "string"
+                                              }
                                             }
                                           ]
                                         },
-                                        "outcome": {
-                                          "type": "string",
-                                          "minLength": 1
+                                        "script": {
+                                          "anyOf": [
+                                            {
+                                              "type": "string"
+                                            },
+                                            {
+                                              "type": "array",
+                                              "items": {
+                                                "type": "string"
+                                              }
+                                            }
+                                          ]
+                                        },
+                                        "config": {
+                                          "type": "object",
+                                          "additionalProperties": {}
                                         }
                                       },
-                                      "required": ["score_range", "outcome"],
                                       "additionalProperties": false
                                     }
-                                  }
+                                  ]
                                 },
-                                "additionalProperties": false
-                              }
-                            },
-                            "model": {
-                              "type": "string"
-                            },
-                            "target": {
-                              "type": "string"
-                            },
-                            "config": {
-                              "type": "object",
-                              "additionalProperties": {}
-                            },
-                            "max_steps": {
-                              "type": "integer",
-                              "minimum": 1,
-                              "maximum": 50
-                            },
-                            "temperature": {
-                              "type": "number",
-                              "minimum": 0,
-                              "maximum": 2
-                            },
-                            "preprocessors": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "type": {
-                                    "type": "string",
-                                    "minLength": 1
-                                  },
-                                  "command": {
-                                    "anyOf": [
-                                      {
+                                "rubrics": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "id": {
                                         "type": "string"
                                       },
-                                      {
+                                      "outcome": {
+                                        "type": "string"
+                                      },
+                                      "operator": {
+                                        "type": "string",
+                                        "enum": ["correctness", "contradiction"]
+                                      },
+                                      "weight": {
+                                        "type": "number"
+                                      },
+                                      "required": {
+                                        "type": "boolean"
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "score_ranges": {
                                         "type": "array",
                                         "items": {
-                                          "type": "string"
+                                          "type": "object",
+                                          "properties": {
+                                            "score_range": {
+                                              "type": "array",
+                                              "minItems": 2,
+                                              "maxItems": 2,
+                                              "items": [
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                },
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                }
+                                              ]
+                                            },
+                                            "outcome": {
+                                              "type": "string",
+                                              "minLength": 1
+                                            }
+                                          },
+                                          "required": ["score_range", "outcome"],
+                                          "additionalProperties": false
                                         }
                                       }
-                                    ]
+                                    },
+                                    "additionalProperties": false
                                   }
                                 },
-                                "required": ["type", "command"],
-                                "additionalProperties": false
-                              }
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "include": {
-                              "type": "string",
-                              "minLength": 1
-                            }
-                          },
-                          "required": ["include"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                "model": {
+                                  "type": "string"
                                 },
-                                {
+                                "target": {
+                                  "type": "string"
+                                },
+                                "config": {
+                                  "type": "object",
+                                  "additionalProperties": {}
+                                },
+                                "max_steps": {
+                                  "type": "integer",
+                                  "minimum": 1,
+                                  "maximum": 50
+                                },
+                                "temperature": {
                                   "type": "number",
-                                  "exclusiveMinimum": true,
                                   "minimum": 0,
-                                  "maximum": 1
+                                  "maximum": 2
+                                },
+                                "preprocessors": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "type": {
+                                        "type": "string",
+                                        "minLength": 1
+                                      },
+                                      "command": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string"
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      }
+                                    },
+                                    "required": ["type", "command"],
+                                    "additionalProperties": false
+                                  }
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "composite"
-                            },
-                            "assertions": {
-                              "type": "array",
-                              "items": {}
+                              },
+                              "required": ["type"],
+                              "additionalProperties": false
                             },
-                            "evaluators": {
-                              "type": "array",
-                              "items": {}
+                            {
+                              "type": "object",
+                              "properties": {
+                                "include": {
+                                  "type": "string",
+                                  "minLength": 1
+                                }
+                              },
+                              "required": ["include"],
+                              "additionalProperties": false
                             },
-                            "aggregator": {
-                              "anyOf": [
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "weighted_average"
-                                    },
-                                    "weights": {
-                                      "type": "object",
-                                      "additionalProperties": {
-                                        "type": "number"
-                                      }
-                                    }
-                                  },
-                                  "required": ["type"],
-                                  "additionalProperties": false
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "threshold"
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
                                     },
-                                    "threshold": {
+                                    {
                                       "type": "number",
+                                      "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
                                     }
-                                  },
-                                  "required": ["type", "threshold"],
-                                  "additionalProperties": false
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "code-grader"
-                                    },
-                                    "path": {
-                                      "type": "string"
-                                    },
-                                    "cwd": {
-                                      "type": "string"
-                                    }
-                                  },
-                                  "required": ["type", "path"],
-                                  "additionalProperties": false
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "llm-grader"
-                                    },
-                                    "prompt": {
-                                      "type": "string"
-                                    },
-                                    "model": {
-                                      "type": "string"
-                                    }
-                                  },
-                                  "required": ["type"],
-                                  "additionalProperties": false
-                                }
-                              ]
-                            }
-                          },
-                          "required": ["type", "aggregator"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                  ]
                                 },
-                                {
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["tool-trajectory", "tool_trajectory"]
-                            },
-                            "mode": {
-                              "type": "string",
-                              "enum": ["any_order", "in_order", "exact", "subset", "superset"]
-                            },
-                            "minimums": {
-                              "type": "object",
-                              "additionalProperties": {
-                                "type": "integer",
-                                "minimum": 0
-                              }
-                            },
-                            "expected": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "tool": {
-                                    "type": "string"
-                                  },
-                                  "args": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string",
-                                        "const": "any"
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "composite"
+                                },
+                                "assertions": {
+                                  "type": "array",
+                                  "items": {}
+                                },
+                                "evaluators": {
+                                  "type": "array",
+                                  "items": {}
+                                },
+                                "aggregator": {
+                                  "anyOf": [
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "const": "weighted_average"
+                                        },
+                                        "weights": {
+                                          "type": "object",
+                                          "additionalProperties": {
+                                            "type": "number"
+                                          }
+                                        }
                                       },
-                                      {
-                                        "type": "object",
-                                        "additionalProperties": {}
-                                      }
-                                    ]
-                                  },
-                                  "max_duration_ms": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "maxDurationMs": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "args_match": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string",
-                                        "enum": ["exact", "ignore", "subset", "superset"]
+                                      "required": ["type"],
+                                      "additionalProperties": false
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "const": "threshold"
+                                        },
+                                        "threshold": {
+                                          "type": "number",
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
                                       },
-                                      {
-                                        "type": "array",
-                                        "items": {
+                                      "required": ["type", "threshold"],
+                                      "additionalProperties": false
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "const": "code-grader"
+                                        },
+                                        "path": {
+                                          "type": "string"
+                                        },
+                                        "cwd": {
                                           "type": "string"
                                         }
-                                      }
-                                    ]
-                                  },
-                                  "argsMatch": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string",
-                                        "enum": ["exact", "ignore", "subset", "superset"]
                                       },
-                                      {
-                                        "type": "array",
-                                        "items": {
+                                      "required": ["type", "path"],
+                                      "additionalProperties": false
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "const": "llm-grader"
+                                        },
+                                        "prompt": {
+                                          "type": "string"
+                                        },
+                                        "model": {
                                           "type": "string"
                                         }
-                                      }
-                                    ]
-                                  }
-                                },
-                                "required": ["tool"],
-                                "additionalProperties": false
-                              }
-                            },
-                            "args_match": {
-                              "anyOf": [
-                                {
-                                  "type": "string",
-                                  "enum": ["exact", "ignore", "subset", "superset"]
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
+                                      },
+                                      "required": ["type"],
+                                      "additionalProperties": false
+                                    }
+                                  ]
                                 }
-                              ]
+                              },
+                              "required": ["type", "aggregator"],
+                              "additionalProperties": false
                             },
-                            "argsMatch": {
-                              "anyOf": [
-                                {
-                                  "type": "string",
-                                  "enum": ["exact", "ignore", "subset", "superset"]
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            }
-                          },
-                          "required": ["type", "mode"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                {
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["field-accuracy", "field_accuracy"]
-                            },
-                            "fields": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "path": {
-                                    "type": "string"
-                                  },
-                                  "match": {
-                                    "type": "string",
-                                    "enum": ["exact", "numeric_tolerance", "date"]
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "tolerance": {
-                                    "type": "number",
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["tool-trajectory", "tool_trajectory"]
+                                },
+                                "mode": {
+                                  "type": "string",
+                                  "enum": ["any_order", "in_order", "exact", "subset", "superset"]
+                                },
+                                "minimums": {
+                                  "type": "object",
+                                  "additionalProperties": {
+                                    "type": "integer",
                                     "minimum": 0
-                                  },
-                                  "relative": {
-                                    "type": "boolean"
-                                  },
-                                  "formats": {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
                                   }
                                 },
-                                "required": ["path", "match"],
-                                "additionalProperties": false
+                                "expected": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "tool": {
+                                        "type": "string"
+                                      },
+                                      "args": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string",
+                                            "const": "any"
+                                          },
+                                          {
+                                            "type": "object",
+                                            "additionalProperties": {}
+                                          }
+                                        ]
+                                      },
+                                      "max_duration_ms": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "maxDurationMs": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "args_match": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string",
+                                            "enum": ["exact", "ignore", "subset", "superset"]
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      },
+                                      "argsMatch": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string",
+                                            "enum": ["exact", "ignore", "subset", "superset"]
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      }
+                                    },
+                                    "required": ["tool"],
+                                    "additionalProperties": false
+                                  }
+                                },
+                                "args_match": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string",
+                                      "enum": ["exact", "ignore", "subset", "superset"]
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
+                                    }
+                                  ]
+                                },
+                                "argsMatch": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string",
+                                      "enum": ["exact", "ignore", "subset", "superset"]
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
+                                    }
+                                  ]
+                                }
                               },
-                              "minItems": 1
-                            },
-                            "aggregation": {
-                              "type": "string",
-                              "enum": ["weighted_average", "all_or_nothing"]
-                            }
-                          },
-                          "required": ["type", "fields"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              "required": ["type", "mode"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
                                   "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "latency"
-                            },
-                            "threshold": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type", "threshold"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                  "minimum": 0
                                 },
-                                {
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "cost"
-                            },
-                            "budget": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type", "budget"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
+                                },
+                                "negate": {
                                   "type": "boolean"
                                 },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["field-accuracy", "field_accuracy"]
+                                },
+                                "fields": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "path": {
+                                        "type": "string"
+                                      },
+                                      "match": {
+                                        "type": "string",
+                                        "enum": ["exact", "numeric_tolerance", "date"]
+                                      },
+                                      "required": {
+                                        "type": "boolean"
+                                      },
+                                      "weight": {
+                                        "type": "number"
+                                      },
+                                      "tolerance": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "relative": {
+                                        "type": "boolean"
+                                      },
+                                      "formats": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    },
+                                    "required": ["path", "match"],
+                                    "additionalProperties": false
+                                  },
+                                  "minItems": 1
+                                },
+                                "aggregation": {
+                                  "type": "string",
+                                  "enum": ["weighted_average", "all_or_nothing"]
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["token-usage", "token_usage"]
-                            },
-                            "max_total": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_input": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_output": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type", "fields"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "latency"
+                                },
+                                "threshold": {
+                                  "type": "number",
+                                  "minimum": 0
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["execution-metrics", "execution_metrics"]
-                            },
-                            "max_tool_calls": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_llm_calls": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_tokens": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_cost_usd": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_duration_ms": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "target_exploration_ratio": {
-                              "type": "number",
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "exploration_tolerance": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type", "threshold"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "cost"
+                                },
+                                "budget": {
+                                  "type": "number",
+                                  "minimum": 0
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "contains"
+                              },
+                              "required": ["type", "budget"],
+                              "additionalProperties": false
                             },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["token-usage", "token_usage"]
+                                },
+                                "max_total": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "max_input": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "max_output": {
+                                  "type": "number",
+                                  "minimum": 0
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "regex"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["is-json", "is_json"]
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
+                                },
+                                "negate": {
                                   "type": "boolean"
                                 },
-                                {
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["execution-metrics", "execution_metrics"]
+                                },
+                                "max_tool_calls": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "max_llm_calls": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "max_tokens": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "max_cost_usd": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "max_duration_ms": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "target_exploration_ratio": {
                                   "type": "number",
-                                  "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "exploration_tolerance": {
+                                  "type": "number",
+                                  "minimum": 0
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "equals"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "contains"
+                                },
+                                "value": {
+                                  "type": "string"
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
+                              },
+                              "required": ["type", "value"],
+                              "additionalProperties": false
                             },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "rubrics"
-                            },
-                            "criteria": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "id": {
-                                    "type": "string"
-                                  },
-                                  "outcome": {
-                                    "type": "string"
-                                  },
-                                  "operator": {
-                                    "type": "string",
-                                    "enum": ["correctness", "contradiction"]
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "min_score": {
-                                    "type": "number",
-                                    "exclusiveMinimum": true,
-                                    "minimum": 0,
-                                    "maximum": 1
-                                  },
-                                  "score_ranges": {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "object",
-                                      "properties": {
-                                        "score_range": {
-                                          "type": "array",
-                                          "minItems": 2,
-                                          "maxItems": 2,
-                                          "items": [
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            },
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            }
-                                          ]
-                                        },
-                                        "outcome": {
-                                          "type": "string",
-                                          "minLength": 1
-                                        }
-                                      },
-                                      "required": ["score_range", "outcome"],
-                                      "additionalProperties": false
-                                    }
-                                  }
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                "additionalProperties": false
-                              },
-                              "minItems": 1
-                            }
-                          },
-                          "required": ["type", "criteria"],
-                          "additionalProperties": false
-                        }
-                      ]
-                    }
-                  },
-                  "evaluators": {
-                    "type": "array",
-                    "items": {
-                      "anyOf": [
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                {
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["code-grader", "code_grader"]
-                            },
-                            "command": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
                                 },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "regex"
+                                },
+                                "value": {
+                                  "type": "string"
                                 }
-                              ]
+                              },
+                              "required": ["type", "value"],
+                              "additionalProperties": false
                             },
-                            "script": {
-                              "anyOf": [
-                                {
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
                                   "type": "string"
                                 },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            },
-                            "cwd": {
-                              "type": "string"
-                            },
-                            "target": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "max_calls": {
-                                      "type": "number"
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
                                     }
-                                  },
-                                  "additionalProperties": false
+                                  ]
+                                },
+                                "min_score": {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["is-json", "is_json"]
                                 }
-                              ]
+                              },
+                              "required": ["type"],
+                              "additionalProperties": false
                             },
-                            "config": {
+                            {
                               "type": "object",
-                              "additionalProperties": {}
-                            },
-                            "preprocessors": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "type": {
-                                    "type": "string",
-                                    "minLength": 1
-                                  },
-                                  "command": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string"
-                                      },
-                                      {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    ]
-                                  }
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                "required": ["type", "command"],
-                                "additionalProperties": false
-                              }
-                            }
-                          },
-                          "required": ["type", "command"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                {
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "equals"
+                                },
+                                "value": {
+                                  "type": "string"
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["llm-grader", "llm_grader"]
+                              },
+                              "required": ["type", "value"],
+                              "additionalProperties": false
                             },
-                            "prompt": {
-                              "anyOf": [
-                                {
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
                                   "type": "string"
                                 },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "command": {
-                                      "anyOf": [
-                                        {
-                                          "type": "string"
-                                        },
-                                        {
-                                          "type": "array",
-                                          "items": {
-                                            "type": "string"
-                                          }
-                                        }
-                                      ]
-                                    },
-                                    "script": {
-                                      "anyOf": [
-                                        {
-                                          "type": "string"
-                                        },
-                                        {
-                                          "type": "array",
-                                          "items": {
-                                            "type": "string"
-                                          }
-                                        }
-                                      ]
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
                                     },
-                                    "config": {
-                                      "type": "object",
-                                      "additionalProperties": {}
-                                    }
-                                  },
-                                  "additionalProperties": false
-                                }
-                              ]
-                            },
-                            "rubrics": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "id": {
-                                    "type": "string"
-                                  },
-                                  "outcome": {
-                                    "type": "string"
-                                  },
-                                  "operator": {
-                                    "type": "string",
-                                    "enum": ["correctness", "contradiction"]
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "min_score": {
-                                    "type": "number",
-                                    "exclusiveMinimum": true,
-                                    "minimum": 0,
-                                    "maximum": 1
-                                  },
-                                  "score_ranges": {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "object",
-                                      "properties": {
-                                        "score_range": {
-                                          "type": "array",
-                                          "minItems": 2,
-                                          "maxItems": 2,
-                                          "items": [
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            },
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            }
-                                          ]
-                                        },
-                                        "outcome": {
-                                          "type": "string",
-                                          "minLength": 1
-                                        }
-                                      },
-                                      "required": ["score_range", "outcome"],
-                                      "additionalProperties": false
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
                                     }
-                                  }
+                                  ]
                                 },
-                                "additionalProperties": false
-                              }
-                            },
-                            "model": {
-                              "type": "string"
-                            },
-                            "target": {
-                              "type": "string"
-                            },
-                            "config": {
-                              "type": "object",
-                              "additionalProperties": {}
-                            },
-                            "max_steps": {
-                              "type": "integer",
-                              "minimum": 1,
-                              "maximum": 50
-                            },
-                            "temperature": {
-                              "type": "number",
-                              "minimum": 0,
-                              "maximum": 2
-                            },
-                            "preprocessors": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "type": {
-                                    "type": "string",
-                                    "minLength": 1
-                                  },
-                                  "command": {
-                                    "anyOf": [
-                                      {
+                                "min_score": {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "rubrics"
+                                },
+                                "criteria": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "id": {
                                         "type": "string"
                                       },
-                                      {
+                                      "outcome": {
+                                        "type": "string"
+                                      },
+                                      "operator": {
+                                        "type": "string",
+                                        "enum": ["correctness", "contradiction"]
+                                      },
+                                      "weight": {
+                                        "type": "number"
+                                      },
+                                      "required": {
+                                        "type": "boolean"
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "score_ranges": {
                                         "type": "array",
                                         "items": {
-                                          "type": "string"
+                                          "type": "object",
+                                          "properties": {
+                                            "score_range": {
+                                              "type": "array",
+                                              "minItems": 2,
+                                              "maxItems": 2,
+                                              "items": [
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                },
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                }
+                                              ]
+                                            },
+                                            "outcome": {
+                                              "type": "string",
+                                              "minLength": 1
+                                            }
+                                          },
+                                          "required": ["score_range", "outcome"],
+                                          "additionalProperties": false
                                         }
                                       }
-                                    ]
-                                  }
-                                },
-                                "required": ["type", "command"],
-                                "additionalProperties": false
-                              }
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "include": {
-                              "type": "string",
-                              "minLength": 1
+                                    },
+                                    "additionalProperties": false
+                                  },
+                                  "minItems": 1
+                                }
+                              },
+                              "required": ["type", "criteria"],
+                              "additionalProperties": false
                             }
-                          },
-                          "required": ["include"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                          ]
+                        }
+                      },
+                      "evaluators": {
+                        "type": "array",
+                        "items": {
+                          "anyOf": [
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
                                   "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "composite"
-                            },
-                            "assertions": {
-                              "type": "array",
-                              "items": {}
-                            },
-                            "evaluators": {
-                              "type": "array",
-                              "items": {}
-                            },
-                            "aggregator": {
-                              "anyOf": [
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "weighted_average"
-                                    },
-                                    "weights": {
-                                      "type": "object",
-                                      "additionalProperties": {
-                                        "type": "number"
-                                      }
-                                    }
-                                  },
-                                  "required": ["type"],
-                                  "additionalProperties": false
+                                  "minimum": 0
                                 },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "threshold"
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
                                     },
-                                    "threshold": {
+                                    {
                                       "type": "number",
+                                      "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
                                     }
-                                  },
-                                  "required": ["type", "threshold"],
-                                  "additionalProperties": false
+                                  ]
                                 },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "code-grader"
-                                    },
-                                    "path": {
+                                "min_score": {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["code-grader", "code_grader"]
+                                },
+                                "command": {
+                                  "anyOf": [
+                                    {
                                       "type": "string"
                                     },
-                                    "cwd": {
-                                      "type": "string"
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
                                     }
-                                  },
-                                  "required": ["type", "path"],
-                                  "additionalProperties": false
+                                  ]
                                 },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "llm-grader"
-                                    },
-                                    "prompt": {
+                                "script": {
+                                  "anyOf": [
+                                    {
                                       "type": "string"
                                     },
-                                    "model": {
-                                      "type": "string"
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
                                     }
-                                  },
-                                  "required": ["type"],
-                                  "additionalProperties": false
-                                }
-                              ]
-                            }
-                          },
-                          "required": ["type", "aggregator"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                  ]
                                 },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["tool-trajectory", "tool_trajectory"]
-                            },
-                            "mode": {
-                              "type": "string",
-                              "enum": ["any_order", "in_order", "exact", "subset", "superset"]
-                            },
-                            "minimums": {
-                              "type": "object",
-                              "additionalProperties": {
-                                "type": "integer",
-                                "minimum": 0
-                              }
-                            },
-                            "expected": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "tool": {
-                                    "type": "string"
-                                  },
-                                  "args": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string",
-                                        "const": "any"
-                                      },
-                                      {
-                                        "type": "object",
-                                        "additionalProperties": {}
-                                      }
-                                    ]
-                                  },
-                                  "max_duration_ms": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "maxDurationMs": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "args_match": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string",
-                                        "enum": ["exact", "ignore", "subset", "superset"]
-                                      },
-                                      {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
+                                "cwd": {
+                                  "type": "string"
+                                },
+                                "target": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "max_calls": {
+                                          "type": "number"
                                         }
-                                      }
-                                    ]
-                                  },
-                                  "argsMatch": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string",
-                                        "enum": ["exact", "ignore", "subset", "superset"]
                                       },
-                                      {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    ]
-                                  }
+                                      "additionalProperties": false
+                                    }
+                                  ]
                                 },
-                                "required": ["tool"],
-                                "additionalProperties": false
-                              }
-                            },
-                            "args_match": {
-                              "anyOf": [
-                                {
-                                  "type": "string",
-                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                "config": {
+                                  "type": "object",
+                                  "additionalProperties": {}
                                 },
-                                {
+                                "preprocessors": {
                                   "type": "array",
                                   "items": {
-                                    "type": "string"
+                                    "type": "object",
+                                    "properties": {
+                                      "type": {
+                                        "type": "string",
+                                        "minLength": 1
+                                      },
+                                      "command": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string"
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      }
+                                    },
+                                    "required": ["type", "command"],
+                                    "additionalProperties": false
                                   }
                                 }
-                              ]
+                              },
+                              "required": ["type", "command"],
+                              "additionalProperties": false
                             },
-                            "argsMatch": {
-                              "anyOf": [
-                                {
-                                  "type": "string",
-                                  "enum": ["exact", "ignore", "subset", "superset"]
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            }
-                          },
-                          "required": ["type", "mode"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                {
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["field-accuracy", "field_accuracy"]
-                            },
-                            "fields": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "path": {
-                                    "type": "string"
-                                  },
-                                  "match": {
-                                    "type": "string",
-                                    "enum": ["exact", "numeric_tolerance", "date"]
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "tolerance": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "relative": {
-                                    "type": "boolean"
-                                  },
-                                  "formats": {
-                                    "type": "array",
-                                    "items": {
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["llm-grader", "llm_grader"]
+                                },
+                                "prompt": {
+                                  "anyOf": [
+                                    {
                                       "type": "string"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "command": {
+                                          "anyOf": [
+                                            {
+                                              "type": "string"
+                                            },
+                                            {
+                                              "type": "array",
+                                              "items": {
+                                                "type": "string"
+                                              }
+                                            }
+                                          ]
+                                        },
+                                        "script": {
+                                          "anyOf": [
+                                            {
+                                              "type": "string"
+                                            },
+                                            {
+                                              "type": "array",
+                                              "items": {
+                                                "type": "string"
+                                              }
+                                            }
+                                          ]
+                                        },
+                                        "config": {
+                                          "type": "object",
+                                          "additionalProperties": {}
+                                        }
+                                      },
+                                      "additionalProperties": false
                                     }
+                                  ]
+                                },
+                                "rubrics": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "id": {
+                                        "type": "string"
+                                      },
+                                      "outcome": {
+                                        "type": "string"
+                                      },
+                                      "operator": {
+                                        "type": "string",
+                                        "enum": ["correctness", "contradiction"]
+                                      },
+                                      "weight": {
+                                        "type": "number"
+                                      },
+                                      "required": {
+                                        "type": "boolean"
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "score_ranges": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "score_range": {
+                                              "type": "array",
+                                              "minItems": 2,
+                                              "maxItems": 2,
+                                              "items": [
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                },
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                }
+                                              ]
+                                            },
+                                            "outcome": {
+                                              "type": "string",
+                                              "minLength": 1
+                                            }
+                                          },
+                                          "required": ["score_range", "outcome"],
+                                          "additionalProperties": false
+                                        }
+                                      }
+                                    },
+                                    "additionalProperties": false
                                   }
                                 },
-                                "required": ["path", "match"],
-                                "additionalProperties": false
-                              },
-                              "minItems": 1
-                            },
-                            "aggregation": {
-                              "type": "string",
-                              "enum": ["weighted_average", "all_or_nothing"]
-                            }
-                          },
-                          "required": ["type", "fields"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                "model": {
+                                  "type": "string"
                                 },
-                                {
+                                "target": {
+                                  "type": "string"
+                                },
+                                "config": {
+                                  "type": "object",
+                                  "additionalProperties": {}
+                                },
+                                "max_steps": {
+                                  "type": "integer",
+                                  "minimum": 1,
+                                  "maximum": 50
+                                },
+                                "temperature": {
                                   "type": "number",
-                                  "exclusiveMinimum": true,
                                   "minimum": 0,
-                                  "maximum": 1
+                                  "maximum": 2
+                                },
+                                "preprocessors": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "type": {
+                                        "type": "string",
+                                        "minLength": 1
+                                      },
+                                      "command": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string"
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      }
+                                    },
+                                    "required": ["type", "command"],
+                                    "additionalProperties": false
+                                  }
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "latency"
+                              },
+                              "required": ["type"],
+                              "additionalProperties": false
                             },
-                            "threshold": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type", "threshold"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                            {
+                              "type": "object",
+                              "properties": {
+                                "include": {
+                                  "type": "string",
+                                  "minLength": 1
+                                }
+                              },
+                              "required": ["include"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
                                   "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "cost"
-                            },
-                            "budget": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type", "budget"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                  "minimum": 0
                                 },
-                                {
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "composite"
+                                },
+                                "assertions": {
+                                  "type": "array",
+                                  "items": {}
+                                },
+                                "evaluators": {
+                                  "type": "array",
+                                  "items": {}
+                                },
+                                "aggregator": {
+                                  "anyOf": [
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "const": "weighted_average"
+                                        },
+                                        "weights": {
+                                          "type": "object",
+                                          "additionalProperties": {
+                                            "type": "number"
+                                          }
+                                        }
+                                      },
+                                      "required": ["type"],
+                                      "additionalProperties": false
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "const": "threshold"
+                                        },
+                                        "threshold": {
+                                          "type": "number",
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      },
+                                      "required": ["type", "threshold"],
+                                      "additionalProperties": false
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "const": "code-grader"
+                                        },
+                                        "path": {
+                                          "type": "string"
+                                        },
+                                        "cwd": {
+                                          "type": "string"
+                                        }
+                                      },
+                                      "required": ["type", "path"],
+                                      "additionalProperties": false
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "const": "llm-grader"
+                                        },
+                                        "prompt": {
+                                          "type": "string"
+                                        },
+                                        "model": {
+                                          "type": "string"
+                                        }
+                                      },
+                                      "required": ["type"],
+                                      "additionalProperties": false
+                                    }
+                                  ]
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["token-usage", "token_usage"]
-                            },
-                            "max_total": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_input": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_output": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type", "aggregator"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["execution-metrics", "execution_metrics"]
-                            },
-                            "max_tool_calls": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_llm_calls": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_tokens": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_cost_usd": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_duration_ms": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "target_exploration_ratio": {
-                              "type": "number",
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "exploration_tolerance": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
+                                },
+                                "negate": {
                                   "type": "boolean"
                                 },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["tool-trajectory", "tool_trajectory"]
+                                },
+                                "mode": {
+                                  "type": "string",
+                                  "enum": ["any_order", "in_order", "exact", "subset", "superset"]
+                                },
+                                "minimums": {
+                                  "type": "object",
+                                  "additionalProperties": {
+                                    "type": "integer",
+                                    "minimum": 0
+                                  }
+                                },
+                                "expected": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "tool": {
+                                        "type": "string"
+                                      },
+                                      "args": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string",
+                                            "const": "any"
+                                          },
+                                          {
+                                            "type": "object",
+                                            "additionalProperties": {}
+                                          }
+                                        ]
+                                      },
+                                      "max_duration_ms": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "maxDurationMs": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "args_match": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string",
+                                            "enum": ["exact", "ignore", "subset", "superset"]
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      },
+                                      "argsMatch": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string",
+                                            "enum": ["exact", "ignore", "subset", "superset"]
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      }
+                                    },
+                                    "required": ["tool"],
+                                    "additionalProperties": false
+                                  }
+                                },
+                                "args_match": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string",
+                                      "enum": ["exact", "ignore", "subset", "superset"]
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
+                                    }
+                                  ]
+                                },
+                                "argsMatch": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string",
+                                      "enum": ["exact", "ignore", "subset", "superset"]
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
+                                    }
+                                  ]
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "contains"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type", "mode"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
                                   "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "regex"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                  "minimum": 0
                                 },
-                                {
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["is-json", "is_json"]
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
+                                },
+                                "negate": {
                                   "type": "boolean"
                                 },
-                                {
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["field-accuracy", "field_accuracy"]
+                                },
+                                "fields": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "path": {
+                                        "type": "string"
+                                      },
+                                      "match": {
+                                        "type": "string",
+                                        "enum": ["exact", "numeric_tolerance", "date"]
+                                      },
+                                      "required": {
+                                        "type": "boolean"
+                                      },
+                                      "weight": {
+                                        "type": "number"
+                                      },
+                                      "tolerance": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "relative": {
+                                        "type": "boolean"
+                                      },
+                                      "formats": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    },
+                                    "required": ["path", "match"],
+                                    "additionalProperties": false
+                                  },
+                                  "minItems": 1
+                                },
+                                "aggregation": {
+                                  "type": "string",
+                                  "enum": ["weighted_average", "all_or_nothing"]
+                                }
+                              },
+                              "required": ["type", "fields"],
+                              "additionalProperties": false
+                            },
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
+                                },
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "latency"
+                                },
+                                "threshold": {
+                                  "type": "number",
+                                  "minimum": 0
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "equals"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type", "threshold"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "rubrics"
-                            },
-                            "criteria": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "id": {
-                                    "type": "string"
-                                  },
-                                  "outcome": {
-                                    "type": "string"
-                                  },
-                                  "operator": {
-                                    "type": "string",
-                                    "enum": ["correctness", "contradiction"]
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "min_score": {
-                                    "type": "number",
-                                    "exclusiveMinimum": true,
-                                    "minimum": 0,
-                                    "maximum": 1
-                                  },
-                                  "score_ranges": {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "object",
-                                      "properties": {
-                                        "score_range": {
-                                          "type": "array",
-                                          "minItems": 2,
-                                          "maxItems": 2,
-                                          "items": [
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            },
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            }
-                                          ]
-                                        },
-                                        "outcome": {
-                                          "type": "string",
-                                          "minLength": 1
-                                        }
-                                      },
-                                      "required": ["score_range", "outcome"],
-                                      "additionalProperties": false
-                                    }
-                                  }
                                 },
-                                "additionalProperties": false
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "cost"
+                                },
+                                "budget": {
+                                  "type": "number",
+                                  "minimum": 0
+                                }
                               },
-                              "minItems": 1
-                            }
-                          },
-                          "required": ["type", "criteria"],
-                          "additionalProperties": false
-                        }
-                      ]
-                    }
-                  },
-                  "execution": {
-                    "type": "object",
-                    "properties": {
-                      "target": {
-                        "type": "string"
-                      },
-                      "targets": {
-                        "type": "array",
-                        "items": {
-                          "anyOf": [
-                            {
-                              "type": "string"
+                              "required": ["type", "budget"],
+                              "additionalProperties": false
                             },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string",
-                                  "minLength": 1
-                                },
-                                "use_target": {
-                                  "type": "string"
-                                },
-                                "hooks": {
-                                  "type": "object",
-                                  "properties": {
-                                    "before_all": {
-                                      "type": "object",
-                                      "properties": {
-                                        "command": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "script": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "timeout_ms": {
-                                          "type": "number"
-                                        },
-                                        "timeoutMs": {
-                                          "type": "number"
-                                        },
-                                        "cwd": {
-                                          "type": "string"
-                                        },
-                                        "reset": {
-                                          "type": "string",
-                                          "enum": ["none", "fast", "strict"]
-                                        }
-                                      },
-                                      "additionalProperties": false
-                                    },
-                                    "before_each": {
-                                      "type": "object",
-                                      "properties": {
-                                        "command": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "script": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "timeout_ms": {
-                                          "type": "number"
-                                        },
-                                        "timeoutMs": {
-                                          "type": "number"
-                                        },
-                                        "cwd": {
-                                          "type": "string"
-                                        },
-                                        "reset": {
-                                          "type": "string",
-                                          "enum": ["none", "fast", "strict"]
-                                        }
-                                      },
-                                      "additionalProperties": false
-                                    },
-                                    "after_each": {
-                                      "type": "object",
-                                      "properties": {
-                                        "command": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "script": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "timeout_ms": {
-                                          "type": "number"
-                                        },
-                                        "timeoutMs": {
-                                          "type": "number"
-                                        },
-                                        "cwd": {
-                                          "type": "string"
-                                        },
-                                        "reset": {
-                                          "type": "string",
-                                          "enum": ["none", "fast", "strict"]
-                                        }
-                                      },
-                                      "additionalProperties": false
-                                    },
-                                    "after_all": {
-                                      "type": "object",
-                                      "properties": {
-                                        "command": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "script": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "timeout_ms": {
-                                          "type": "number"
-                                        },
-                                        "timeoutMs": {
-                                          "type": "number"
-                                        },
-                                        "cwd": {
-                                          "type": "string"
-                                        },
-                                        "reset": {
-                                          "type": "string",
-                                          "enum": ["none", "fast", "strict"]
-                                        }
-                                      },
-                                      "additionalProperties": false
-                                    }
-                                  },
-                                  "additionalProperties": false
-                                }
-                              },
-                              "required": ["name"],
-                              "additionalProperties": false
-                            }
-                          ]
-                        }
-                      },
-                      "workers": {
-                        "type": "integer",
-                        "minimum": 1,
-                        "maximum": 50
-                      },
-                      "assertions": {
-                        "type": "array",
-                        "items": {
-                          "anyOf": [
                             {
                               "type": "object",
                               "properties": {
@@ -2881,86 +2262,22 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": ["code-grader", "code_grader"]
-                                },
-                                "command": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string"
-                                    },
-                                    {
-                                      "type": "array",
-                                      "items": {
-                                        "type": "string"
-                                      }
-                                    }
-                                  ]
-                                },
-                                "script": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string"
-                                    },
-                                    {
-                                      "type": "array",
-                                      "items": {
-                                        "type": "string"
-                                      }
-                                    }
-                                  ]
-                                },
-                                "cwd": {
-                                  "type": "string"
+                                  "enum": ["token-usage", "token_usage"]
                                 },
-                                "target": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "max_calls": {
-                                          "type": "number"
-                                        }
-                                      },
-                                      "additionalProperties": false
-                                    }
-                                  ]
+                                "max_total": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                "config": {
-                                  "type": "object",
-                                  "additionalProperties": {}
+                                "max_input": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                "preprocessors": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "minLength": 1
-                                      },
-                                      "command": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string"
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      }
-                                    },
-                                    "required": ["type", "command"],
-                                    "additionalProperties": false
-                                  }
+                                "max_output": {
+                                  "type": "number",
+                                  "minimum": 0
                                 }
                               },
-                              "required": ["type", "command"],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -2997,175 +2314,41 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": ["llm-grader", "llm_grader"]
+                                  "enum": ["execution-metrics", "execution_metrics"]
                                 },
-                                "prompt": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string"
-                                    },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "command": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "script": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "config": {
-                                          "type": "object",
-                                          "additionalProperties": {}
-                                        }
-                                      },
-                                      "additionalProperties": false
-                                    }
-                                  ]
-                                },
-                                "rubrics": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "id": {
-                                        "type": "string"
-                                      },
-                                      "outcome": {
-                                        "type": "string"
-                                      },
-                                      "operator": {
-                                        "type": "string",
-                                        "enum": ["correctness", "contradiction"]
-                                      },
-                                      "weight": {
-                                        "type": "number"
-                                      },
-                                      "required": {
-                                        "type": "boolean"
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "score_ranges": {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "object",
-                                          "properties": {
-                                            "score_range": {
-                                              "type": "array",
-                                              "minItems": 2,
-                                              "maxItems": 2,
-                                              "items": [
-                                                {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
-                                                },
-                                                {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
-                                                }
-                                              ]
-                                            },
-                                            "outcome": {
-                                              "type": "string",
-                                              "minLength": 1
-                                            }
-                                          },
-                                          "required": ["score_range", "outcome"],
-                                          "additionalProperties": false
-                                        }
-                                      }
-                                    },
-                                    "additionalProperties": false
-                                  }
+                                "max_tool_calls": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                "model": {
-                                  "type": "string"
+                                "max_llm_calls": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                "target": {
-                                  "type": "string"
+                                "max_tokens": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                "config": {
-                                  "type": "object",
-                                  "additionalProperties": {}
+                                "max_cost_usd": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                "max_steps": {
-                                  "type": "integer",
-                                  "minimum": 1,
-                                  "maximum": 50
+                                "max_duration_ms": {
+                                  "type": "number",
+                                  "minimum": 0
                                 },
-                                "temperature": {
+                                "target_exploration_ratio": {
                                   "type": "number",
                                   "minimum": 0,
-                                  "maximum": 2
+                                  "maximum": 1
                                 },
-                                "preprocessors": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "minLength": 1
-                                      },
-                                      "command": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string"
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      }
-                                    },
-                                    "required": ["type", "command"],
-                                    "additionalProperties": false
-                                  }
+                                "exploration_tolerance": {
+                                  "type": "number",
+                                  "minimum": 0
                                 }
                               },
                               "required": ["type"],
                               "additionalProperties": false
                             },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "include": {
-                                  "type": "string",
-                                  "minLength": 1
-                                }
-                              },
-                              "required": ["include"],
-                              "additionalProperties": false
-                            },
                             {
                               "type": "object",
                               "properties": {
@@ -3200,89 +2383,13 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "const": "composite"
-                                },
-                                "assertions": {
-                                  "type": "array",
-                                  "items": {}
-                                },
-                                "evaluators": {
-                                  "type": "array",
-                                  "items": {}
+                                  "const": "contains"
                                 },
-                                "aggregator": {
-                                  "anyOf": [
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "type": {
-                                          "type": "string",
-                                          "const": "weighted_average"
-                                        },
-                                        "weights": {
-                                          "type": "object",
-                                          "additionalProperties": {
-                                            "type": "number"
-                                          }
-                                        }
-                                      },
-                                      "required": ["type"],
-                                      "additionalProperties": false
-                                    },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "type": {
-                                          "type": "string",
-                                          "const": "threshold"
-                                        },
-                                        "threshold": {
-                                          "type": "number",
-                                          "minimum": 0,
-                                          "maximum": 1
-                                        }
-                                      },
-                                      "required": ["type", "threshold"],
-                                      "additionalProperties": false
-                                    },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "type": {
-                                          "type": "string",
-                                          "const": "code-grader"
-                                        },
-                                        "path": {
-                                          "type": "string"
-                                        },
-                                        "cwd": {
-                                          "type": "string"
-                                        }
-                                      },
-                                      "required": ["type", "path"],
-                                      "additionalProperties": false
-                                    },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "type": {
-                                          "type": "string",
-                                          "const": "llm-grader"
-                                        },
-                                        "prompt": {
-                                          "type": "string"
-                                        },
-                                        "model": {
-                                          "type": "string"
-                                        }
-                                      },
-                                      "required": ["type"],
-                                      "additionalProperties": false
-                                    }
-                                  ]
+                                "value": {
+                                  "type": "string"
                                 }
                               },
-                              "required": ["type", "aggregator"],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -3319,110 +2426,13 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": ["tool-trajectory", "tool_trajectory"]
-                                },
-                                "mode": {
-                                  "type": "string",
-                                  "enum": ["any_order", "in_order", "exact", "subset", "superset"]
+                                  "const": "regex"
                                 },
-                                "minimums": {
-                                  "type": "object",
-                                  "additionalProperties": {
-                                    "type": "integer",
-                                    "minimum": 0
-                                  }
-                                },
-                                "expected": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "tool": {
-                                        "type": "string"
-                                      },
-                                      "args": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string",
-                                            "const": "any"
-                                          },
-                                          {
-                                            "type": "object",
-                                            "additionalProperties": {}
-                                          }
-                                        ]
-                                      },
-                                      "max_duration_ms": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "maxDurationMs": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "args_match": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string",
-                                            "enum": ["exact", "ignore", "subset", "superset"]
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      },
-                                      "argsMatch": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string",
-                                            "enum": ["exact", "ignore", "subset", "superset"]
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      }
-                                    },
-                                    "required": ["tool"],
-                                    "additionalProperties": false
-                                  }
-                                },
-                                "args_match": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string",
-                                      "enum": ["exact", "ignore", "subset", "superset"]
-                                    },
-                                    {
-                                      "type": "array",
-                                      "items": {
-                                        "type": "string"
-                                      }
-                                    }
-                                  ]
-                                },
-                                "argsMatch": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string",
-                                      "enum": ["exact", "ignore", "subset", "superset"]
-                                    },
-                                    {
-                                      "type": "array",
-                                      "items": {
-                                        "type": "string"
-                                      }
-                                    }
-                                  ]
+                                "value": {
+                                  "type": "string"
                                 }
                               },
-                              "required": ["type", "mode"],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -3459,51 +2469,10 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": ["field-accuracy", "field_accuracy"]
-                                },
-                                "fields": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "path": {
-                                        "type": "string"
-                                      },
-                                      "match": {
-                                        "type": "string",
-                                        "enum": ["exact", "numeric_tolerance", "date"]
-                                      },
-                                      "required": {
-                                        "type": "boolean"
-                                      },
-                                      "weight": {
-                                        "type": "number"
-                                      },
-                                      "tolerance": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "relative": {
-                                        "type": "boolean"
-                                      },
-                                      "formats": {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    },
-                                    "required": ["path", "match"],
-                                    "additionalProperties": false
-                                  },
-                                  "minItems": 1
-                                },
-                                "aggregation": {
-                                  "type": "string",
-                                  "enum": ["weighted_average", "all_or_nothing"]
+                                  "enum": ["is-json", "is_json"]
                                 }
                               },
-                              "required": ["type", "fields"],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -3540,14 +2509,13 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "const": "latency"
+                                  "const": "equals"
                                 },
-                                "threshold": {
-                                  "type": "number",
-                                  "minimum": 0
+                                "value": {
+                                  "type": "string"
                                 }
                               },
-                              "required": ["type", "threshold"],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -3584,3844 +2552,2737 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "const": "cost"
+                                  "const": "rubrics"
                                 },
-                                "budget": {
-                                  "type": "number",
-                                  "minimum": 0
+                                "criteria": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "id": {
+                                        "type": "string"
+                                      },
+                                      "outcome": {
+                                        "type": "string"
+                                      },
+                                      "operator": {
+                                        "type": "string",
+                                        "enum": ["correctness", "contradiction"]
+                                      },
+                                      "weight": {
+                                        "type": "number"
+                                      },
+                                      "required": {
+                                        "type": "boolean"
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "score_ranges": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "score_range": {
+                                              "type": "array",
+                                              "minItems": 2,
+                                              "maxItems": 2,
+                                              "items": [
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                },
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                }
+                                              ]
+                                            },
+                                            "outcome": {
+                                              "type": "string",
+                                              "minLength": 1
+                                            }
+                                          },
+                                          "required": ["score_range", "outcome"],
+                                          "additionalProperties": false
+                                        }
+                                      }
+                                    },
+                                    "additionalProperties": false
+                                  },
+                                  "minItems": 1
                                 }
                               },
-                              "required": ["type", "budget"],
+                              "required": ["type", "criteria"],
                               "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
+                            }
+                          ]
+                        }
+                      },
+                      "execution": {
+                        "type": "object",
+                        "properties": {
+                          "target": {
+                            "type": "string"
+                          },
+                          "targets": {
+                            "type": "array",
+                            "items": {
+                              "anyOf": [
+                                {
                                   "type": "string"
                                 },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string",
+                                      "minLength": 1
                                     },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["token-usage", "token_usage"]
-                                },
-                                "max_total": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_input": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_output": {
-                                  "type": "number",
-                                  "minimum": 0
+                                    "use_target": {
+                                      "type": "string"
+                                    },
+                                    "hooks": {
+                                      "type": "object",
+                                      "properties": {
+                                        "before_all": {
+                                          "type": "object",
+                                          "properties": {
+                                            "command": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "script": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "timeout_ms": {
+                                              "type": "number"
+                                            },
+                                            "timeoutMs": {
+                                              "type": "number"
+                                            },
+                                            "cwd": {
+                                              "type": "string"
+                                            },
+                                            "reset": {
+                                              "type": "string",
+                                              "enum": ["none", "fast", "strict"]
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        },
+                                        "before_each": {
+                                          "type": "object",
+                                          "properties": {
+                                            "command": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "script": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "timeout_ms": {
+                                              "type": "number"
+                                            },
+                                            "timeoutMs": {
+                                              "type": "number"
+                                            },
+                                            "cwd": {
+                                              "type": "string"
+                                            },
+                                            "reset": {
+                                              "type": "string",
+                                              "enum": ["none", "fast", "strict"]
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        },
+                                        "after_each": {
+                                          "type": "object",
+                                          "properties": {
+                                            "command": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "script": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "timeout_ms": {
+                                              "type": "number"
+                                            },
+                                            "timeoutMs": {
+                                              "type": "number"
+                                            },
+                                            "cwd": {
+                                              "type": "string"
+                                            },
+                                            "reset": {
+                                              "type": "string",
+                                              "enum": ["none", "fast", "strict"]
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        },
+                                        "after_all": {
+                                          "type": "object",
+                                          "properties": {
+                                            "command": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "script": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "timeout_ms": {
+                                              "type": "number"
+                                            },
+                                            "timeoutMs": {
+                                              "type": "number"
+                                            },
+                                            "cwd": {
+                                              "type": "string"
+                                            },
+                                            "reset": {
+                                              "type": "string",
+                                              "enum": ["none", "fast", "strict"]
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        }
+                                      },
+                                      "additionalProperties": false
+                                    }
+                                  },
+                                  "required": ["name"],
+                                  "additionalProperties": false
                                 }
-                              },
-                              "required": ["type"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                              ]
+                            }
+                          },
+                          "workers": {
+                            "type": "integer",
+                            "minimum": 1,
+                            "maximum": 50
+                          },
+                          "assertions": {
+                            "type": "array",
+                            "items": {
+                              "anyOf": [
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    {
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
                                       "type": "number",
                                       "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["execution-metrics", "execution_metrics"]
-                                },
-                                "max_tool_calls": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_llm_calls": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_tokens": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_cost_usd": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_duration_ms": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "target_exploration_ratio": {
-                                  "type": "number",
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "exploration_tolerance": {
-                                  "type": "number",
-                                  "minimum": 0
-                                }
-                              },
-                              "required": ["type"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
+                                    },
+                                    "negate": {
                                       "type": "boolean"
                                     },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["code-grader", "code_grader"]
+                                    },
+                                    "command": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    },
+                                    "script": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    },
+                                    "cwd": {
+                                      "type": "string"
+                                    },
+                                    "target": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "max_calls": {
+                                              "type": "number"
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        }
+                                      ]
+                                    },
+                                    "config": {
+                                      "type": "object",
+                                      "additionalProperties": {}
+                                    },
+                                    "preprocessors": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "minLength": 1
+                                          },
+                                          "command": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          }
+                                        },
+                                        "required": ["type", "command"],
+                                        "additionalProperties": false
+                                      }
                                     }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "contains"
-                                },
-                                "value": {
-                                  "type": "string"
-                                }
-                              },
-                              "required": ["type", "value"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
+                                  },
+                                  "required": ["type", "command"],
+                                  "additionalProperties": false
                                 },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    {
+                                    "weight": {
                                       "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "regex"
-                                },
-                                "value": {
-                                  "type": "string"
-                                }
-                              },
-                              "required": ["type", "value"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                      "minimum": 0
                                     },
-                                    {
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
                                       "type": "number",
                                       "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["is-json", "is_json"]
-                                }
-                              },
-                              "required": ["type"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
+                                    },
+                                    "negate": {
                                       "type": "boolean"
                                     },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "equals"
-                                },
-                                "value": {
-                                  "type": "string"
-                                }
-                              },
-                              "required": ["type", "value"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["llm-grader", "llm_grader"]
                                     },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "rubrics"
-                                },
-                                "criteria": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "id": {
-                                        "type": "string"
-                                      },
-                                      "outcome": {
-                                        "type": "string"
-                                      },
-                                      "operator": {
-                                        "type": "string",
-                                        "enum": ["correctness", "contradiction"]
-                                      },
-                                      "weight": {
-                                        "type": "number"
-                                      },
-                                      "required": {
-                                        "type": "boolean"
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "score_ranges": {
-                                        "type": "array",
-                                        "items": {
+                                    "prompt": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
                                           "type": "object",
                                           "properties": {
-                                            "score_range": {
-                                              "type": "array",
-                                              "minItems": 2,
-                                              "maxItems": 2,
-                                              "items": [
+                                            "command": {
+                                              "anyOf": [
                                                 {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
+                                                  "type": "string"
                                                 },
                                                 {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
                                                 }
                                               ]
                                             },
-                                            "outcome": {
-                                              "type": "string",
-                                              "minLength": 1
+                                            "script": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "config": {
+                                              "type": "object",
+                                              "additionalProperties": {}
                                             }
                                           },
-                                          "required": ["score_range", "outcome"],
                                           "additionalProperties": false
                                         }
-                                      }
-                                    },
-                                    "additionalProperties": false
-                                  },
-                                  "minItems": 1
-                                }
-                              },
-                              "required": ["type", "criteria"],
-                              "additionalProperties": false
-                            }
-                          ]
-                        }
-                      },
-                      "evaluators": {
-                        "type": "array",
-                        "items": {
-                          "anyOf": [
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["code-grader", "code_grader"]
-                                },
-                                "command": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string"
+                                      ]
                                     },
-                                    {
+                                    "rubrics": {
                                       "type": "array",
                                       "items": {
-                                        "type": "string"
+                                        "type": "object",
+                                        "properties": {
+                                          "id": {
+                                            "type": "string"
+                                          },
+                                          "outcome": {
+                                            "type": "string"
+                                          },
+                                          "operator": {
+                                            "type": "string",
+                                            "enum": ["correctness", "contradiction"]
+                                          },
+                                          "weight": {
+                                            "type": "number"
+                                          },
+                                          "required": {
+                                            "type": "boolean"
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "score_ranges": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "score_range": {
+                                                  "type": "array",
+                                                  "minItems": 2,
+                                                  "maxItems": 2,
+                                                  "items": [
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    },
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    }
+                                                  ]
+                                                },
+                                                "outcome": {
+                                                  "type": "string",
+                                                  "minLength": 1
+                                                }
+                                              },
+                                              "required": ["score_range", "outcome"],
+                                              "additionalProperties": false
+                                            }
+                                          }
+                                        },
+                                        "additionalProperties": false
                                       }
-                                    }
-                                  ]
-                                },
-                                "script": {
-                                  "anyOf": [
-                                    {
+                                    },
+                                    "model": {
                                       "type": "string"
                                     },
-                                    {
+                                    "target": {
+                                      "type": "string"
+                                    },
+                                    "config": {
+                                      "type": "object",
+                                      "additionalProperties": {}
+                                    },
+                                    "max_steps": {
+                                      "type": "integer",
+                                      "minimum": 1,
+                                      "maximum": 50
+                                    },
+                                    "temperature": {
+                                      "type": "number",
+                                      "minimum": 0,
+                                      "maximum": 2
+                                    },
+                                    "preprocessors": {
                                       "type": "array",
                                       "items": {
-                                        "type": "string"
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "minLength": 1
+                                          },
+                                          "command": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          }
+                                        },
+                                        "required": ["type", "command"],
+                                        "additionalProperties": false
                                       }
                                     }
-                                  ]
-                                },
-                                "cwd": {
-                                  "type": "string"
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
                                 },
-                                "target": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "max_calls": {
-                                          "type": "number"
-                                        }
-                                      },
-                                      "additionalProperties": false
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "include": {
+                                      "type": "string",
+                                      "minLength": 1
                                     }
-                                  ]
+                                  },
+                                  "required": ["include"],
+                                  "additionalProperties": false
                                 },
-                                "config": {
+                                {
                                   "type": "object",
-                                  "additionalProperties": {}
-                                },
-                                "preprocessors": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "minLength": 1
-                                      },
-                                      "command": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string"
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      }
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    "required": ["type", "command"],
-                                    "additionalProperties": false
-                                  }
-                                }
-                              },
-                              "required": ["type", "command"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
                                     },
-                                    {
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
                                       "type": "number",
                                       "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["llm-grader", "llm_grader"]
-                                },
-                                "prompt": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string"
                                     },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "command": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "composite"
+                                    },
+                                    "assertions": {
+                                      "type": "array",
+                                      "items": {}
+                                    },
+                                    "evaluators": {
+                                      "type": "array",
+                                      "items": {}
+                                    },
+                                    "aggregator": {
+                                      "anyOf": [
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "const": "weighted_average"
                                             },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
+                                            "weights": {
+                                              "type": "object",
+                                              "additionalProperties": {
+                                                "type": "number"
                                               }
                                             }
-                                          ]
+                                          },
+                                          "required": ["type"],
+                                          "additionalProperties": false
                                         },
-                                        "script": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "const": "threshold"
                                             },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
+                                            "threshold": {
+                                              "type": "number",
+                                              "minimum": 0,
+                                              "maximum": 1
                                             }
-                                          ]
+                                          },
+                                          "required": ["type", "threshold"],
+                                          "additionalProperties": false
                                         },
-                                        "config": {
-                                          "type": "object",
-                                          "additionalProperties": {}
-                                        }
-                                      },
-                                      "additionalProperties": false
-                                    }
-                                  ]
-                                },
-                                "rubrics": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "id": {
-                                        "type": "string"
-                                      },
-                                      "outcome": {
-                                        "type": "string"
-                                      },
-                                      "operator": {
-                                        "type": "string",
-                                        "enum": ["correctness", "contradiction"]
-                                      },
-                                      "weight": {
-                                        "type": "number"
-                                      },
-                                      "required": {
-                                        "type": "boolean"
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "score_ranges": {
-                                        "type": "array",
-                                        "items": {
+                                        {
                                           "type": "object",
                                           "properties": {
-                                            "score_range": {
-                                              "type": "array",
-                                              "minItems": 2,
-                                              "maxItems": 2,
-                                              "items": [
-                                                {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
-                                                },
-                                                {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
-                                                }
-                                              ]
+                                            "type": {
+                                              "type": "string",
+                                              "const": "code-grader"
                                             },
-                                            "outcome": {
+                                            "path": {
+                                              "type": "string"
+                                            },
+                                            "cwd": {
+                                              "type": "string"
+                                            }
+                                          },
+                                          "required": ["type", "path"],
+                                          "additionalProperties": false
+                                        },
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
                                               "type": "string",
-                                              "minLength": 1
+                                              "const": "llm-grader"
+                                            },
+                                            "prompt": {
+                                              "type": "string"
+                                            },
+                                            "model": {
+                                              "type": "string"
                                             }
                                           },
-                                          "required": ["score_range", "outcome"],
+                                          "required": ["type"],
                                           "additionalProperties": false
                                         }
-                                      }
-                                    },
-                                    "additionalProperties": false
-                                  }
-                                },
-                                "model": {
-                                  "type": "string"
-                                },
-                                "target": {
-                                  "type": "string"
+                                      ]
+                                    }
+                                  },
+                                  "required": ["type", "aggregator"],
+                                  "additionalProperties": false
                                 },
-                                "config": {
+                                {
                                   "type": "object",
-                                  "additionalProperties": {}
-                                },
-                                "max_steps": {
-                                  "type": "integer",
-                                  "minimum": 1,
-                                  "maximum": 50
-                                },
-                                "temperature": {
-                                  "type": "number",
-                                  "minimum": 0,
-                                  "maximum": 2
-                                },
-                                "preprocessors": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "minLength": 1
-                                      },
-                                      "command": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string"
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      }
-                                    },
-                                    "required": ["type", "command"],
-                                    "additionalProperties": false
-                                  }
-                                }
-                              },
-                              "required": ["type"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "include": {
-                                  "type": "string",
-                                  "minLength": 1
-                                }
-                              },
-                              "required": ["include"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    {
+                                    "weight": {
                                       "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "composite"
-                                },
-                                "assertions": {
-                                  "type": "array",
-                                  "items": {}
-                                },
-                                "evaluators": {
-                                  "type": "array",
-                                  "items": {}
-                                },
-                                "aggregator": {
-                                  "anyOf": [
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "type": {
-                                          "type": "string",
-                                          "const": "weighted_average"
-                                        },
-                                        "weights": {
-                                          "type": "object",
-                                          "additionalProperties": {
-                                            "type": "number"
-                                          }
-                                        }
-                                      },
-                                      "required": ["type"],
-                                      "additionalProperties": false
+                                      "minimum": 0
                                     },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "type": {
-                                          "type": "string",
-                                          "const": "threshold"
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
                                         },
-                                        "threshold": {
+                                        {
                                           "type": "number",
+                                          "exclusiveMinimum": true,
                                           "minimum": 0,
                                           "maximum": 1
                                         }
-                                      },
-                                      "required": ["type", "threshold"],
-                                      "additionalProperties": false
+                                      ]
                                     },
-                                    {
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["tool-trajectory", "tool_trajectory"]
+                                    },
+                                    "mode": {
+                                      "type": "string",
+                                      "enum": [
+                                        "any_order",
+                                        "in_order",
+                                        "exact",
+                                        "subset",
+                                        "superset"
+                                      ]
+                                    },
+                                    "minimums": {
                                       "type": "object",
-                                      "properties": {
-                                        "type": {
-                                          "type": "string",
-                                          "const": "code-grader"
+                                      "additionalProperties": {
+                                        "type": "integer",
+                                        "minimum": 0
+                                      }
+                                    },
+                                    "expected": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "tool": {
+                                            "type": "string"
+                                          },
+                                          "args": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "const": "any"
+                                              },
+                                              {
+                                                "type": "object",
+                                                "additionalProperties": {}
+                                              }
+                                            ]
+                                          },
+                                          "max_duration_ms": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "maxDurationMs": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "args_match": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "enum": ["exact", "ignore", "subset", "superset"]
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          },
+                                          "argsMatch": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "enum": ["exact", "ignore", "subset", "superset"]
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          }
                                         },
-                                        "path": {
-                                          "type": "string"
+                                        "required": ["tool"],
+                                        "additionalProperties": false
+                                      }
+                                    },
+                                    "args_match": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string",
+                                          "enum": ["exact", "ignore", "subset", "superset"]
                                         },
-                                        "cwd": {
-                                          "type": "string"
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
                                         }
-                                      },
-                                      "required": ["type", "path"],
-                                      "additionalProperties": false
+                                      ]
                                     },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "type": {
+                                    "argsMatch": {
+                                      "anyOf": [
+                                        {
                                           "type": "string",
-                                          "const": "llm-grader"
+                                          "enum": ["exact", "ignore", "subset", "superset"]
                                         },
-                                        "prompt": {
-                                          "type": "string"
-                                        },
-                                        "model": {
-                                          "type": "string"
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
                                         }
-                                      },
-                                      "required": ["type"],
-                                      "additionalProperties": false
+                                      ]
                                     }
-                                  ]
-                                }
-                              },
-                              "required": ["type", "aggregator"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
+                                  },
+                                  "required": ["type", "mode"],
+                                  "additionalProperties": false
                                 },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    {
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
                                       "type": "number",
                                       "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["field-accuracy", "field_accuracy"]
+                                    },
+                                    "fields": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "path": {
+                                            "type": "string"
+                                          },
+                                          "match": {
+                                            "type": "string",
+                                            "enum": ["exact", "numeric_tolerance", "date"]
+                                          },
+                                          "required": {
+                                            "type": "boolean"
+                                          },
+                                          "weight": {
+                                            "type": "number"
+                                          },
+                                          "tolerance": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "relative": {
+                                            "type": "boolean"
+                                          },
+                                          "formats": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        },
+                                        "required": ["path", "match"],
+                                        "additionalProperties": false
+                                      },
+                                      "minItems": 1
+                                    },
+                                    "aggregation": {
+                                      "type": "string",
+                                      "enum": ["weighted_average", "all_or_nothing"]
                                     }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["tool-trajectory", "tool_trajectory"]
-                                },
-                                "mode": {
-                                  "type": "string",
-                                  "enum": ["any_order", "in_order", "exact", "subset", "superset"]
+                                  },
+                                  "required": ["type", "fields"],
+                                  "additionalProperties": false
                                 },
-                                "minimums": {
+                                {
                                   "type": "object",
-                                  "additionalProperties": {
-                                    "type": "integer",
-                                    "minimum": 0
-                                  }
-                                },
-                                "expected": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "tool": {
-                                        "type": "string"
-                                      },
-                                      "args": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string",
-                                            "const": "any"
-                                          },
-                                          {
-                                            "type": "object",
-                                            "additionalProperties": {}
-                                          }
-                                        ]
-                                      },
-                                      "max_duration_ms": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "maxDurationMs": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "args_match": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string",
-                                            "enum": ["exact", "ignore", "subset", "superset"]
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      },
-                                      "argsMatch": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string",
-                                            "enum": ["exact", "ignore", "subset", "superset"]
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      }
-                                    },
-                                    "required": ["tool"],
-                                    "additionalProperties": false
-                                  }
-                                },
-                                "args_match": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string",
-                                      "enum": ["exact", "ignore", "subset", "superset"]
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    {
-                                      "type": "array",
-                                      "items": {
-                                        "type": "string"
-                                      }
-                                    }
-                                  ]
-                                },
-                                "argsMatch": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string",
-                                      "enum": ["exact", "ignore", "subset", "superset"]
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
                                     },
-                                    {
-                                      "type": "array",
-                                      "items": {
-                                        "type": "string"
-                                      }
-                                    }
-                                  ]
-                                }
-                              },
-                              "required": ["type", "mode"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
                                     },
-                                    {
+                                    "min_score": {
                                       "type": "number",
                                       "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["field-accuracy", "field_accuracy"]
-                                },
-                                "fields": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "path": {
-                                        "type": "string"
-                                      },
-                                      "match": {
-                                        "type": "string",
-                                        "enum": ["exact", "numeric_tolerance", "date"]
-                                      },
-                                      "required": {
-                                        "type": "boolean"
-                                      },
-                                      "weight": {
-                                        "type": "number"
-                                      },
-                                      "tolerance": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "relative": {
-                                        "type": "boolean"
-                                      },
-                                      "formats": {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
                                     },
-                                    "required": ["path", "match"],
-                                    "additionalProperties": false
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "latency"
+                                    },
+                                    "threshold": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    }
                                   },
-                                  "minItems": 1
-                                },
-                                "aggregation": {
-                                  "type": "string",
-                                  "enum": ["weighted_average", "all_or_nothing"]
-                                }
-                              },
-                              "required": ["type", "fields"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
+                                  "required": ["type", "threshold"],
+                                  "additionalProperties": false
                                 },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    {
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
                                       "type": "number",
                                       "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "cost"
+                                    },
+                                    "budget": {
+                                      "type": "number",
+                                      "minimum": 0
                                     }
-                                  ]
+                                  },
+                                  "required": ["type", "budget"],
+                                  "additionalProperties": false
                                 },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "latency"
-                                },
-                                "threshold": {
-                                  "type": "number",
-                                  "minimum": 0
-                                }
-                              },
-                              "required": ["type", "threshold"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    {
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
                                       "type": "number",
                                       "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "cost"
-                                },
-                                "budget": {
-                                  "type": "number",
-                                  "minimum": 0
-                                }
-                              },
-                              "required": ["type", "budget"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
+                                    },
+                                    "negate": {
                                       "type": "boolean"
                                     },
-                                    {
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["token-usage", "token_usage"]
+                                    },
+                                    "max_total": {
                                       "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
+                                      "minimum": 0
+                                    },
+                                    "max_input": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_output": {
+                                      "type": "number",
+                                      "minimum": 0
                                     }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["token-usage", "token_usage"]
-                                },
-                                "max_total": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_input": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_output": {
-                                  "type": "number",
-                                  "minimum": 0
-                                }
-                              },
-                              "required": ["type"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
                                 },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    {
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
                                       "type": "number",
                                       "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["execution-metrics", "execution_metrics"]
-                                },
-                                "max_tool_calls": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_llm_calls": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_tokens": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_cost_usd": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_duration_ms": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "target_exploration_ratio": {
-                                  "type": "number",
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "exploration_tolerance": {
-                                  "type": "number",
-                                  "minimum": 0
-                                }
-                              },
-                              "required": ["type"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
+                                    },
+                                    "negate": {
                                       "type": "boolean"
                                     },
-                                    {
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["execution-metrics", "execution_metrics"]
+                                    },
+                                    "max_tool_calls": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_llm_calls": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_tokens": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_cost_usd": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_duration_ms": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "target_exploration_ratio": {
                                       "type": "number",
-                                      "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
+                                    },
+                                    "exploration_tolerance": {
+                                      "type": "number",
+                                      "minimum": 0
                                     }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
                                 },
-                                "type": {
-                                  "type": "string",
-                                  "const": "contains"
-                                },
-                                "value": {
-                                  "type": "string"
-                                }
-                              },
-                              "required": ["type", "value"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    {
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
                                       "type": "number",
                                       "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "contains"
+                                    },
+                                    "value": {
+                                      "type": "string"
                                     }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "regex"
-                                },
-                                "value": {
-                                  "type": "string"
-                                }
-                              },
-                              "required": ["type", "value"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
+                                  },
+                                  "required": ["type", "value"],
+                                  "additionalProperties": false
                                 },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    {
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
                                       "type": "number",
                                       "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "regex"
+                                    },
+                                    "value": {
+                                      "type": "string"
                                     }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["is-json", "is_json"]
-                                }
-                              },
-                              "required": ["type"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
+                                  },
+                                  "required": ["type", "value"],
+                                  "additionalProperties": false
                                 },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    {
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
                                       "type": "number",
                                       "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["is-json", "is_json"]
                                     }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "equals"
-                                },
-                                "value": {
-                                  "type": "string"
-                                }
-                              },
-                              "required": ["type", "value"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
                                 },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    {
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
                                       "type": "number",
                                       "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "equals"
+                                    },
+                                    "value": {
+                                      "type": "string"
                                     }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "rubrics"
+                                  },
+                                  "required": ["type", "value"],
+                                  "additionalProperties": false
                                 },
-                                "criteria": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "id": {
-                                        "type": "string"
-                                      },
-                                      "outcome": {
-                                        "type": "string"
-                                      },
-                                      "operator": {
-                                        "type": "string",
-                                        "enum": ["correctness", "contradiction"]
-                                      },
-                                      "weight": {
-                                        "type": "number"
-                                      },
-                                      "required": {
-                                        "type": "boolean"
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "rubrics"
+                                    },
+                                    "criteria": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "id": {
+                                            "type": "string"
+                                          },
+                                          "outcome": {
+                                            "type": "string"
+                                          },
+                                          "operator": {
+                                            "type": "string",
+                                            "enum": ["correctness", "contradiction"]
+                                          },
+                                          "weight": {
+                                            "type": "number"
+                                          },
+                                          "required": {
+                                            "type": "boolean"
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "score_ranges": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "score_range": {
+                                                  "type": "array",
+                                                  "minItems": 2,
+                                                  "maxItems": 2,
+                                                  "items": [
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    },
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    }
+                                                  ]
+                                                },
+                                                "outcome": {
+                                                  "type": "string",
+                                                  "minLength": 1
+                                                }
+                                              },
+                                              "required": ["score_range", "outcome"],
+                                              "additionalProperties": false
+                                            }
+                                          }
+                                        },
+                                        "additionalProperties": false
                                       },
-                                      "score_ranges": {
-                                        "type": "array",
-                                        "items": {
+                                      "minItems": 1
+                                    }
+                                  },
+                                  "required": ["type", "criteria"],
+                                  "additionalProperties": false
+                                }
+                              ]
+                            }
+                          },
+                          "evaluators": {
+                            "type": "array",
+                            "items": {
+                              "anyOf": [
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["code-grader", "code_grader"]
+                                    },
+                                    "command": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    },
+                                    "script": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    },
+                                    "cwd": {
+                                      "type": "string"
+                                    },
+                                    "target": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
                                           "type": "object",
                                           "properties": {
-                                            "score_range": {
-                                              "type": "array",
-                                              "minItems": 2,
-                                              "maxItems": 2,
-                                              "items": [
+                                            "max_calls": {
+                                              "type": "number"
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        }
+                                      ]
+                                    },
+                                    "config": {
+                                      "type": "object",
+                                      "additionalProperties": {}
+                                    },
+                                    "preprocessors": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "minLength": 1
+                                          },
+                                          "command": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          }
+                                        },
+                                        "required": ["type", "command"],
+                                        "additionalProperties": false
+                                      }
+                                    }
+                                  },
+                                  "required": ["type", "command"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["llm-grader", "llm_grader"]
+                                    },
+                                    "prompt": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "command": {
+                                              "anyOf": [
                                                 {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
+                                                  "type": "string"
                                                 },
                                                 {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
                                                 }
                                               ]
                                             },
-                                            "outcome": {
-                                              "type": "string",
-                                              "minLength": 1
+                                            "script": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "config": {
+                                              "type": "object",
+                                              "additionalProperties": {}
                                             }
                                           },
-                                          "required": ["score_range", "outcome"],
                                           "additionalProperties": false
                                         }
-                                      }
+                                      ]
                                     },
-                                    "additionalProperties": false
-                                  },
-                                  "minItems": 1
-                                }
-                              },
-                              "required": ["type", "criteria"],
-                              "additionalProperties": false
-                            }
-                          ]
-                        }
-                      },
-                      "skip_defaults": {
-                        "type": "boolean"
-                      },
-                      "cache": {
-                        "type": "boolean"
-                      },
-                      "trials": {
-                        "not": {}
-                      },
-                      "budget_usd": {
-                        "type": "number",
-                        "minimum": 0
-                      },
-                      "budgetUsd": {
-                        "type": "number",
-                        "minimum": 0
-                      },
-                      "fail_on_error": {
-                        "type": "boolean"
-                      },
-                      "failOnError": {
-                        "type": "boolean"
-                      },
-                      "threshold": {
-                        "type": "number",
-                        "minimum": 0,
-                        "maximum": 1
-                      }
-                    },
-                    "additionalProperties": false
-                  },
-                  "workspace": {
-                    "type": "object",
-                    "properties": {
-                      "template": {
-                        "type": "string"
-                      },
-                      "isolation": {
-                        "type": "string",
-                        "enum": ["shared", "per_test"]
-                      },
-                      "repos": {
-                        "type": "array",
-                        "items": {
-                          "type": "object",
-                          "properties": {
-                            "path": {
-                              "type": "string"
-                            },
-                            "repo": {
-                              "type": "string",
-                              "minLength": 1
-                            },
-                            "commit": {
-                              "type": "string",
-                              "minLength": 1
-                            },
-                            "base_commit": {
-                              "type": "string",
-                              "minLength": 1
-                            },
-                            "ancestor": {
-                              "type": "integer",
-                              "minimum": 0
-                            },
-                            "sparse": {
-                              "type": "array",
-                              "items": {
-                                "type": "string"
-                              }
-                            }
-                          },
-                          "additionalProperties": false
-                        }
-                      },
-                      "hooks": {
-                        "type": "object",
-                        "properties": {
-                          "enabled": {
-                            "type": "boolean"
-                          },
-                          "before_all": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "script": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          },
-                          "before_each": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "script": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
+                                    "rubrics": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "id": {
+                                            "type": "string"
+                                          },
+                                          "outcome": {
+                                            "type": "string"
+                                          },
+                                          "operator": {
+                                            "type": "string",
+                                            "enum": ["correctness", "contradiction"]
+                                          },
+                                          "weight": {
+                                            "type": "number"
+                                          },
+                                          "required": {
+                                            "type": "boolean"
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "score_ranges": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "score_range": {
+                                                  "type": "array",
+                                                  "minItems": 2,
+                                                  "maxItems": 2,
+                                                  "items": [
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    },
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    }
+                                                  ]
+                                                },
+                                                "outcome": {
+                                                  "type": "string",
+                                                  "minLength": 1
+                                                }
+                                              },
+                                              "required": ["score_range", "outcome"],
+                                              "additionalProperties": false
+                                            }
+                                          }
+                                        },
+                                        "additionalProperties": false
+                                      }
+                                    },
+                                    "model": {
                                       "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          },
-                          "after_each": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
+                                    },
+                                    "target": {
                                       "type": "string"
+                                    },
+                                    "config": {
+                                      "type": "object",
+                                      "additionalProperties": {}
+                                    },
+                                    "max_steps": {
+                                      "type": "integer",
+                                      "minimum": 1,
+                                      "maximum": 50
+                                    },
+                                    "temperature": {
+                                      "type": "number",
+                                      "minimum": 0,
+                                      "maximum": 2
+                                    },
+                                    "preprocessors": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "minLength": 1
+                                          },
+                                          "command": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          }
+                                        },
+                                        "required": ["type", "command"],
+                                        "additionalProperties": false
+                                      }
                                     }
-                                  }
-                                ]
-                              },
-                              "script": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
                                   },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "include": {
+                                      "type": "string",
+                                      "minLength": 1
                                     }
-                                  }
-                                ]
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          },
-                          "after_all": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
                                   },
-                                  {
-                                    "type": "array",
-                                    "items": {
+                                  "required": ["include"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
                                       "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "script": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          }
-                        },
-                        "additionalProperties": false
-                      },
-                      "mode": {
-                        "type": "string",
-                        "enum": ["pooled", "temp", "static"]
-                      },
-                      "path": {
-                        "type": "string"
-                      },
-                      "docker": {
-                        "type": "object",
-                        "properties": {
-                          "image": {
-                            "type": "string"
-                          },
-                          "timeout": {
-                            "type": "integer",
-                            "minimum": 1
-                          },
-                          "memory": {
-                            "type": "string"
-                          },
-                          "cpus": {
-                            "type": "number",
-                            "minimum": 0.1
-                          }
-                        },
-                        "required": ["image"],
-                        "additionalProperties": false
-                      }
-                    },
-                    "additionalProperties": false
-                  },
-                  "metadata": {
-                    "type": "object",
-                    "additionalProperties": {}
-                  },
-                  "conversation_id": {
-                    "type": "string"
-                  },
-                  "suite": {
-                    "type": "string"
-                  },
-                  "depends_on": {
-                    "type": "array",
-                    "items": {
-                      "type": "string"
-                    }
-                  },
-                  "on_dependency_failure": {
-                    "type": "string",
-                    "enum": ["skip", "fail", "run"]
-                  },
-                  "mode": {
-                    "type": "string",
-                    "enum": ["conversation"]
-                  },
-                  "turns": {
-                    "type": "array",
-                    "items": {
-                      "type": "object",
-                      "properties": {
-                        "input": {
-                          "anyOf": [
-                            {
-                              "type": "string"
-                            },
-                            {
-                              "anyOf": [
-                                {
-                                  "type": "string"
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {},
-                                  "additionalProperties": {}
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["text", "file", "image"]
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
                                     },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
-                                  }
-                                }
-                              ]
-                            }
-                          ]
-                        },
-                        "expected_output": {
-                          "anyOf": [
-                            {
-                              "type": "string"
-                            },
-                            {
-                              "anyOf": [
-                                {
-                                  "type": "string"
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {},
-                                  "additionalProperties": {}
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["text", "file", "image"]
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
                                     },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
-                                  }
-                                }
-                              ]
-                            }
-                          ]
-                        },
-                        "assertions": {
-                          "type": "array",
-                          "items": {
-                            "anyOf": [
-                              {
-                                "type": "string"
-                              },
-                              {
-                                "anyOf": [
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["code-grader", "code_grader"]
-                                      },
-                                      "command": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string"
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      },
-                                      "script": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string"
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      },
-                                      "cwd": {
-                                        "type": "string"
-                                      },
-                                      "target": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "object",
-                                            "properties": {
-                                              "max_calls": {
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "composite"
+                                    },
+                                    "assertions": {
+                                      "type": "array",
+                                      "items": {}
+                                    },
+                                    "evaluators": {
+                                      "type": "array",
+                                      "items": {}
+                                    },
+                                    "aggregator": {
+                                      "anyOf": [
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "const": "weighted_average"
+                                            },
+                                            "weights": {
+                                              "type": "object",
+                                              "additionalProperties": {
                                                 "type": "number"
                                               }
-                                            },
-                                            "additionalProperties": false
-                                          }
-                                        ]
-                                      },
-                                      "config": {
-                                        "type": "object",
-                                        "additionalProperties": {}
-                                      },
-                                      "preprocessors": {
-                                        "type": "array",
-                                        "items": {
+                                            }
+                                          },
+                                          "required": ["type"],
+                                          "additionalProperties": false
+                                        },
+                                        {
                                           "type": "object",
                                           "properties": {
                                             "type": {
                                               "type": "string",
-                                              "minLength": 1
+                                              "const": "threshold"
                                             },
-                                            "command": {
-                                              "anyOf": [
-                                                {
-                                                  "type": "string"
-                                                },
-                                                {
-                                                  "type": "array",
-                                                  "items": {
-                                                    "type": "string"
-                                                  }
-                                                }
-                                              ]
+                                            "threshold": {
+                                              "type": "number",
+                                              "minimum": 0,
+                                              "maximum": 1
                                             }
                                           },
-                                          "required": ["type", "command"],
+                                          "required": ["type", "threshold"],
                                           "additionalProperties": false
-                                        }
-                                      }
-                                    },
-                                    "required": ["type", "command"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["llm-grader", "llm_grader"]
-                                      },
-                                      "prompt": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string"
-                                          },
-                                          {
-                                            "type": "object",
-                                            "properties": {
-                                              "command": {
-                                                "anyOf": [
-                                                  {
-                                                    "type": "string"
-                                                  },
-                                                  {
-                                                    "type": "array",
-                                                    "items": {
-                                                      "type": "string"
-                                                    }
-                                                  }
-                                                ]
-                                              },
-                                              "script": {
-                                                "anyOf": [
-                                                  {
-                                                    "type": "string"
-                                                  },
-                                                  {
-                                                    "type": "array",
-                                                    "items": {
-                                                      "type": "string"
-                                                    }
-                                                  }
-                                                ]
-                                              },
-                                              "config": {
-                                                "type": "object",
-                                                "additionalProperties": {}
-                                              }
-                                            },
-                                            "additionalProperties": false
-                                          }
-                                        ]
-                                      },
-                                      "rubrics": {
-                                        "type": "array",
-                                        "items": {
+                                        },
+                                        {
                                           "type": "object",
                                           "properties": {
-                                            "id": {
-                                              "type": "string"
-                                            },
-                                            "outcome": {
-                                              "type": "string"
-                                            },
-                                            "operator": {
+                                            "type": {
                                               "type": "string",
-                                              "enum": ["correctness", "contradiction"]
-                                            },
-                                            "weight": {
-                                              "type": "number"
-                                            },
-                                            "required": {
-                                              "type": "boolean"
+                                              "const": "code-grader"
                                             },
-                                            "min_score": {
-                                              "type": "number",
-                                              "exclusiveMinimum": true,
-                                              "minimum": 0,
-                                              "maximum": 1
+                                            "path": {
+                                              "type": "string"
                                             },
-                                            "score_ranges": {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "object",
-                                                "properties": {
-                                                  "score_range": {
-                                                    "type": "array",
-                                                    "minItems": 2,
-                                                    "maxItems": 2,
-                                                    "items": [
-                                                      {
-                                                        "type": "integer",
-                                                        "minimum": 0,
-                                                        "maximum": 10
-                                                      },
-                                                      {
-                                                        "type": "integer",
-                                                        "minimum": 0,
-                                                        "maximum": 10
-                                                      }
-                                                    ]
-                                                  },
-                                                  "outcome": {
-                                                    "type": "string",
-                                                    "minLength": 1
-                                                  }
-                                                },
-                                                "required": ["score_range", "outcome"],
-                                                "additionalProperties": false
-                                              }
+                                            "cwd": {
+                                              "type": "string"
                                             }
                                           },
+                                          "required": ["type", "path"],
                                           "additionalProperties": false
-                                        }
-                                      },
-                                      "model": {
-                                        "type": "string"
-                                      },
-                                      "target": {
-                                        "type": "string"
-                                      },
-                                      "config": {
-                                        "type": "object",
-                                        "additionalProperties": {}
-                                      },
-                                      "max_steps": {
-                                        "type": "integer",
-                                        "minimum": 1,
-                                        "maximum": 50
-                                      },
-                                      "temperature": {
-                                        "type": "number",
-                                        "minimum": 0,
-                                        "maximum": 2
-                                      },
-                                      "preprocessors": {
-                                        "type": "array",
-                                        "items": {
+                                        },
+                                        {
                                           "type": "object",
                                           "properties": {
                                             "type": {
                                               "type": "string",
-                                              "minLength": 1
+                                              "const": "llm-grader"
                                             },
-                                            "command": {
-                                              "anyOf": [
-                                                {
-                                                  "type": "string"
-                                                },
-                                                {
-                                                  "type": "array",
-                                                  "items": {
-                                                    "type": "string"
-                                                  }
-                                                }
-                                              ]
+                                            "prompt": {
+                                              "type": "string"
+                                            },
+                                            "model": {
+                                              "type": "string"
                                             }
                                           },
-                                          "required": ["type", "command"],
+                                          "required": ["type"],
                                           "additionalProperties": false
                                         }
-                                      }
-                                    },
-                                    "required": ["type"],
-                                    "additionalProperties": false
+                                      ]
+                                    }
                                   },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "include": {
-                                        "type": "string",
-                                        "minLength": 1
-                                      }
+                                  "required": ["type", "aggregator"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    "required": ["include"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["tool-trajectory", "tool_trajectory"]
+                                    },
+                                    "mode": {
+                                      "type": "string",
+                                      "enum": [
+                                        "any_order",
+                                        "in_order",
+                                        "exact",
+                                        "subset",
+                                        "superset"
+                                      ]
+                                    },
+                                    "minimums": {
+                                      "type": "object",
+                                      "additionalProperties": {
+                                        "type": "integer",
                                         "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
+                                      }
+                                    },
+                                    "expected": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "tool": {
+                                            "type": "string"
                                           },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "composite"
-                                      },
-                                      "assertions": {
-                                        "type": "array",
-                                        "items": {}
-                                      },
-                                      "evaluators": {
-                                        "type": "array",
-                                        "items": {}
-                                      },
-                                      "aggregator": {
-                                        "anyOf": [
-                                          {
-                                            "type": "object",
-                                            "properties": {
-                                              "type": {
+                                          "args": {
+                                            "anyOf": [
+                                              {
                                                 "type": "string",
-                                                "const": "weighted_average"
+                                                "const": "any"
                                               },
-                                              "weights": {
+                                              {
                                                 "type": "object",
-                                                "additionalProperties": {
-                                                  "type": "number"
-                                                }
+                                                "additionalProperties": {}
                                               }
-                                            },
-                                            "required": ["type"],
-                                            "additionalProperties": false
+                                            ]
                                           },
-                                          {
-                                            "type": "object",
-                                            "properties": {
-                                              "type": {
+                                          "max_duration_ms": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "maxDurationMs": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "args_match": {
+                                            "anyOf": [
+                                              {
                                                 "type": "string",
-                                                "const": "threshold"
+                                                "enum": ["exact", "ignore", "subset", "superset"]
                                               },
-                                              "threshold": {
-                                                "type": "number",
-                                                "minimum": 0,
-                                                "maximum": 1
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
                                               }
-                                            },
-                                            "required": ["type", "threshold"],
-                                            "additionalProperties": false
+                                            ]
                                           },
-                                          {
-                                            "type": "object",
-                                            "properties": {
-                                              "type": {
+                                          "argsMatch": {
+                                            "anyOf": [
+                                              {
                                                 "type": "string",
-                                                "const": "code-grader"
-                                              },
-                                              "path": {
-                                                "type": "string"
+                                                "enum": ["exact", "ignore", "subset", "superset"]
                                               },
-                                              "cwd": {
-                                                "type": "string"
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
                                               }
-                                            },
-                                            "required": ["type", "path"],
-                                            "additionalProperties": false
-                                          },
-                                          {
-                                            "type": "object",
-                                            "properties": {
-                                              "type": {
-                                                "type": "string",
-                                                "const": "llm-grader"
-                                              },
-                                              "prompt": {
-                                                "type": "string"
-                                              },
-                                              "model": {
-                                                "type": "string"
-                                              }
-                                            },
-                                            "required": ["type"],
-                                            "additionalProperties": false
+                                            ]
                                           }
-                                        ]
+                                        },
+                                        "required": ["tool"],
+                                        "additionalProperties": false
                                       }
                                     },
-                                    "required": ["type", "aggregator"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
+                                    "args_match": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string",
+                                          "enum": ["exact", "ignore", "subset", "superset"]
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
                                           }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["tool-trajectory", "tool_trajectory"]
-                                      },
-                                      "mode": {
-                                        "type": "string",
-                                        "enum": [
-                                          "any_order",
-                                          "in_order",
-                                          "exact",
-                                          "subset",
-                                          "superset"
-                                        ]
-                                      },
-                                      "minimums": {
-                                        "type": "object",
-                                        "additionalProperties": {
-                                          "type": "integer",
-                                          "minimum": 0
                                         }
-                                      },
-                                      "expected": {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "object",
-                                          "properties": {
-                                            "tool": {
-                                              "type": "string"
-                                            },
-                                            "args": {
-                                              "anyOf": [
-                                                {
-                                                  "type": "string",
-                                                  "const": "any"
-                                                },
-                                                {
-                                                  "type": "object",
-                                                  "additionalProperties": {}
-                                                }
-                                              ]
-                                            },
-                                            "max_duration_ms": {
-                                              "type": "number",
-                                              "minimum": 0
-                                            },
-                                            "maxDurationMs": {
-                                              "type": "number",
-                                              "minimum": 0
-                                            },
-                                            "args_match": {
-                                              "anyOf": [
-                                                {
-                                                  "type": "string",
-                                                  "enum": ["exact", "ignore", "subset", "superset"]
-                                                },
-                                                {
-                                                  "type": "array",
-                                                  "items": {
-                                                    "type": "string"
-                                                  }
-                                                }
-                                              ]
-                                            },
-                                            "argsMatch": {
-                                              "anyOf": [
-                                                {
-                                                  "type": "string",
-                                                  "enum": ["exact", "ignore", "subset", "superset"]
-                                                },
-                                                {
-                                                  "type": "array",
-                                                  "items": {
-                                                    "type": "string"
-                                                  }
-                                                }
-                                              ]
-                                            }
-                                          },
-                                          "required": ["tool"],
-                                          "additionalProperties": false
+                                      ]
+                                    },
+                                    "argsMatch": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string",
+                                          "enum": ["exact", "ignore", "subset", "superset"]
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
                                         }
-                                      },
-                                      "args_match": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string",
-                                            "enum": ["exact", "ignore", "subset", "superset"]
+                                      ]
+                                    }
+                                  },
+                                  "required": ["type", "mode"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["field-accuracy", "field_accuracy"]
+                                    },
+                                    "fields": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "path": {
+                                            "type": "string"
                                           },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      },
-                                      "argsMatch": {
-                                        "anyOf": [
-                                          {
+                                          "match": {
                                             "type": "string",
-                                            "enum": ["exact", "ignore", "subset", "superset"]
+                                            "enum": ["exact", "numeric_tolerance", "date"]
                                           },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      }
-                                    },
-                                    "required": ["type", "mode"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
+                                          "required": {
                                             "type": "boolean"
                                           },
-                                          {
+                                          "weight": {
+                                            "type": "number"
+                                          },
+                                          "tolerance": {
                                             "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["field-accuracy", "field_accuracy"]
-                                      },
-                                      "fields": {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "object",
-                                          "properties": {
-                                            "path": {
+                                            "minimum": 0
+                                          },
+                                          "relative": {
+                                            "type": "boolean"
+                                          },
+                                          "formats": {
+                                            "type": "array",
+                                            "items": {
                                               "type": "string"
-                                            },
-                                            "match": {
-                                              "type": "string",
-                                              "enum": ["exact", "numeric_tolerance", "date"]
-                                            },
-                                            "required": {
-                                              "type": "boolean"
-                                            },
-                                            "weight": {
-                                              "type": "number"
-                                            },
-                                            "tolerance": {
-                                              "type": "number",
-                                              "minimum": 0
-                                            },
-                                            "relative": {
-                                              "type": "boolean"
-                                            },
-                                            "formats": {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
                                             }
-                                          },
-                                          "required": ["path", "match"],
-                                          "additionalProperties": false
+                                          }
                                         },
-                                        "minItems": 1
+                                        "required": ["path", "match"],
+                                        "additionalProperties": false
                                       },
-                                      "aggregation": {
-                                        "type": "string",
-                                        "enum": ["weighted_average", "all_or_nothing"]
-                                      }
+                                      "minItems": 1
                                     },
-                                    "required": ["type", "fields"],
-                                    "additionalProperties": false
+                                    "aggregation": {
+                                      "type": "string",
+                                      "enum": ["weighted_average", "all_or_nothing"]
+                                    }
                                   },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "latency"
-                                      },
-                                      "threshold": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      }
+                                  "required": ["type", "fields"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    "required": ["type", "threshold"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "cost"
-                                      },
-                                      "budget": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      }
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
                                     },
-                                    "required": ["type", "budget"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["token-usage", "token_usage"]
-                                      },
-                                      "max_total": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "max_input": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "max_output": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      }
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
                                     },
-                                    "required": ["type"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["execution-metrics", "execution_metrics"]
-                                      },
-                                      "max_tool_calls": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "max_llm_calls": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "max_tokens": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "max_cost_usd": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "max_duration_ms": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "target_exploration_ratio": {
-                                        "type": "number",
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "exploration_tolerance": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      }
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
                                     },
-                                    "required": ["type"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "contains"
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
+                                    "negate": {
+                                      "type": "boolean"
                                     },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
+                                    "type": {
+                                      "type": "string",
+                                      "const": "latency"
+                                    },
+                                    "threshold": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    }
                                   },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "regex"
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
-                                    },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["is-json", "is_json"]
-                                      }
+                                  "required": ["type", "threshold"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    "required": ["type"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "equals"
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
                                     },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "rubrics"
-                                      },
-                                      "criteria": {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "object",
-                                          "properties": {
-                                            "id": {
-                                              "type": "string"
-                                            },
-                                            "outcome": {
-                                              "type": "string"
-                                            },
-                                            "operator": {
-                                              "type": "string",
-                                              "enum": ["correctness", "contradiction"]
-                                            },
-                                            "weight": {
-                                              "type": "number"
-                                            },
-                                            "required": {
-                                              "type": "boolean"
-                                            },
-                                            "min_score": {
-                                              "type": "number",
-                                              "exclusiveMinimum": true,
-                                              "minimum": 0,
-                                              "maximum": 1
-                                            },
-                                            "score_ranges": {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "object",
-                                                "properties": {
-                                                  "score_range": {
-                                                    "type": "array",
-                                                    "minItems": 2,
-                                                    "maxItems": 2,
-                                                    "items": [
-                                                      {
-                                                        "type": "integer",
-                                                        "minimum": 0,
-                                                        "maximum": 10
-                                                      },
-                                                      {
-                                                        "type": "integer",
-                                                        "minimum": 0,
-                                                        "maximum": 10
-                                                      }
-                                                    ]
-                                                  },
-                                                  "outcome": {
-                                                    "type": "string",
-                                                    "minLength": 1
-                                                  }
-                                                },
-                                                "required": ["score_range", "outcome"],
-                                                "additionalProperties": false
-                                              }
-                                            }
-                                          },
-                                          "additionalProperties": false
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
                                         },
-                                        "minItems": 1
-                                      }
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
                                     },
-                                    "required": ["type", "criteria"],
-                                    "additionalProperties": false
-                                  }
-                                ]
-                              }
-                            ]
-                          }
-                        }
-                      },
-                      "required": ["input"],
-                      "additionalProperties": false
-                    },
-                    "minItems": 1
-                  },
-                  "aggregation": {
-                    "type": "string",
-                    "enum": ["mean", "min", "max"]
-                  },
-                  "on_turn_failure": {
-                    "type": "string",
-                    "enum": ["continue", "stop"]
-                  },
-                  "window_size": {
-                    "type": "integer",
-                    "minimum": 1
-                  }
-                },
-                "required": ["id"],
-                "additionalProperties": false
-              }
-            },
-            {
-              "type": "string"
-            }
-          ]
-        },
-        "eval_cases": {
-          "anyOf": [
-            {
-              "type": "array",
-              "items": {
-                "type": "object",
-                "properties": {
-                  "id": {
-                    "type": "string",
-                    "minLength": 1
-                  },
-                  "vars": {
-                    "type": "object",
-                    "properties": {},
-                    "additionalProperties": {}
-                  },
-                  "criteria": {
-                    "type": "string"
-                  },
-                  "input": {
-                    "anyOf": [
-                      {
-                        "type": "string"
-                      },
-                      {
-                        "type": "object",
-                        "properties": {
-                          "role": {
-                            "type": "string",
-                            "enum": ["system", "user", "assistant", "tool"]
-                          },
-                          "content": {
-                            "anyOf": [
-                              {
-                                "type": "string"
-                              },
-                              {
-                                "type": "object",
-                                "properties": {},
-                                "additionalProperties": {}
-                              },
-                              {
-                                "type": "array",
-                                "items": {
-                                  "type": "object",
-                                  "properties": {
                                     "type": {
                                       "type": "string",
-                                      "enum": ["text", "file", "image"]
+                                      "const": "cost"
                                     },
-                                    "value": {
-                                      "type": "string"
+                                    "budget": {
+                                      "type": "number",
+                                      "minimum": 0
                                     }
                                   },
-                                  "required": ["type", "value"],
+                                  "required": ["type", "budget"],
                                   "additionalProperties": false
-                                }
-                              }
-                            ]
-                          }
-                        },
-                        "required": ["role", "content"],
-                        "additionalProperties": false
-                      },
-                      {
-                        "type": "object",
-                        "properties": {
-                          "role": {
-                            "not": {}
-                          }
-                        },
-                        "additionalProperties": {}
-                      },
-                      {
-                        "type": "array",
-                        "items": {
-                          "type": "object",
-                          "properties": {
-                            "role": {
-                              "type": "string",
-                              "enum": ["system", "user", "assistant", "tool"]
-                            },
-                            "content": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
                                 },
                                 {
                                   "type": "object",
-                                  "properties": {},
-                                  "additionalProperties": {}
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["text", "file", "image"]
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
-                                  }
-                                }
-                              ]
-                            }
-                          },
-                          "required": ["role", "content"],
-                          "additionalProperties": false
-                        }
-                      }
-                    ]
-                  },
-                  "input_files": {
-                    "type": "array",
-                    "items": {
-                      "type": "string"
-                    }
-                  },
-                  "expected_output": {
-                    "anyOf": [
-                      {
-                        "type": "string"
-                      },
-                      {
-                        "type": "object",
-                        "properties": {},
-                        "additionalProperties": {}
-                      },
-                      {
-                        "type": "array",
-                        "items": {
-                          "type": "object",
-                          "properties": {
-                            "role": {
-                              "type": "string",
-                              "enum": ["system", "user", "assistant", "tool"]
-                            },
-                            "content": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["token-usage", "token_usage"]
+                                    },
+                                    "max_total": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_input": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_output": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
                                 },
                                 {
                                   "type": "object",
-                                  "properties": {},
-                                  "additionalProperties": {}
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["text", "file", "image"]
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
-                                  }
-                                }
-                              ]
-                            }
-                          },
-                          "required": ["role", "content"],
-                          "additionalProperties": false
-                        }
-                      }
-                    ]
-                  },
-                  "assertions": {
-                    "type": "array",
-                    "items": {
-                      "anyOf": [
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["code-grader", "code_grader"]
-                            },
-                            "command": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            },
-                            "script": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            },
-                            "cwd": {
-                              "type": "string"
-                            },
-                            "target": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["execution-metrics", "execution_metrics"]
+                                    },
+                                    "max_tool_calls": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_llm_calls": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_tokens": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_cost_usd": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_duration_ms": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "target_exploration_ratio": {
+                                      "type": "number",
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "exploration_tolerance": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
                                 },
                                 {
                                   "type": "object",
                                   "properties": {
-                                    "max_calls": {
-                                      "type": "number"
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "contains"
+                                    },
+                                    "value": {
+                                      "type": "string"
                                     }
                                   },
+                                  "required": ["type", "value"],
                                   "additionalProperties": false
-                                }
-                              ]
-                            },
-                            "config": {
-                              "type": "object",
-                              "additionalProperties": {}
-                            },
-                            "preprocessors": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "type": {
-                                    "type": "string",
-                                    "minLength": 1
-                                  },
-                                  "command": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string"
-                                      },
-                                      {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    ]
-                                  }
-                                },
-                                "required": ["type", "command"],
-                                "additionalProperties": false
-                              }
-                            }
-                          },
-                          "required": ["type", "command"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["llm-grader", "llm_grader"]
-                            },
-                            "prompt": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
                                 },
                                 {
                                   "type": "object",
                                   "properties": {
-                                    "command": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
                                       "anyOf": [
                                         {
-                                          "type": "string"
+                                          "type": "boolean"
                                         },
                                         {
-                                          "type": "array",
-                                          "items": {
-                                            "type": "string"
-                                          }
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
                                         }
                                       ]
                                     },
-                                    "script": {
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "regex"
+                                    },
+                                    "value": {
+                                      "type": "string"
+                                    }
+                                  },
+                                  "required": ["type", "value"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
                                       "anyOf": [
                                         {
-                                          "type": "string"
+                                          "type": "boolean"
                                         },
                                         {
-                                          "type": "array",
-                                          "items": {
-                                            "type": "string"
-                                          }
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
                                         }
                                       ]
                                     },
-                                    "config": {
-                                      "type": "object",
-                                      "additionalProperties": {}
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["is-json", "is_json"]
                                     }
                                   },
+                                  "required": ["type"],
                                   "additionalProperties": false
-                                }
-                              ]
-                            },
-                            "rubrics": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "id": {
-                                    "type": "string"
-                                  },
-                                  "outcome": {
-                                    "type": "string"
-                                  },
-                                  "operator": {
-                                    "type": "string",
-                                    "enum": ["correctness", "contradiction"]
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "min_score": {
-                                    "type": "number",
-                                    "exclusiveMinimum": true,
-                                    "minimum": 0,
-                                    "maximum": 1
-                                  },
-                                  "score_ranges": {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "object",
-                                      "properties": {
-                                        "score_range": {
-                                          "type": "array",
-                                          "minItems": 2,
-                                          "maxItems": 2,
-                                          "items": [
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            },
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            }
-                                          ]
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
                                         },
-                                        "outcome": {
-                                          "type": "string",
-                                          "minLength": 1
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
                                         }
-                                      },
-                                      "required": ["score_range", "outcome"],
-                                      "additionalProperties": false
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "equals"
+                                    },
+                                    "value": {
+                                      "type": "string"
                                     }
-                                  }
-                                },
-                                "additionalProperties": false
-                              }
-                            },
-                            "model": {
-                              "type": "string"
-                            },
-                            "target": {
-                              "type": "string"
-                            },
-                            "config": {
-                              "type": "object",
-                              "additionalProperties": {}
-                            },
-                            "max_steps": {
-                              "type": "integer",
-                              "minimum": 1,
-                              "maximum": 50
-                            },
-                            "temperature": {
-                              "type": "number",
-                              "minimum": 0,
-                              "maximum": 2
-                            },
-                            "preprocessors": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "type": {
-                                    "type": "string",
-                                    "minLength": 1
                                   },
-                                  "command": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string"
-                                      },
-                                      {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    ]
-                                  }
-                                },
-                                "required": ["type", "command"],
-                                "additionalProperties": false
-                              }
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "include": {
-                              "type": "string",
-                              "minLength": 1
-                            }
-                          },
-                          "required": ["include"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                  "required": ["type", "value"],
+                                  "additionalProperties": false
                                 },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "composite"
-                            },
-                            "assertions": {
-                              "type": "array",
-                              "items": {}
-                            },
-                            "evaluators": {
-                              "type": "array",
-                              "items": {}
-                            },
-                            "aggregator": {
-                              "anyOf": [
                                 {
                                   "type": "object",
                                   "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "weighted_average"
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    "weights": {
-                                      "type": "object",
-                                      "additionalProperties": {
-                                        "type": "number"
-                                      }
-                                    }
-                                  },
-                                  "required": ["type"],
-                                  "additionalProperties": false
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "threshold"
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
                                     },
-                                    "threshold": {
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
                                       "type": "number",
+                                      "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
-                                    }
-                                  },
-                                  "required": ["type", "threshold"],
-                                  "additionalProperties": false
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "code-grader"
                                     },
-                                    "path": {
-                                      "type": "string"
+                                    "negate": {
+                                      "type": "boolean"
                                     },
-                                    "cwd": {
-                                      "type": "string"
-                                    }
-                                  },
-                                  "required": ["type", "path"],
-                                  "additionalProperties": false
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
                                     "type": {
                                       "type": "string",
-                                      "const": "llm-grader"
+                                      "const": "rubrics"
                                     },
-                                    "prompt": {
-                                      "type": "string"
-                                    },
-                                    "model": {
-                                      "type": "string"
+                                    "criteria": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "id": {
+                                            "type": "string"
+                                          },
+                                          "outcome": {
+                                            "type": "string"
+                                          },
+                                          "operator": {
+                                            "type": "string",
+                                            "enum": ["correctness", "contradiction"]
+                                          },
+                                          "weight": {
+                                            "type": "number"
+                                          },
+                                          "required": {
+                                            "type": "boolean"
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "score_ranges": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "score_range": {
+                                                  "type": "array",
+                                                  "minItems": 2,
+                                                  "maxItems": 2,
+                                                  "items": [
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    },
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    }
+                                                  ]
+                                                },
+                                                "outcome": {
+                                                  "type": "string",
+                                                  "minLength": 1
+                                                }
+                                              },
+                                              "required": ["score_range", "outcome"],
+                                              "additionalProperties": false
+                                            }
+                                          }
+                                        },
+                                        "additionalProperties": false
+                                      },
+                                      "minItems": 1
                                     }
                                   },
-                                  "required": ["type"],
+                                  "required": ["type", "criteria"],
                                   "additionalProperties": false
                                 }
                               ]
                             }
                           },
-                          "required": ["type", "aggregator"],
-                          "additionalProperties": false
+                          "skip_defaults": {
+                            "type": "boolean"
+                          },
+                          "cache": {
+                            "type": "boolean"
+                          },
+                          "trials": {
+                            "not": {}
+                          },
+                          "budget_usd": {
+                            "type": "number",
+                            "minimum": 0
+                          },
+                          "budgetUsd": {
+                            "type": "number",
+                            "minimum": 0
+                          },
+                          "fail_on_error": {
+                            "type": "boolean"
+                          },
+                          "failOnError": {
+                            "type": "boolean"
+                          },
+                          "threshold": {
+                            "type": "number",
+                            "minimum": 0,
+                            "maximum": 1
+                          }
                         },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["tool-trajectory", "tool_trajectory"]
-                            },
-                            "mode": {
-                              "type": "string",
-                              "enum": ["any_order", "in_order", "exact", "subset", "superset"]
-                            },
-                            "minimums": {
-                              "type": "object",
-                              "additionalProperties": {
+                        "additionalProperties": false
+                      },
+                      "run": {
+                        "type": "object",
+                        "properties": {
+                          "threshold": {
+                            "type": "number",
+                            "minimum": 0,
+                            "maximum": 1
+                          },
+                          "repeat": {
+                            "type": "object",
+                            "properties": {
+                              "count": {
                                 "type": "integer",
+                                "minimum": 1
+                              },
+                              "strategy": {
+                                "type": "string",
+                                "enum": ["pass_at_k", "pass_all", "mean", "confidence_interval"]
+                              },
+                              "cost_limit_usd": {
+                                "type": "number",
+                                "minimum": 0
+                              },
+                              "costLimitUsd": {
+                                "type": "number",
                                 "minimum": 0
                               }
                             },
-                            "expected": {
-                              "type": "array",
-                              "items": {
+                            "required": ["count"],
+                            "additionalProperties": false
+                          },
+                          "timeout_seconds": {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0
+                          },
+                          "budget_usd": {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0
+                          }
+                        },
+                        "additionalProperties": false
+                      },
+                      "workspace": {
+                        "type": "object",
+                        "properties": {
+                          "template": {
+                            "type": "string"
+                          },
+                          "isolation": {
+                            "type": "string",
+                            "enum": ["shared", "per_test"]
+                          },
+                          "repos": {
+                            "type": "array",
+                            "items": {
+                              "type": "object",
+                              "properties": {
+                                "path": {
+                                  "type": "string"
+                                },
+                                "repo": {
+                                  "type": "string",
+                                  "minLength": 1
+                                },
+                                "commit": {
+                                  "type": "string",
+                                  "minLength": 1
+                                },
+                                "base_commit": {
+                                  "type": "string",
+                                  "minLength": 1
+                                },
+                                "ancestor": {
+                                  "type": "integer",
+                                  "minimum": 0
+                                },
+                                "sparse": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              },
+                              "additionalProperties": false
+                            }
+                          },
+                          "hooks": {
+                            "type": "object",
+                            "properties": {
+                              "enabled": {
+                                "type": "boolean"
+                              },
+                              "before_all": {
                                 "type": "object",
                                 "properties": {
-                                  "tool": {
-                                    "type": "string"
-                                  },
-                                  "args": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string",
-                                        "const": "any"
-                                      },
-                                      {
-                                        "type": "object",
-                                        "additionalProperties": {}
-                                      }
-                                    ]
-                                  },
-                                  "max_duration_ms": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "maxDurationMs": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "args_match": {
+                                  "command": {
                                     "anyOf": [
                                       {
-                                        "type": "string",
-                                        "enum": ["exact", "ignore", "subset", "superset"]
+                                        "type": "string"
                                       },
                                       {
                                         "type": "array",
@@ -7431,11 +5292,10 @@
                                       }
                                     ]
                                   },
-                                  "argsMatch": {
+                                  "script": {
                                     "anyOf": [
                                       {
-                                        "type": "string",
-                                        "enum": ["exact", "ignore", "subset", "superset"]
+                                        "type": "string"
                                       },
                                       {
                                         "type": "array",
@@ -7444,1783 +5304,1981 @@
                                         }
                                       }
                                     ]
-                                  }
-                                },
-                                "required": ["tool"],
-                                "additionalProperties": false
-                              }
-                            },
-                            "args_match": {
-                              "anyOf": [
-                                {
-                                  "type": "string",
-                                  "enum": ["exact", "ignore", "subset", "superset"]
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
+                                  },
+                                  "timeout_ms": {
+                                    "type": "number"
+                                  },
+                                  "timeoutMs": {
+                                    "type": "number"
+                                  },
+                                  "cwd": {
                                     "type": "string"
+                                  },
+                                  "reset": {
+                                    "type": "string",
+                                    "enum": ["none", "fast", "strict"]
                                   }
-                                }
-                              ]
-                            },
-                            "argsMatch": {
-                              "anyOf": [
-                                {
-                                  "type": "string",
-                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
-                                {
-                                  "type": "array",
-                                  "items": {
+                                "additionalProperties": false
+                              },
+                              "before_each": {
+                                "type": "object",
+                                "properties": {
+                                  "command": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "script": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "timeout_ms": {
+                                    "type": "number"
+                                  },
+                                  "timeoutMs": {
+                                    "type": "number"
+                                  },
+                                  "cwd": {
                                     "type": "string"
+                                  },
+                                  "reset": {
+                                    "type": "string",
+                                    "enum": ["none", "fast", "strict"]
                                   }
-                                }
-                              ]
-                            }
-                          },
-                          "required": ["type", "mode"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
                                 },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["field-accuracy", "field_accuracy"]
-                            },
-                            "fields": {
-                              "type": "array",
-                              "items": {
+                                "additionalProperties": false
+                              },
+                              "after_each": {
                                 "type": "object",
                                 "properties": {
-                                  "path": {
+                                  "command": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "script": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "timeout_ms": {
+                                    "type": "number"
+                                  },
+                                  "timeoutMs": {
+                                    "type": "number"
+                                  },
+                                  "cwd": {
                                     "type": "string"
                                   },
-                                  "match": {
+                                  "reset": {
                                     "type": "string",
-                                    "enum": ["exact", "numeric_tolerance", "date"]
+                                    "enum": ["none", "fast", "strict"]
+                                  }
+                                },
+                                "additionalProperties": false
+                              },
+                              "after_all": {
+                                "type": "object",
+                                "properties": {
+                                  "command": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
                                   },
-                                  "required": {
-                                    "type": "boolean"
+                                  "script": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
                                   },
-                                  "weight": {
+                                  "timeout_ms": {
                                     "type": "number"
                                   },
-                                  "tolerance": {
-                                    "type": "number",
-                                    "minimum": 0
+                                  "timeoutMs": {
+                                    "type": "number"
                                   },
-                                  "relative": {
-                                    "type": "boolean"
+                                  "cwd": {
+                                    "type": "string"
                                   },
-                                  "formats": {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
+                                  "reset": {
+                                    "type": "string",
+                                    "enum": ["none", "fast", "strict"]
                                   }
                                 },
-                                "required": ["path", "match"],
                                 "additionalProperties": false
-                              },
-                              "minItems": 1
+                              }
                             },
-                            "aggregation": {
-                              "type": "string",
-                              "enum": ["weighted_average", "all_or_nothing"]
-                            }
+                            "additionalProperties": false
                           },
-                          "required": ["type", "fields"],
-                          "additionalProperties": false
+                          "mode": {
+                            "type": "string",
+                            "enum": ["pooled", "temp", "static"]
+                          },
+                          "path": {
+                            "type": "string"
+                          },
+                          "docker": {
+                            "type": "object",
+                            "properties": {
+                              "image": {
+                                "type": "string"
+                              },
+                              "timeout": {
+                                "type": "integer",
+                                "minimum": 1
+                              },
+                              "memory": {
+                                "type": "string"
+                              },
+                              "cpus": {
+                                "type": "number",
+                                "minimum": 0.1
+                              }
+                            },
+                            "required": ["image"],
+                            "additionalProperties": false
+                          }
                         },
-                        {
+                        "additionalProperties": false
+                      },
+                      "metadata": {
+                        "type": "object",
+                        "additionalProperties": {}
+                      },
+                      "conversation_id": {
+                        "type": "string"
+                      },
+                      "suite": {
+                        "type": "string"
+                      },
+                      "depends_on": {
+                        "type": "array",
+                        "items": {
+                          "type": "string"
+                        }
+                      },
+                      "on_dependency_failure": {
+                        "type": "string",
+                        "enum": ["skip", "fail", "run"]
+                      },
+                      "mode": {
+                        "type": "string",
+                        "enum": ["conversation"]
+                      },
+                      "turns": {
+                        "type": "array",
+                        "items": {
                           "type": "object",
                           "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
+                            "input": {
                               "anyOf": [
                                 {
-                                  "type": "boolean"
+                                  "type": "string"
                                 },
                                 {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {},
+                                      "additionalProperties": {}
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["text", "file", "image"]
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      }
+                                    }
+                                  ]
                                 }
                               ]
                             },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "latency"
-                            },
-                            "threshold": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type", "threshold"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "cost"
-                            },
-                            "budget": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type", "budget"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["token-usage", "token_usage"]
-                            },
-                            "max_total": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_input": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_output": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["execution-metrics", "execution_metrics"]
-                            },
-                            "max_tool_calls": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_llm_calls": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_tokens": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_cost_usd": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_duration_ms": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "target_exploration_ratio": {
-                              "type": "number",
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "exploration_tolerance": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "contains"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
+                            "expected_output": {
                               "anyOf": [
                                 {
-                                  "type": "boolean"
+                                  "type": "string"
                                 },
                                 {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {},
+                                      "additionalProperties": {}
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["text", "file", "image"]
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      }
+                                    }
+                                  ]
                                 }
                               ]
                             },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "regex"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["is-json", "is_json"]
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "equals"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "rubrics"
-                            },
-                            "criteria": {
+                            "assertions": {
                               "type": "array",
                               "items": {
-                                "type": "object",
-                                "properties": {
-                                  "id": {
-                                    "type": "string"
-                                  },
-                                  "outcome": {
+                                "anyOf": [
+                                  {
                                     "type": "string"
                                   },
-                                  "operator": {
-                                    "type": "string",
-                                    "enum": ["correctness", "contradiction"]
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "min_score": {
-                                    "type": "number",
-                                    "exclusiveMinimum": true,
-                                    "minimum": 0,
-                                    "maximum": 1
-                                  },
-                                  "score_ranges": {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "object",
-                                      "properties": {
-                                        "score_range": {
-                                          "type": "array",
-                                          "minItems": 2,
-                                          "maxItems": 2,
-                                          "items": [
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            },
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
+                                  {
+                                    "anyOf": [
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["code-grader", "code_grader"]
+                                          },
+                                          "command": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          },
+                                          "script": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          },
+                                          "cwd": {
+                                            "type": "string"
+                                          },
+                                          "target": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "object",
+                                                "properties": {
+                                                  "max_calls": {
+                                                    "type": "number"
+                                                  }
+                                                },
+                                                "additionalProperties": false
+                                              }
+                                            ]
+                                          },
+                                          "config": {
+                                            "type": "object",
+                                            "additionalProperties": {}
+                                          },
+                                          "preprocessors": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "type": {
+                                                  "type": "string",
+                                                  "minLength": 1
+                                                },
+                                                "command": {
+                                                  "anyOf": [
+                                                    {
+                                                      "type": "string"
+                                                    },
+                                                    {
+                                                      "type": "array",
+                                                      "items": {
+                                                        "type": "string"
+                                                      }
+                                                    }
+                                                  ]
+                                                }
+                                              },
+                                              "required": ["type", "command"],
+                                              "additionalProperties": false
                                             }
-                                          ]
+                                          }
                                         },
-                                        "outcome": {
-                                          "type": "string",
-                                          "minLength": 1
-                                        }
+                                        "required": ["type", "command"],
+                                        "additionalProperties": false
                                       },
-                                      "required": ["score_range", "outcome"],
-                                      "additionalProperties": false
-                                    }
-                                  }
-                                },
-                                "additionalProperties": false
-                              },
-                              "minItems": 1
-                            }
-                          },
-                          "required": ["type", "criteria"],
-                          "additionalProperties": false
-                        }
-                      ]
-                    }
-                  },
-                  "evaluators": {
-                    "type": "array",
-                    "items": {
-                      "anyOf": [
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["code-grader", "code_grader"]
-                            },
-                            "command": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            },
-                            "script": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            },
-                            "cwd": {
-                              "type": "string"
-                            },
-                            "target": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "max_calls": {
-                                      "type": "number"
-                                    }
-                                  },
-                                  "additionalProperties": false
-                                }
-                              ]
-                            },
-                            "config": {
-                              "type": "object",
-                              "additionalProperties": {}
-                            },
-                            "preprocessors": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "type": {
-                                    "type": "string",
-                                    "minLength": 1
-                                  },
-                                  "command": {
-                                    "anyOf": [
                                       {
-                                        "type": "string"
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["llm-grader", "llm_grader"]
+                                          },
+                                          "prompt": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "object",
+                                                "properties": {
+                                                  "command": {
+                                                    "anyOf": [
+                                                      {
+                                                        "type": "string"
+                                                      },
+                                                      {
+                                                        "type": "array",
+                                                        "items": {
+                                                          "type": "string"
+                                                        }
+                                                      }
+                                                    ]
+                                                  },
+                                                  "script": {
+                                                    "anyOf": [
+                                                      {
+                                                        "type": "string"
+                                                      },
+                                                      {
+                                                        "type": "array",
+                                                        "items": {
+                                                          "type": "string"
+                                                        }
+                                                      }
+                                                    ]
+                                                  },
+                                                  "config": {
+                                                    "type": "object",
+                                                    "additionalProperties": {}
+                                                  }
+                                                },
+                                                "additionalProperties": false
+                                              }
+                                            ]
+                                          },
+                                          "rubrics": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "id": {
+                                                  "type": "string"
+                                                },
+                                                "outcome": {
+                                                  "type": "string"
+                                                },
+                                                "operator": {
+                                                  "type": "string",
+                                                  "enum": ["correctness", "contradiction"]
+                                                },
+                                                "weight": {
+                                                  "type": "number"
+                                                },
+                                                "required": {
+                                                  "type": "boolean"
+                                                },
+                                                "min_score": {
+                                                  "type": "number",
+                                                  "exclusiveMinimum": true,
+                                                  "minimum": 0,
+                                                  "maximum": 1
+                                                },
+                                                "score_ranges": {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "object",
+                                                    "properties": {
+                                                      "score_range": {
+                                                        "type": "array",
+                                                        "minItems": 2,
+                                                        "maxItems": 2,
+                                                        "items": [
+                                                          {
+                                                            "type": "integer",
+                                                            "minimum": 0,
+                                                            "maximum": 10
+                                                          },
+                                                          {
+                                                            "type": "integer",
+                                                            "minimum": 0,
+                                                            "maximum": 10
+                                                          }
+                                                        ]
+                                                      },
+                                                      "outcome": {
+                                                        "type": "string",
+                                                        "minLength": 1
+                                                      }
+                                                    },
+                                                    "required": ["score_range", "outcome"],
+                                                    "additionalProperties": false
+                                                  }
+                                                }
+                                              },
+                                              "additionalProperties": false
+                                            }
+                                          },
+                                          "model": {
+                                            "type": "string"
+                                          },
+                                          "target": {
+                                            "type": "string"
+                                          },
+                                          "config": {
+                                            "type": "object",
+                                            "additionalProperties": {}
+                                          },
+                                          "max_steps": {
+                                            "type": "integer",
+                                            "minimum": 1,
+                                            "maximum": 50
+                                          },
+                                          "temperature": {
+                                            "type": "number",
+                                            "minimum": 0,
+                                            "maximum": 2
+                                          },
+                                          "preprocessors": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "type": {
+                                                  "type": "string",
+                                                  "minLength": 1
+                                                },
+                                                "command": {
+                                                  "anyOf": [
+                                                    {
+                                                      "type": "string"
+                                                    },
+                                                    {
+                                                      "type": "array",
+                                                      "items": {
+                                                        "type": "string"
+                                                      }
+                                                    }
+                                                  ]
+                                                }
+                                              },
+                                              "required": ["type", "command"],
+                                              "additionalProperties": false
+                                            }
+                                          }
+                                        },
+                                        "required": ["type"],
+                                        "additionalProperties": false
                                       },
                                       {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    ]
-                                  }
-                                },
-                                "required": ["type", "command"],
-                                "additionalProperties": false
-                              }
-                            }
-                          },
-                          "required": ["type", "command"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["llm-grader", "llm_grader"]
-                            },
-                            "prompt": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "command": {
-                                      "anyOf": [
-                                        {
-                                          "type": "string"
+                                        "type": "object",
+                                        "properties": {
+                                          "include": {
+                                            "type": "string",
+                                            "minLength": 1
+                                          }
                                         },
-                                        {
-                                          "type": "array",
-                                          "items": {
+                                        "required": ["include"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
                                             "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "composite"
+                                          },
+                                          "assertions": {
+                                            "type": "array",
+                                            "items": {}
+                                          },
+                                          "evaluators": {
+                                            "type": "array",
+                                            "items": {}
+                                          },
+                                          "aggregator": {
+                                            "anyOf": [
+                                              {
+                                                "type": "object",
+                                                "properties": {
+                                                  "type": {
+                                                    "type": "string",
+                                                    "const": "weighted_average"
+                                                  },
+                                                  "weights": {
+                                                    "type": "object",
+                                                    "additionalProperties": {
+                                                      "type": "number"
+                                                    }
+                                                  }
+                                                },
+                                                "required": ["type"],
+                                                "additionalProperties": false
+                                              },
+                                              {
+                                                "type": "object",
+                                                "properties": {
+                                                  "type": {
+                                                    "type": "string",
+                                                    "const": "threshold"
+                                                  },
+                                                  "threshold": {
+                                                    "type": "number",
+                                                    "minimum": 0,
+                                                    "maximum": 1
+                                                  }
+                                                },
+                                                "required": ["type", "threshold"],
+                                                "additionalProperties": false
+                                              },
+                                              {
+                                                "type": "object",
+                                                "properties": {
+                                                  "type": {
+                                                    "type": "string",
+                                                    "const": "code-grader"
+                                                  },
+                                                  "path": {
+                                                    "type": "string"
+                                                  },
+                                                  "cwd": {
+                                                    "type": "string"
+                                                  }
+                                                },
+                                                "required": ["type", "path"],
+                                                "additionalProperties": false
+                                              },
+                                              {
+                                                "type": "object",
+                                                "properties": {
+                                                  "type": {
+                                                    "type": "string",
+                                                    "const": "llm-grader"
+                                                  },
+                                                  "prompt": {
+                                                    "type": "string"
+                                                  },
+                                                  "model": {
+                                                    "type": "string"
+                                                  }
+                                                },
+                                                "required": ["type"],
+                                                "additionalProperties": false
+                                              }
+                                            ]
                                           }
-                                        }
-                                      ]
-                                    },
-                                    "script": {
-                                      "anyOf": [
-                                        {
-                                          "type": "string"
                                         },
-                                        {
-                                          "type": "array",
-                                          "items": {
+                                        "required": ["type", "aggregator"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
                                             "type": "string"
-                                          }
-                                        }
-                                      ]
-                                    },
-                                    "config": {
-                                      "type": "object",
-                                      "additionalProperties": {}
-                                    }
-                                  },
-                                  "additionalProperties": false
-                                }
-                              ]
-                            },
-                            "rubrics": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "id": {
-                                    "type": "string"
-                                  },
-                                  "outcome": {
-                                    "type": "string"
-                                  },
-                                  "operator": {
-                                    "type": "string",
-                                    "enum": ["correctness", "contradiction"]
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "min_score": {
-                                    "type": "number",
-                                    "exclusiveMinimum": true,
-                                    "minimum": 0,
-                                    "maximum": 1
-                                  },
-                                  "score_ranges": {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "object",
-                                      "properties": {
-                                        "score_range": {
-                                          "type": "array",
-                                          "minItems": 2,
-                                          "maxItems": 2,
-                                          "items": [
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            },
-                                            {
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["tool-trajectory", "tool_trajectory"]
+                                          },
+                                          "mode": {
+                                            "type": "string",
+                                            "enum": [
+                                              "any_order",
+                                              "in_order",
+                                              "exact",
+                                              "subset",
+                                              "superset"
+                                            ]
+                                          },
+                                          "minimums": {
+                                            "type": "object",
+                                            "additionalProperties": {
                                               "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
+                                              "minimum": 0
                                             }
-                                          ]
+                                          },
+                                          "expected": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "tool": {
+                                                  "type": "string"
+                                                },
+                                                "args": {
+                                                  "anyOf": [
+                                                    {
+                                                      "type": "string",
+                                                      "const": "any"
+                                                    },
+                                                    {
+                                                      "type": "object",
+                                                      "additionalProperties": {}
+                                                    }
+                                                  ]
+                                                },
+                                                "max_duration_ms": {
+                                                  "type": "number",
+                                                  "minimum": 0
+                                                },
+                                                "maxDurationMs": {
+                                                  "type": "number",
+                                                  "minimum": 0
+                                                },
+                                                "args_match": {
+                                                  "anyOf": [
+                                                    {
+                                                      "type": "string",
+                                                      "enum": [
+                                                        "exact",
+                                                        "ignore",
+                                                        "subset",
+                                                        "superset"
+                                                      ]
+                                                    },
+                                                    {
+                                                      "type": "array",
+                                                      "items": {
+                                                        "type": "string"
+                                                      }
+                                                    }
+                                                  ]
+                                                },
+                                                "argsMatch": {
+                                                  "anyOf": [
+                                                    {
+                                                      "type": "string",
+                                                      "enum": [
+                                                        "exact",
+                                                        "ignore",
+                                                        "subset",
+                                                        "superset"
+                                                      ]
+                                                    },
+                                                    {
+                                                      "type": "array",
+                                                      "items": {
+                                                        "type": "string"
+                                                      }
+                                                    }
+                                                  ]
+                                                }
+                                              },
+                                              "required": ["tool"],
+                                              "additionalProperties": false
+                                            }
+                                          },
+                                          "args_match": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "enum": ["exact", "ignore", "subset", "superset"]
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          },
+                                          "argsMatch": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "enum": ["exact", "ignore", "subset", "superset"]
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          }
                                         },
-                                        "outcome": {
-                                          "type": "string",
-                                          "minLength": 1
-                                        }
+                                        "required": ["type", "mode"],
+                                        "additionalProperties": false
                                       },
-                                      "required": ["score_range", "outcome"],
-                                      "additionalProperties": false
-                                    }
-                                  }
-                                },
-                                "additionalProperties": false
-                              }
-                            },
-                            "model": {
-                              "type": "string"
-                            },
-                            "target": {
-                              "type": "string"
-                            },
-                            "config": {
-                              "type": "object",
-                              "additionalProperties": {}
-                            },
-                            "max_steps": {
-                              "type": "integer",
-                              "minimum": 1,
-                              "maximum": 50
-                            },
-                            "temperature": {
-                              "type": "number",
-                              "minimum": 0,
-                              "maximum": 2
-                            },
-                            "preprocessors": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "type": {
-                                    "type": "string",
-                                    "minLength": 1
-                                  },
-                                  "command": {
-                                    "anyOf": [
                                       {
-                                        "type": "string"
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["field-accuracy", "field_accuracy"]
+                                          },
+                                          "fields": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "path": {
+                                                  "type": "string"
+                                                },
+                                                "match": {
+                                                  "type": "string",
+                                                  "enum": ["exact", "numeric_tolerance", "date"]
+                                                },
+                                                "required": {
+                                                  "type": "boolean"
+                                                },
+                                                "weight": {
+                                                  "type": "number"
+                                                },
+                                                "tolerance": {
+                                                  "type": "number",
+                                                  "minimum": 0
+                                                },
+                                                "relative": {
+                                                  "type": "boolean"
+                                                },
+                                                "formats": {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              },
+                                              "required": ["path", "match"],
+                                              "additionalProperties": false
+                                            },
+                                            "minItems": 1
+                                          },
+                                          "aggregation": {
+                                            "type": "string",
+                                            "enum": ["weighted_average", "all_or_nothing"]
+                                          }
+                                        },
+                                        "required": ["type", "fields"],
+                                        "additionalProperties": false
                                       },
                                       {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    ]
-                                  }
-                                },
-                                "required": ["type", "command"],
-                                "additionalProperties": false
-                              }
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "include": {
-                              "type": "string",
-                              "minLength": 1
-                            }
-                          },
-                          "required": ["include"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "composite"
-                            },
-                            "assertions": {
-                              "type": "array",
-                              "items": {}
-                            },
-                            "evaluators": {
-                              "type": "array",
-                              "items": {}
-                            },
-                            "aggregator": {
-                              "anyOf": [
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "weighted_average"
-                                    },
-                                    "weights": {
-                                      "type": "object",
-                                      "additionalProperties": {
-                                        "type": "number"
-                                      }
-                                    }
-                                  },
-                                  "required": ["type"],
-                                  "additionalProperties": false
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "threshold"
-                                    },
-                                    "threshold": {
-                                      "type": "number",
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  },
-                                  "required": ["type", "threshold"],
-                                  "additionalProperties": false
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "code-grader"
-                                    },
-                                    "path": {
-                                      "type": "string"
-                                    },
-                                    "cwd": {
-                                      "type": "string"
-                                    }
-                                  },
-                                  "required": ["type", "path"],
-                                  "additionalProperties": false
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "llm-grader"
-                                    },
-                                    "prompt": {
-                                      "type": "string"
-                                    },
-                                    "model": {
-                                      "type": "string"
-                                    }
-                                  },
-                                  "required": ["type"],
-                                  "additionalProperties": false
-                                }
-                              ]
-                            }
-                          },
-                          "required": ["type", "aggregator"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["tool-trajectory", "tool_trajectory"]
-                            },
-                            "mode": {
-                              "type": "string",
-                              "enum": ["any_order", "in_order", "exact", "subset", "superset"]
-                            },
-                            "minimums": {
-                              "type": "object",
-                              "additionalProperties": {
-                                "type": "integer",
-                                "minimum": 0
-                              }
-                            },
-                            "expected": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "tool": {
-                                    "type": "string"
-                                  },
-                                  "args": {
-                                    "anyOf": [
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "latency"
+                                          },
+                                          "threshold": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          }
+                                        },
+                                        "required": ["type", "threshold"],
+                                        "additionalProperties": false
+                                      },
                                       {
-                                        "type": "string",
-                                        "const": "any"
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "cost"
+                                          },
+                                          "budget": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          }
+                                        },
+                                        "required": ["type", "budget"],
+                                        "additionalProperties": false
                                       },
                                       {
                                         "type": "object",
-                                        "additionalProperties": {}
-                                      }
-                                    ]
-                                  },
-                                  "max_duration_ms": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "maxDurationMs": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "args_match": {
-                                    "anyOf": [
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["token-usage", "token_usage"]
+                                          },
+                                          "max_total": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "max_input": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "max_output": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          }
+                                        },
+                                        "required": ["type"],
+                                        "additionalProperties": false
+                                      },
                                       {
-                                        "type": "string",
-                                        "enum": ["exact", "ignore", "subset", "superset"]
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["execution-metrics", "execution_metrics"]
+                                          },
+                                          "max_tool_calls": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "max_llm_calls": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "max_tokens": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "max_cost_usd": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "max_duration_ms": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "target_exploration_ratio": {
+                                            "type": "number",
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "exploration_tolerance": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          }
+                                        },
+                                        "required": ["type"],
+                                        "additionalProperties": false
                                       },
                                       {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    ]
-                                  },
-                                  "argsMatch": {
-                                    "anyOf": [
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "contains"
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      },
                                       {
-                                        "type": "string",
-                                        "enum": ["exact", "ignore", "subset", "superset"]
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "regex"
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
                                       },
                                       {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["is-json", "is_json"]
+                                          }
+                                        },
+                                        "required": ["type"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "equals"
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "rubrics"
+                                          },
+                                          "criteria": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "id": {
+                                                  "type": "string"
+                                                },
+                                                "outcome": {
+                                                  "type": "string"
+                                                },
+                                                "operator": {
+                                                  "type": "string",
+                                                  "enum": ["correctness", "contradiction"]
+                                                },
+                                                "weight": {
+                                                  "type": "number"
+                                                },
+                                                "required": {
+                                                  "type": "boolean"
+                                                },
+                                                "min_score": {
+                                                  "type": "number",
+                                                  "exclusiveMinimum": true,
+                                                  "minimum": 0,
+                                                  "maximum": 1
+                                                },
+                                                "score_ranges": {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "object",
+                                                    "properties": {
+                                                      "score_range": {
+                                                        "type": "array",
+                                                        "minItems": 2,
+                                                        "maxItems": 2,
+                                                        "items": [
+                                                          {
+                                                            "type": "integer",
+                                                            "minimum": 0,
+                                                            "maximum": 10
+                                                          },
+                                                          {
+                                                            "type": "integer",
+                                                            "minimum": 0,
+                                                            "maximum": 10
+                                                          }
+                                                        ]
+                                                      },
+                                                      "outcome": {
+                                                        "type": "string",
+                                                        "minLength": 1
+                                                      }
+                                                    },
+                                                    "required": ["score_range", "outcome"],
+                                                    "additionalProperties": false
+                                                  }
+                                                }
+                                              },
+                                              "additionalProperties": false
+                                            },
+                                            "minItems": 1
+                                          }
+                                        },
+                                        "required": ["type", "criteria"],
+                                        "additionalProperties": false
                                       }
                                     ]
                                   }
-                                },
-                                "required": ["tool"],
-                                "additionalProperties": false
+                                ]
                               }
-                            },
-                            "args_match": {
-                              "anyOf": [
-                                {
-                                  "type": "string",
-                                  "enum": ["exact", "ignore", "subset", "superset"]
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            },
-                            "argsMatch": {
-                              "anyOf": [
-                                {
-                                  "type": "string",
-                                  "enum": ["exact", "ignore", "subset", "superset"]
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
                             }
                           },
-                          "required": ["type", "mode"],
+                          "required": ["input"],
                           "additionalProperties": false
                         },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                        "minItems": 1
+                      },
+                      "aggregation": {
+                        "type": "string",
+                        "enum": ["mean", "min", "max"]
+                      },
+                      "on_turn_failure": {
+                        "type": "string",
+                        "enum": ["continue", "stop"]
+                      },
+                      "window_size": {
+                        "type": "integer",
+                        "minimum": 1
+                      }
+                    },
+                    "required": ["id"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "include": {
+                        "type": "string",
+                        "minLength": 1
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["suite", "tests"]
+                      },
+                      "select": {
+                        "anyOf": [
+                          {
+                            "anyOf": [
+                              {
+                                "type": "string",
+                                "minLength": 1
+                              },
+                              {
+                                "type": "array",
+                                "items": {
+                                  "type": "string",
+                                  "minLength": 1
                                 },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["field-accuracy", "field_accuracy"]
-                            },
-                            "fields": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "path": {
-                                    "type": "string"
-                                  },
-                                  "match": {
+                                "minItems": 1
+                              }
+                            ]
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "test_ids": {
+                                "anyOf": [
+                                  {
                                     "type": "string",
-                                    "enum": ["exact", "numeric_tolerance", "date"]
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "tolerance": {
-                                    "type": "number",
-                                    "minimum": 0
+                                    "minLength": 1
                                   },
-                                  "relative": {
-                                    "type": "boolean"
+                                  {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "string",
+                                      "minLength": 1
+                                    },
+                                    "minItems": 1
+                                  }
+                                ]
+                              },
+                              "tags": {
+                                "anyOf": [
+                                  {
+                                    "type": "string",
+                                    "minLength": 1
                                   },
-                                  "formats": {
+                                  {
                                     "type": "array",
                                     "items": {
-                                      "type": "string"
-                                    }
+                                      "type": "string",
+                                      "minLength": 1
+                                    },
+                                    "minItems": 1
                                   }
-                                },
-                                "required": ["path", "match"],
-                                "additionalProperties": false
+                                ]
                               },
-                              "minItems": 1
+                              "metadata": {
+                                "type": "object",
+                                "additionalProperties": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "number"
+                                    },
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": ["string", "number", "boolean"]
+                                      },
+                                      "minItems": 1
+                                    }
+                                  ]
+                                }
+                              }
                             },
-                            "aggregation": {
-                              "type": "string",
-                              "enum": ["weighted_average", "all_or_nothing"]
-                            }
-                          },
-                          "required": ["type", "fields"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "latency"
+                            "additionalProperties": false
+                          }
+                        ]
+                      },
+                      "run": {
+                        "type": "object",
+                        "properties": {
+                          "threshold": {
+                            "type": "number",
+                            "minimum": 0,
+                            "maximum": 1
+                          },
+                          "repeat": {
+                            "type": "object",
+                            "properties": {
+                              "count": {
+                                "type": "integer",
+                                "minimum": 1
+                              },
+                              "strategy": {
+                                "type": "string",
+                                "enum": ["pass_at_k", "pass_all", "mean", "confidence_interval"]
+                              },
+                              "cost_limit_usd": {
+                                "type": "number",
+                                "minimum": 0
+                              },
+                              "costLimitUsd": {
+                                "type": "number",
+                                "minimum": 0
+                              }
                             },
-                            "threshold": {
-                              "type": "number",
-                              "minimum": 0
-                            }
+                            "required": ["count"],
+                            "additionalProperties": false
                           },
-                          "required": ["type", "threshold"],
-                          "additionalProperties": false
+                          "timeout_seconds": {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0
+                          },
+                          "budget_usd": {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0
+                          }
                         },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
+                        "additionalProperties": false
+                      }
+                    },
+                    "required": ["include", "type"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "string",
+                    "minLength": 1
+                  }
+                ]
+              }
+            },
+            {
+              "type": "string",
+              "minLength": 1
+            }
+          ]
+        },
+        "eval_cases": {
+          "anyOf": [
+            {
+              "type": "array",
+              "items": {
+                "anyOf": [
+                  {
+                    "type": "object",
+                    "properties": {
+                      "id": {
+                        "type": "string",
+                        "minLength": 1
+                      },
+                      "vars": {
+                        "type": "object",
+                        "properties": {},
+                        "additionalProperties": {}
+                      },
+                      "criteria": {
+                        "type": "string"
+                      },
+                      "input": {
+                        "anyOf": [
+                          {
+                            "type": "string"
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "role": {
+                                "type": "string",
+                                "enum": ["system", "user", "assistant", "tool"]
+                              },
+                              "content": {
+                                "anyOf": [
+                                  {
+                                    "type": "string"
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {},
+                                    "additionalProperties": {}
+                                  },
+                                  {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "enum": ["text", "file", "image"]
+                                        },
+                                        "value": {
+                                          "type": "string"
+                                        }
+                                      },
+                                      "required": ["type", "value"],
+                                      "additionalProperties": false
+                                    }
+                                  }
+                                ]
+                              }
                             },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                            "required": ["role", "content"],
+                            "additionalProperties": false
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "role": {
+                                "not": {}
+                              }
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            "additionalProperties": {}
+                          },
+                          {
+                            "type": "array",
+                            "items": {
+                              "type": "object",
+                              "properties": {
+                                "role": {
+                                  "type": "string",
+                                  "enum": ["system", "user", "assistant", "tool"]
                                 },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
+                                "content": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {},
+                                      "additionalProperties": {}
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["text", "file", "image"]
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      }
+                                    }
+                                  ]
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "cost"
-                            },
-                            "budget": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["role", "content"],
+                              "additionalProperties": false
                             }
+                          }
+                        ]
+                      },
+                      "input_files": {
+                        "type": "array",
+                        "items": {
+                          "type": "string"
+                        }
+                      },
+                      "expected_output": {
+                        "anyOf": [
+                          {
+                            "type": "string"
                           },
-                          "required": ["type", "budget"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                          {
+                            "type": "object",
+                            "properties": {},
+                            "additionalProperties": {}
+                          },
+                          {
+                            "type": "array",
+                            "items": {
+                              "type": "object",
+                              "properties": {
+                                "role": {
+                                  "type": "string",
+                                  "enum": ["system", "user", "assistant", "tool"]
                                 },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
+                                "content": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {},
+                                      "additionalProperties": {}
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["text", "file", "image"]
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      }
+                                    }
+                                  ]
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["token-usage", "token_usage"]
-                            },
-                            "max_total": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_input": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_output": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["role", "content"],
+                              "additionalProperties": false
                             }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                          }
+                        ]
+                      },
+                      "assertions": {
+                        "type": "array",
+                        "items": {
+                          "anyOf": [
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["code-grader", "code_grader"]
+                                },
+                                "command": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
+                                    }
+                                  ]
+                                },
+                                "script": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
+                                    }
+                                  ]
+                                },
+                                "cwd": {
+                                  "type": "string"
+                                },
+                                "target": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "max_calls": {
+                                          "type": "number"
+                                        }
+                                      },
+                                      "additionalProperties": false
+                                    }
+                                  ]
+                                },
+                                "config": {
+                                  "type": "object",
+                                  "additionalProperties": {}
+                                },
+                                "preprocessors": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "type": {
+                                        "type": "string",
+                                        "minLength": 1
+                                      },
+                                      "command": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string"
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      }
+                                    },
+                                    "required": ["type", "command"],
+                                    "additionalProperties": false
+                                  }
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["execution-metrics", "execution_metrics"]
-                            },
-                            "max_tool_calls": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_llm_calls": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_tokens": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_cost_usd": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_duration_ms": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "target_exploration_ratio": {
-                              "type": "number",
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "exploration_tolerance": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type", "command"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "contains"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "regex"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["is-json", "is_json"]
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
                                 },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "equals"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
+                                "negate": {
                                   "type": "boolean"
                                 },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "rubrics"
-                            },
-                            "criteria": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "id": {
-                                    "type": "string"
-                                  },
-                                  "outcome": {
-                                    "type": "string"
-                                  },
-                                  "operator": {
-                                    "type": "string",
-                                    "enum": ["correctness", "contradiction"]
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "min_score": {
-                                    "type": "number",
-                                    "exclusiveMinimum": true,
-                                    "minimum": 0,
-                                    "maximum": 1
-                                  },
-                                  "score_ranges": {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "object",
-                                      "properties": {
-                                        "score_range": {
-                                          "type": "array",
-                                          "minItems": 2,
-                                          "maxItems": 2,
-                                          "items": [
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            },
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            }
-                                          ]
-                                        },
-                                        "outcome": {
-                                          "type": "string",
-                                          "minLength": 1
-                                        }
-                                      },
-                                      "required": ["score_range", "outcome"],
-                                      "additionalProperties": false
-                                    }
-                                  }
-                                },
-                                "additionalProperties": false
-                              },
-                              "minItems": 1
-                            }
-                          },
-                          "required": ["type", "criteria"],
-                          "additionalProperties": false
-                        }
-                      ]
-                    }
-                  },
-                  "execution": {
-                    "type": "object",
-                    "properties": {
-                      "target": {
-                        "type": "string"
-                      },
-                      "targets": {
-                        "type": "array",
-                        "items": {
-                          "anyOf": [
-                            {
-                              "type": "string"
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
+                                "type": {
                                   "type": "string",
-                                  "minLength": 1
-                                },
-                                "use_target": {
-                                  "type": "string"
+                                  "enum": ["llm-grader", "llm_grader"]
                                 },
-                                "hooks": {
-                                  "type": "object",
-                                  "properties": {
-                                    "before_all": {
+                                "prompt": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
                                       "type": "object",
                                       "properties": {
                                         "command": {
@@ -9249,261 +7307,97 @@
                                             }
                                           ]
                                         },
-                                        "timeout_ms": {
-                                          "type": "number"
-                                        },
-                                        "timeoutMs": {
-                                          "type": "number"
-                                        },
-                                        "cwd": {
-                                          "type": "string"
-                                        },
-                                        "reset": {
-                                          "type": "string",
-                                          "enum": ["none", "fast", "strict"]
+                                        "config": {
+                                          "type": "object",
+                                          "additionalProperties": {}
                                         }
                                       },
                                       "additionalProperties": false
-                                    },
-                                    "before_each": {
-                                      "type": "object",
-                                      "properties": {
-                                        "command": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "script": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "timeout_ms": {
-                                          "type": "number"
-                                        },
-                                        "timeoutMs": {
-                                          "type": "number"
-                                        },
-                                        "cwd": {
-                                          "type": "string"
-                                        },
-                                        "reset": {
-                                          "type": "string",
-                                          "enum": ["none", "fast", "strict"]
-                                        }
+                                    }
+                                  ]
+                                },
+                                "rubrics": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "id": {
+                                        "type": "string"
                                       },
-                                      "additionalProperties": false
-                                    },
-                                    "after_each": {
-                                      "type": "object",
-                                      "properties": {
-                                        "command": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "script": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "timeout_ms": {
-                                          "type": "number"
-                                        },
-                                        "timeoutMs": {
-                                          "type": "number"
-                                        },
-                                        "cwd": {
-                                          "type": "string"
-                                        },
-                                        "reset": {
-                                          "type": "string",
-                                          "enum": ["none", "fast", "strict"]
-                                        }
+                                      "outcome": {
+                                        "type": "string"
                                       },
-                                      "additionalProperties": false
-                                    },
-                                    "after_all": {
-                                      "type": "object",
-                                      "properties": {
-                                        "command": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
+                                      "operator": {
+                                        "type": "string",
+                                        "enum": ["correctness", "contradiction"]
+                                      },
+                                      "weight": {
+                                        "type": "number"
+                                      },
+                                      "required": {
+                                        "type": "boolean"
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "score_ranges": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "score_range": {
                                               "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "script": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
+                                              "minItems": 2,
+                                              "maxItems": 2,
+                                              "items": [
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                },
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                }
+                                              ]
                                             },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
+                                            "outcome": {
+                                              "type": "string",
+                                              "minLength": 1
                                             }
-                                          ]
-                                        },
-                                        "timeout_ms": {
-                                          "type": "number"
-                                        },
-                                        "timeoutMs": {
-                                          "type": "number"
-                                        },
-                                        "cwd": {
-                                          "type": "string"
-                                        },
-                                        "reset": {
-                                          "type": "string",
-                                          "enum": ["none", "fast", "strict"]
+                                          },
+                                          "required": ["score_range", "outcome"],
+                                          "additionalProperties": false
                                         }
-                                      },
-                                      "additionalProperties": false
-                                    }
-                                  },
-                                  "additionalProperties": false
-                                }
-                              },
-                              "required": ["name"],
-                              "additionalProperties": false
-                            }
-                          ]
-                        }
-                      },
-                      "workers": {
-                        "type": "integer",
-                        "minimum": 1,
-                        "maximum": 50
-                      },
-                      "assertions": {
-                        "type": "array",
-                        "items": {
-                          "anyOf": [
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["code-grader", "code_grader"]
-                                },
-                                "command": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string"
-                                    },
-                                    {
-                                      "type": "array",
-                                      "items": {
-                                        "type": "string"
                                       }
-                                    }
-                                  ]
-                                },
-                                "script": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string"
                                     },
-                                    {
-                                      "type": "array",
-                                      "items": {
-                                        "type": "string"
-                                      }
-                                    }
-                                  ]
+                                    "additionalProperties": false
+                                  }
                                 },
-                                "cwd": {
+                                "model": {
                                   "type": "string"
                                 },
                                 "target": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "max_calls": {
-                                          "type": "number"
-                                        }
-                                      },
-                                      "additionalProperties": false
-                                    }
-                                  ]
+                                  "type": "string"
                                 },
                                 "config": {
                                   "type": "object",
                                   "additionalProperties": {}
                                 },
+                                "max_steps": {
+                                  "type": "integer",
+                                  "minimum": 1,
+                                  "maximum": 50
+                                },
+                                "temperature": {
+                                  "type": "number",
+                                  "minimum": 0,
+                                  "maximum": 2
+                                },
                                 "preprocessors": {
                                   "type": "array",
                                   "items": {
@@ -9532,7 +7426,18 @@
                                   }
                                 }
                               },
-                              "required": ["type", "command"],
+                              "required": ["type"],
+                              "additionalProperties": false
+                            },
+                            {
+                              "type": "object",
+                              "properties": {
+                                "include": {
+                                  "type": "string",
+                                  "minLength": 1
+                                }
+                              },
+                              "required": ["include"],
                               "additionalProperties": false
                             },
                             {
@@ -9569,210 +7474,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": ["llm-grader", "llm_grader"]
-                                },
-                                "prompt": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string"
-                                    },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "command": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "script": {
-                                          "anyOf": [
-                                            {
-                                              "type": "string"
-                                            },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "config": {
-                                          "type": "object",
-                                          "additionalProperties": {}
-                                        }
-                                      },
-                                      "additionalProperties": false
-                                    }
-                                  ]
-                                },
-                                "rubrics": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "id": {
-                                        "type": "string"
-                                      },
-                                      "outcome": {
-                                        "type": "string"
-                                      },
-                                      "operator": {
-                                        "type": "string",
-                                        "enum": ["correctness", "contradiction"]
-                                      },
-                                      "weight": {
-                                        "type": "number"
-                                      },
-                                      "required": {
-                                        "type": "boolean"
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "score_ranges": {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "object",
-                                          "properties": {
-                                            "score_range": {
-                                              "type": "array",
-                                              "minItems": 2,
-                                              "maxItems": 2,
-                                              "items": [
-                                                {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
-                                                },
-                                                {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
-                                                }
-                                              ]
-                                            },
-                                            "outcome": {
-                                              "type": "string",
-                                              "minLength": 1
-                                            }
-                                          },
-                                          "required": ["score_range", "outcome"],
-                                          "additionalProperties": false
-                                        }
-                                      }
-                                    },
-                                    "additionalProperties": false
-                                  }
-                                },
-                                "model": {
-                                  "type": "string"
-                                },
-                                "target": {
-                                  "type": "string"
-                                },
-                                "config": {
-                                  "type": "object",
-                                  "additionalProperties": {}
-                                },
-                                "max_steps": {
-                                  "type": "integer",
-                                  "minimum": 1,
-                                  "maximum": 50
-                                },
-                                "temperature": {
-                                  "type": "number",
-                                  "minimum": 0,
-                                  "maximum": 2
-                                },
-                                "preprocessors": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "minLength": 1
-                                      },
-                                      "command": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string"
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      }
-                                    },
-                                    "required": ["type", "command"],
-                                    "additionalProperties": false
-                                  }
-                                }
-                              },
-                              "required": ["type"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "include": {
-                                  "type": "string",
-                                  "minLength": 1
-                                }
-                              },
-                              "required": ["include"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "composite"
+                                  "const": "composite"
                                 },
                                 "assertions": {
                                   "type": "array",
@@ -11709,505 +9411,35 @@
                           ]
                         }
                       },
-                      "skip_defaults": {
-                        "type": "boolean"
-                      },
-                      "cache": {
-                        "type": "boolean"
-                      },
-                      "trials": {
-                        "not": {}
-                      },
-                      "budget_usd": {
-                        "type": "number",
-                        "minimum": 0
-                      },
-                      "budgetUsd": {
-                        "type": "number",
-                        "minimum": 0
-                      },
-                      "fail_on_error": {
-                        "type": "boolean"
-                      },
-                      "failOnError": {
-                        "type": "boolean"
-                      },
-                      "threshold": {
-                        "type": "number",
-                        "minimum": 0,
-                        "maximum": 1
-                      }
-                    },
-                    "additionalProperties": false
-                  },
-                  "workspace": {
-                    "type": "object",
-                    "properties": {
-                      "template": {
-                        "type": "string"
-                      },
-                      "isolation": {
-                        "type": "string",
-                        "enum": ["shared", "per_test"]
-                      },
-                      "repos": {
-                        "type": "array",
-                        "items": {
-                          "type": "object",
-                          "properties": {
-                            "path": {
-                              "type": "string"
-                            },
-                            "repo": {
-                              "type": "string",
-                              "minLength": 1
-                            },
-                            "commit": {
-                              "type": "string",
-                              "minLength": 1
-                            },
-                            "base_commit": {
-                              "type": "string",
-                              "minLength": 1
-                            },
-                            "ancestor": {
-                              "type": "integer",
-                              "minimum": 0
-                            },
-                            "sparse": {
-                              "type": "array",
-                              "items": {
-                                "type": "string"
-                              }
-                            }
-                          },
-                          "additionalProperties": false
-                        }
-                      },
-                      "hooks": {
-                        "type": "object",
-                        "properties": {
-                          "enabled": {
-                            "type": "boolean"
-                          },
-                          "before_all": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "script": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          },
-                          "before_each": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "script": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          },
-                          "after_each": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "script": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          },
-                          "after_all": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "script": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          }
-                        },
-                        "additionalProperties": false
-                      },
-                      "mode": {
-                        "type": "string",
-                        "enum": ["pooled", "temp", "static"]
-                      },
-                      "path": {
-                        "type": "string"
-                      },
-                      "docker": {
+                      "execution": {
                         "type": "object",
                         "properties": {
-                          "image": {
+                          "target": {
                             "type": "string"
                           },
-                          "timeout": {
-                            "type": "integer",
-                            "minimum": 1
-                          },
-                          "memory": {
-                            "type": "string"
-                          },
-                          "cpus": {
-                            "type": "number",
-                            "minimum": 0.1
-                          }
-                        },
-                        "required": ["image"],
-                        "additionalProperties": false
-                      }
-                    },
-                    "additionalProperties": false
-                  },
-                  "metadata": {
-                    "type": "object",
-                    "additionalProperties": {}
-                  },
-                  "conversation_id": {
-                    "type": "string"
-                  },
-                  "suite": {
-                    "type": "string"
-                  },
-                  "depends_on": {
-                    "type": "array",
-                    "items": {
-                      "type": "string"
-                    }
-                  },
-                  "on_dependency_failure": {
-                    "type": "string",
-                    "enum": ["skip", "fail", "run"]
-                  },
-                  "mode": {
-                    "type": "string",
-                    "enum": ["conversation"]
-                  },
-                  "turns": {
-                    "type": "array",
-                    "items": {
-                      "type": "object",
-                      "properties": {
-                        "input": {
-                          "anyOf": [
-                            {
-                              "type": "string"
-                            },
-                            {
+                          "targets": {
+                            "type": "array",
+                            "items": {
                               "anyOf": [
                                 {
                                   "type": "string"
                                 },
                                 {
                                   "type": "object",
-                                  "properties": {},
-                                  "additionalProperties": {}
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["text", "file", "image"]
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
+                                  "properties": {
+                                    "name": {
+                                      "type": "string",
+                                      "minLength": 1
                                     },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
-                                  }
-                                }
-                              ]
-                            }
-                          ]
-                        },
-                        "expected_output": {
-                          "anyOf": [
-                            {
-                              "type": "string"
-                            },
-                            {
-                              "anyOf": [
-                                {
-                                  "type": "string"
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {},
-                                  "additionalProperties": {}
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["text", "file", "image"]
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
+                                    "use_target": {
+                                      "type": "string"
                                     },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
-                                  }
-                                }
-                              ]
-                            }
-                          ]
-                        },
-                        "assertions": {
-                          "type": "array",
-                          "items": {
-                            "anyOf": [
-                              {
-                                "type": "string"
-                              },
-                              {
-                                "anyOf": [
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["code-grader", "code_grader"]
-                                      },
-                                      "command": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string"
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      },
-                                      "script": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string"
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      },
-                                      "cwd": {
-                                        "type": "string"
-                                      },
-                                      "target": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "object",
-                                            "properties": {
-                                              "max_calls": {
-                                                "type": "number"
-                                              }
-                                            },
-                                            "additionalProperties": false
-                                          }
-                                        ]
-                                      },
-                                      "config": {
-                                        "type": "object",
-                                        "additionalProperties": {}
-                                      },
-                                      "preprocessors": {
-                                        "type": "array",
-                                        "items": {
+                                    "hooks": {
+                                      "type": "object",
+                                      "properties": {
+                                        "before_all": {
                                           "type": "object",
                                           "properties": {
-                                            "type": {
-                                              "type": "string",
-                                              "minLength": 1
-                                            },
                                             "command": {
                                               "anyOf": [
                                                 {
@@ -12220,186 +9452,84 @@
                                                   }
                                                 }
                                               ]
-                                            }
-                                          },
-                                          "required": ["type", "command"],
-                                          "additionalProperties": false
-                                        }
-                                      }
-                                    },
-                                    "required": ["type", "command"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["llm-grader", "llm_grader"]
-                                      },
-                                      "prompt": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string"
-                                          },
-                                          {
-                                            "type": "object",
-                                            "properties": {
-                                              "command": {
-                                                "anyOf": [
-                                                  {
-                                                    "type": "string"
-                                                  },
-                                                  {
-                                                    "type": "array",
-                                                    "items": {
-                                                      "type": "string"
-                                                    }
-                                                  }
-                                                ]
-                                              },
-                                              "script": {
-                                                "anyOf": [
-                                                  {
+                                            },
+                                            "script": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
                                                     "type": "string"
-                                                  },
-                                                  {
-                                                    "type": "array",
-                                                    "items": {
-                                                      "type": "string"
-                                                    }
                                                   }
-                                                ]
-                                              },
-                                              "config": {
-                                                "type": "object",
-                                                "additionalProperties": {}
-                                              }
+                                                }
+                                              ]
                                             },
-                                            "additionalProperties": false
-                                          }
-                                        ]
-                                      },
-                                      "rubrics": {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "object",
-                                          "properties": {
-                                            "id": {
-                                              "type": "string"
+                                            "timeout_ms": {
+                                              "type": "number"
                                             },
-                                            "outcome": {
+                                            "timeoutMs": {
+                                              "type": "number"
+                                            },
+                                            "cwd": {
                                               "type": "string"
                                             },
-                                            "operator": {
+                                            "reset": {
                                               "type": "string",
-                                              "enum": ["correctness", "contradiction"]
+                                              "enum": ["none", "fast", "strict"]
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        },
+                                        "before_each": {
+                                          "type": "object",
+                                          "properties": {
+                                            "command": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "script": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
                                             },
-                                            "weight": {
+                                            "timeout_ms": {
                                               "type": "number"
                                             },
-                                            "required": {
-                                              "type": "boolean"
+                                            "timeoutMs": {
+                                              "type": "number"
                                             },
-                                            "min_score": {
-                                              "type": "number",
-                                              "exclusiveMinimum": true,
-                                              "minimum": 0,
-                                              "maximum": 1
+                                            "cwd": {
+                                              "type": "string"
                                             },
-                                            "score_ranges": {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "object",
-                                                "properties": {
-                                                  "score_range": {
-                                                    "type": "array",
-                                                    "minItems": 2,
-                                                    "maxItems": 2,
-                                                    "items": [
-                                                      {
-                                                        "type": "integer",
-                                                        "minimum": 0,
-                                                        "maximum": 10
-                                                      },
-                                                      {
-                                                        "type": "integer",
-                                                        "minimum": 0,
-                                                        "maximum": 10
-                                                      }
-                                                    ]
-                                                  },
-                                                  "outcome": {
-                                                    "type": "string",
-                                                    "minLength": 1
-                                                  }
-                                                },
-                                                "required": ["score_range", "outcome"],
-                                                "additionalProperties": false
-                                              }
+                                            "reset": {
+                                              "type": "string",
+                                              "enum": ["none", "fast", "strict"]
                                             }
                                           },
                                           "additionalProperties": false
-                                        }
-                                      },
-                                      "model": {
-                                        "type": "string"
-                                      },
-                                      "target": {
-                                        "type": "string"
-                                      },
-                                      "config": {
-                                        "type": "object",
-                                        "additionalProperties": {}
-                                      },
-                                      "max_steps": {
-                                        "type": "integer",
-                                        "minimum": 1,
-                                        "maximum": 50
-                                      },
-                                      "temperature": {
-                                        "type": "number",
-                                        "minimum": 0,
-                                        "maximum": 2
-                                      },
-                                      "preprocessors": {
-                                        "type": "array",
-                                        "items": {
+                                        },
+                                        "after_each": {
                                           "type": "object",
                                           "properties": {
-                                            "type": {
-                                              "type": "string",
-                                              "minLength": 1
-                                            },
                                             "command": {
                                               "anyOf": [
                                                 {
@@ -12412,232 +9542,43 @@
                                                   }
                                                 }
                                               ]
-                                            }
-                                          },
-                                          "required": ["type", "command"],
-                                          "additionalProperties": false
-                                        }
-                                      }
-                                    },
-                                    "required": ["type"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "include": {
-                                        "type": "string",
-                                        "minLength": 1
-                                      }
-                                    },
-                                    "required": ["include"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "composite"
-                                      },
-                                      "assertions": {
-                                        "type": "array",
-                                        "items": {}
-                                      },
-                                      "evaluators": {
-                                        "type": "array",
-                                        "items": {}
-                                      },
-                                      "aggregator": {
-                                        "anyOf": [
-                                          {
-                                            "type": "object",
-                                            "properties": {
-                                              "type": {
-                                                "type": "string",
-                                                "const": "weighted_average"
-                                              },
-                                              "weights": {
-                                                "type": "object",
-                                                "additionalProperties": {
-                                                  "type": "number"
-                                                }
-                                              }
-                                            },
-                                            "required": ["type"],
-                                            "additionalProperties": false
-                                          },
-                                          {
-                                            "type": "object",
-                                            "properties": {
-                                              "type": {
-                                                "type": "string",
-                                                "const": "threshold"
-                                              },
-                                              "threshold": {
-                                                "type": "number",
-                                                "minimum": 0,
-                                                "maximum": 1
-                                              }
-                                            },
-                                            "required": ["type", "threshold"],
-                                            "additionalProperties": false
-                                          },
-                                          {
-                                            "type": "object",
-                                            "properties": {
-                                              "type": {
-                                                "type": "string",
-                                                "const": "code-grader"
-                                              },
-                                              "path": {
-                                                "type": "string"
-                                              },
-                                              "cwd": {
-                                                "type": "string"
-                                              }
-                                            },
-                                            "required": ["type", "path"],
-                                            "additionalProperties": false
-                                          },
-                                          {
-                                            "type": "object",
-                                            "properties": {
-                                              "type": {
-                                                "type": "string",
-                                                "const": "llm-grader"
-                                              },
-                                              "prompt": {
-                                                "type": "string"
-                                              },
-                                              "model": {
-                                                "type": "string"
-                                              }
-                                            },
-                                            "required": ["type"],
-                                            "additionalProperties": false
-                                          }
-                                        ]
-                                      }
-                                    },
-                                    "required": ["type", "aggregator"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["tool-trajectory", "tool_trajectory"]
-                                      },
-                                      "mode": {
-                                        "type": "string",
-                                        "enum": [
-                                          "any_order",
-                                          "in_order",
-                                          "exact",
-                                          "subset",
-                                          "superset"
-                                        ]
-                                      },
-                                      "minimums": {
-                                        "type": "object",
-                                        "additionalProperties": {
-                                          "type": "integer",
-                                          "minimum": 0
-                                        }
-                                      },
-                                      "expected": {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "object",
-                                          "properties": {
-                                            "tool": {
-                                              "type": "string"
                                             },
-                                            "args": {
+                                            "script": {
                                               "anyOf": [
                                                 {
-                                                  "type": "string",
-                                                  "const": "any"
+                                                  "type": "string"
                                                 },
                                                 {
-                                                  "type": "object",
-                                                  "additionalProperties": {}
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
                                                 }
                                               ]
                                             },
-                                            "max_duration_ms": {
-                                              "type": "number",
-                                              "minimum": 0
+                                            "timeout_ms": {
+                                              "type": "number"
                                             },
-                                            "maxDurationMs": {
-                                              "type": "number",
-                                              "minimum": 0
+                                            "timeoutMs": {
+                                              "type": "number"
                                             },
-                                            "args_match": {
+                                            "cwd": {
+                                              "type": "string"
+                                            },
+                                            "reset": {
+                                              "type": "string",
+                                              "enum": ["none", "fast", "strict"]
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        },
+                                        "after_all": {
+                                          "type": "object",
+                                          "properties": {
+                                            "command": {
                                               "anyOf": [
                                                 {
-                                                  "type": "string",
-                                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                                  "type": "string"
                                                 },
                                                 {
                                                   "type": "array",
@@ -12647,11 +9588,10 @@
                                                 }
                                               ]
                                             },
-                                            "argsMatch": {
+                                            "script": {
                                               "anyOf": [
                                                 {
-                                                  "type": "string",
-                                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                                  "type": "string"
                                                 },
                                                 {
                                                   "type": "array",
@@ -12660,641 +9600,6563 @@
                                                   }
                                                 }
                                               ]
+                                            },
+                                            "timeout_ms": {
+                                              "type": "number"
+                                            },
+                                            "timeoutMs": {
+                                              "type": "number"
+                                            },
+                                            "cwd": {
+                                              "type": "string"
+                                            },
+                                            "reset": {
+                                              "type": "string",
+                                              "enum": ["none", "fast", "strict"]
                                             }
                                           },
-                                          "required": ["tool"],
                                           "additionalProperties": false
                                         }
                                       },
-                                      "args_match": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string",
-                                            "enum": ["exact", "ignore", "subset", "superset"]
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
-                                          }
-                                        ]
-                                      },
-                                      "argsMatch": {
-                                        "anyOf": [
-                                          {
-                                            "type": "string",
-                                            "enum": ["exact", "ignore", "subset", "superset"]
-                                          },
-                                          {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
+                                      "additionalProperties": false
+                                    }
+                                  },
+                                  "required": ["name"],
+                                  "additionalProperties": false
+                                }
+                              ]
+                            }
+                          },
+                          "workers": {
+                            "type": "integer",
+                            "minimum": 1,
+                            "maximum": 50
+                          },
+                          "assertions": {
+                            "type": "array",
+                            "items": {
+                              "anyOf": [
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["code-grader", "code_grader"]
+                                    },
+                                    "command": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
                                           }
-                                        ]
-                                      }
+                                        }
+                                      ]
                                     },
-                                    "required": ["type", "mode"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
+                                    "script": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
                                           }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["field-accuracy", "field_accuracy"]
-                                      },
-                                      "fields": {
-                                        "type": "array",
-                                        "items": {
+                                        }
+                                      ]
+                                    },
+                                    "cwd": {
+                                      "type": "string"
+                                    },
+                                    "target": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
                                           "type": "object",
                                           "properties": {
-                                            "path": {
-                                              "type": "string"
-                                            },
-                                            "match": {
-                                              "type": "string",
-                                              "enum": ["exact", "numeric_tolerance", "date"]
-                                            },
-                                            "required": {
-                                              "type": "boolean"
-                                            },
-                                            "weight": {
+                                            "max_calls": {
                                               "type": "number"
-                                            },
-                                            "tolerance": {
-                                              "type": "number",
-                                              "minimum": 0
-                                            },
-                                            "relative": {
-                                              "type": "boolean"
-                                            },
-                                            "formats": {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
                                             }
                                           },
-                                          "required": ["path", "match"],
                                           "additionalProperties": false
-                                        },
-                                        "minItems": 1
-                                      },
-                                      "aggregation": {
-                                        "type": "string",
-                                        "enum": ["weighted_average", "all_or_nothing"]
-                                      }
+                                        }
+                                      ]
                                     },
-                                    "required": ["type", "fields"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
+                                    "config": {
+                                      "type": "object",
+                                      "additionalProperties": {}
+                                    },
+                                    "preprocessors": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "minLength": 1
                                           },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
+                                          "command": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
                                           }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "latency"
-                                      },
-                                      "threshold": {
-                                        "type": "number",
-                                        "minimum": 0
+                                        },
+                                        "required": ["type", "command"],
+                                        "additionalProperties": false
                                       }
-                                    },
-                                    "required": ["type", "threshold"],
-                                    "additionalProperties": false
+                                    }
                                   },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
+                                  "required": ["type", "command"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["llm-grader", "llm_grader"]
+                                    },
+                                    "prompt": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "command": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "script": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "config": {
+                                              "type": "object",
+                                              "additionalProperties": {}
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        }
+                                      ]
+                                    },
+                                    "rubrics": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "id": {
+                                            "type": "string"
+                                          },
+                                          "outcome": {
+                                            "type": "string"
+                                          },
+                                          "operator": {
+                                            "type": "string",
+                                            "enum": ["correctness", "contradiction"]
+                                          },
+                                          "weight": {
+                                            "type": "number"
+                                          },
+                                          "required": {
                                             "type": "boolean"
                                           },
-                                          {
+                                          "min_score": {
                                             "type": "number",
                                             "exclusiveMinimum": true,
                                             "minimum": 0,
                                             "maximum": 1
+                                          },
+                                          "score_ranges": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "score_range": {
+                                                  "type": "array",
+                                                  "minItems": 2,
+                                                  "maxItems": 2,
+                                                  "items": [
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    },
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    }
+                                                  ]
+                                                },
+                                                "outcome": {
+                                                  "type": "string",
+                                                  "minLength": 1
+                                                }
+                                              },
+                                              "required": ["score_range", "outcome"],
+                                              "additionalProperties": false
+                                            }
                                           }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "cost"
-                                      },
-                                      "budget": {
-                                        "type": "number",
-                                        "minimum": 0
+                                        },
+                                        "additionalProperties": false
                                       }
                                     },
-                                    "required": ["type", "budget"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
+                                    "model": {
+                                      "type": "string"
+                                    },
+                                    "target": {
+                                      "type": "string"
+                                    },
+                                    "config": {
+                                      "type": "object",
+                                      "additionalProperties": {}
+                                    },
+                                    "max_steps": {
+                                      "type": "integer",
+                                      "minimum": 1,
+                                      "maximum": 50
+                                    },
+                                    "temperature": {
+                                      "type": "number",
+                                      "minimum": 0,
+                                      "maximum": 2
+                                    },
+                                    "preprocessors": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "minLength": 1
                                           },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
+                                          "command": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
                                           }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["token-usage", "token_usage"]
-                                      },
-                                      "max_total": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "max_input": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "max_output": {
-                                        "type": "number",
-                                        "minimum": 0
+                                        },
+                                        "required": ["type", "command"],
+                                        "additionalProperties": false
                                       }
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "include": {
+                                      "type": "string",
+                                      "minLength": 1
+                                    }
+                                  },
+                                  "required": ["include"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
                                     },
-                                    "required": ["type"],
-                                    "additionalProperties": false
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "composite"
+                                    },
+                                    "assertions": {
+                                      "type": "array",
+                                      "items": {}
+                                    },
+                                    "evaluators": {
+                                      "type": "array",
+                                      "items": {}
+                                    },
+                                    "aggregator": {
+                                      "anyOf": [
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "const": "weighted_average"
+                                            },
+                                            "weights": {
+                                              "type": "object",
+                                              "additionalProperties": {
+                                                "type": "number"
+                                              }
+                                            }
+                                          },
+                                          "required": ["type"],
+                                          "additionalProperties": false
+                                        },
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "const": "threshold"
+                                            },
+                                            "threshold": {
+                                              "type": "number",
+                                              "minimum": 0,
+                                              "maximum": 1
+                                            }
+                                          },
+                                          "required": ["type", "threshold"],
+                                          "additionalProperties": false
+                                        },
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "const": "code-grader"
+                                            },
+                                            "path": {
+                                              "type": "string"
+                                            },
+                                            "cwd": {
+                                              "type": "string"
+                                            }
+                                          },
+                                          "required": ["type", "path"],
+                                          "additionalProperties": false
+                                        },
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "const": "llm-grader"
+                                            },
+                                            "prompt": {
+                                              "type": "string"
+                                            },
+                                            "model": {
+                                              "type": "string"
+                                            }
+                                          },
+                                          "required": ["type"],
+                                          "additionalProperties": false
+                                        }
+                                      ]
+                                    }
                                   },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
+                                  "required": ["type", "aggregator"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["tool-trajectory", "tool_trajectory"]
+                                    },
+                                    "mode": {
+                                      "type": "string",
+                                      "enum": [
+                                        "any_order",
+                                        "in_order",
+                                        "exact",
+                                        "subset",
+                                        "superset"
+                                      ]
+                                    },
+                                    "minimums": {
+                                      "type": "object",
+                                      "additionalProperties": {
+                                        "type": "integer",
                                         "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
+                                      }
+                                    },
+                                    "expected": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "tool": {
+                                            "type": "string"
                                           },
-                                          {
+                                          "args": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "const": "any"
+                                              },
+                                              {
+                                                "type": "object",
+                                                "additionalProperties": {}
+                                              }
+                                            ]
+                                          },
+                                          "max_duration_ms": {
                                             "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
+                                            "minimum": 0
+                                          },
+                                          "maxDurationMs": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "args_match": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "enum": ["exact", "ignore", "subset", "superset"]
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          },
+                                          "argsMatch": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "enum": ["exact", "ignore", "subset", "superset"]
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
                                           }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["execution-metrics", "execution_metrics"]
-                                      },
-                                      "max_tool_calls": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "max_llm_calls": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "max_tokens": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "max_cost_usd": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "max_duration_ms": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "target_exploration_ratio": {
-                                        "type": "number",
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "exploration_tolerance": {
-                                        "type": "number",
-                                        "minimum": 0
+                                        },
+                                        "required": ["tool"],
+                                        "additionalProperties": false
                                       }
                                     },
-                                    "required": ["type"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
+                                    "args_match": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string",
+                                          "enum": ["exact", "ignore", "subset", "superset"]
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
                                           }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "contains"
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
+                                        }
+                                      ]
                                     },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
+                                    "argsMatch": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string",
+                                          "enum": ["exact", "ignore", "subset", "superset"]
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    }
                                   },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
+                                  "required": ["type", "mode"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["field-accuracy", "field_accuracy"]
+                                    },
+                                    "fields": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "path": {
+                                            "type": "string"
+                                          },
+                                          "match": {
+                                            "type": "string",
+                                            "enum": ["exact", "numeric_tolerance", "date"]
+                                          },
+                                          "required": {
                                             "type": "boolean"
                                           },
-                                          {
+                                          "weight": {
+                                            "type": "number"
+                                          },
+                                          "tolerance": {
                                             "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "regex"
-                                      },
-                                      "value": {
-                                        "type": "string"
-                                      }
-                                    },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
+                                            "minimum": 0
+                                          },
+                                          "relative": {
                                             "type": "boolean"
                                           },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
+                                          "formats": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
                                           }
-                                        ]
+                                        },
+                                        "required": ["path", "match"],
+                                        "additionalProperties": false
                                       },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
+                                      "minItems": 1
+                                    },
+                                    "aggregation": {
+                                      "type": "string",
+                                      "enum": ["weighted_average", "all_or_nothing"]
+                                    }
+                                  },
+                                  "required": ["type", "fields"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "latency"
+                                    },
+                                    "threshold": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    }
+                                  },
+                                  "required": ["type", "threshold"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "cost"
+                                    },
+                                    "budget": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    }
+                                  },
+                                  "required": ["type", "budget"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["token-usage", "token_usage"]
+                                    },
+                                    "max_total": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_input": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_output": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["execution-metrics", "execution_metrics"]
+                                    },
+                                    "max_tool_calls": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_llm_calls": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_tokens": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_cost_usd": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_duration_ms": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "target_exploration_ratio": {
+                                      "type": "number",
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "exploration_tolerance": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "contains"
+                                    },
+                                    "value": {
+                                      "type": "string"
+                                    }
+                                  },
+                                  "required": ["type", "value"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "regex"
+                                    },
+                                    "value": {
+                                      "type": "string"
+                                    }
+                                  },
+                                  "required": ["type", "value"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["is-json", "is_json"]
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "equals"
+                                    },
+                                    "value": {
+                                      "type": "string"
+                                    }
+                                  },
+                                  "required": ["type", "value"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "rubrics"
+                                    },
+                                    "criteria": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "id": {
+                                            "type": "string"
+                                          },
+                                          "outcome": {
+                                            "type": "string"
+                                          },
+                                          "operator": {
+                                            "type": "string",
+                                            "enum": ["correctness", "contradiction"]
+                                          },
+                                          "weight": {
+                                            "type": "number"
+                                          },
+                                          "required": {
+                                            "type": "boolean"
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "score_ranges": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "score_range": {
+                                                  "type": "array",
+                                                  "minItems": 2,
+                                                  "maxItems": 2,
+                                                  "items": [
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    },
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    }
+                                                  ]
+                                                },
+                                                "outcome": {
+                                                  "type": "string",
+                                                  "minLength": 1
+                                                }
+                                              },
+                                              "required": ["score_range", "outcome"],
+                                              "additionalProperties": false
+                                            }
+                                          }
+                                        },
+                                        "additionalProperties": false
+                                      },
+                                      "minItems": 1
+                                    }
+                                  },
+                                  "required": ["type", "criteria"],
+                                  "additionalProperties": false
+                                }
+                              ]
+                            }
+                          },
+                          "evaluators": {
+                            "type": "array",
+                            "items": {
+                              "anyOf": [
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["code-grader", "code_grader"]
+                                    },
+                                    "command": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    },
+                                    "script": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    },
+                                    "cwd": {
+                                      "type": "string"
+                                    },
+                                    "target": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "max_calls": {
+                                              "type": "number"
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        }
+                                      ]
+                                    },
+                                    "config": {
+                                      "type": "object",
+                                      "additionalProperties": {}
+                                    },
+                                    "preprocessors": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "minLength": 1
+                                          },
+                                          "command": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          }
+                                        },
+                                        "required": ["type", "command"],
+                                        "additionalProperties": false
+                                      }
+                                    }
+                                  },
+                                  "required": ["type", "command"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["llm-grader", "llm_grader"]
+                                    },
+                                    "prompt": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "command": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "script": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "config": {
+                                              "type": "object",
+                                              "additionalProperties": {}
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        }
+                                      ]
+                                    },
+                                    "rubrics": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "id": {
+                                            "type": "string"
+                                          },
+                                          "outcome": {
+                                            "type": "string"
+                                          },
+                                          "operator": {
+                                            "type": "string",
+                                            "enum": ["correctness", "contradiction"]
+                                          },
+                                          "weight": {
+                                            "type": "number"
+                                          },
+                                          "required": {
+                                            "type": "boolean"
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "score_ranges": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "score_range": {
+                                                  "type": "array",
+                                                  "minItems": 2,
+                                                  "maxItems": 2,
+                                                  "items": [
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    },
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    }
+                                                  ]
+                                                },
+                                                "outcome": {
+                                                  "type": "string",
+                                                  "minLength": 1
+                                                }
+                                              },
+                                              "required": ["score_range", "outcome"],
+                                              "additionalProperties": false
+                                            }
+                                          }
+                                        },
+                                        "additionalProperties": false
+                                      }
+                                    },
+                                    "model": {
+                                      "type": "string"
+                                    },
+                                    "target": {
+                                      "type": "string"
+                                    },
+                                    "config": {
+                                      "type": "object",
+                                      "additionalProperties": {}
+                                    },
+                                    "max_steps": {
+                                      "type": "integer",
+                                      "minimum": 1,
+                                      "maximum": 50
+                                    },
+                                    "temperature": {
+                                      "type": "number",
+                                      "minimum": 0,
+                                      "maximum": 2
+                                    },
+                                    "preprocessors": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "minLength": 1
+                                          },
+                                          "command": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          }
+                                        },
+                                        "required": ["type", "command"],
+                                        "additionalProperties": false
+                                      }
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "include": {
+                                      "type": "string",
+                                      "minLength": 1
+                                    }
+                                  },
+                                  "required": ["include"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "composite"
+                                    },
+                                    "assertions": {
+                                      "type": "array",
+                                      "items": {}
+                                    },
+                                    "evaluators": {
+                                      "type": "array",
+                                      "items": {}
+                                    },
+                                    "aggregator": {
+                                      "anyOf": [
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "const": "weighted_average"
+                                            },
+                                            "weights": {
+                                              "type": "object",
+                                              "additionalProperties": {
+                                                "type": "number"
+                                              }
+                                            }
+                                          },
+                                          "required": ["type"],
+                                          "additionalProperties": false
+                                        },
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "const": "threshold"
+                                            },
+                                            "threshold": {
+                                              "type": "number",
+                                              "minimum": 0,
+                                              "maximum": 1
+                                            }
+                                          },
+                                          "required": ["type", "threshold"],
+                                          "additionalProperties": false
+                                        },
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "const": "code-grader"
+                                            },
+                                            "path": {
+                                              "type": "string"
+                                            },
+                                            "cwd": {
+                                              "type": "string"
+                                            }
+                                          },
+                                          "required": ["type", "path"],
+                                          "additionalProperties": false
+                                        },
+                                        {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "const": "llm-grader"
+                                            },
+                                            "prompt": {
+                                              "type": "string"
+                                            },
+                                            "model": {
+                                              "type": "string"
+                                            }
+                                          },
+                                          "required": ["type"],
+                                          "additionalProperties": false
+                                        }
+                                      ]
+                                    }
+                                  },
+                                  "required": ["type", "aggregator"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["tool-trajectory", "tool_trajectory"]
+                                    },
+                                    "mode": {
+                                      "type": "string",
+                                      "enum": [
+                                        "any_order",
+                                        "in_order",
+                                        "exact",
+                                        "subset",
+                                        "superset"
+                                      ]
+                                    },
+                                    "minimums": {
+                                      "type": "object",
+                                      "additionalProperties": {
+                                        "type": "integer",
+                                        "minimum": 0
+                                      }
+                                    },
+                                    "expected": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "tool": {
+                                            "type": "string"
+                                          },
+                                          "args": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "const": "any"
+                                              },
+                                              {
+                                                "type": "object",
+                                                "additionalProperties": {}
+                                              }
+                                            ]
+                                          },
+                                          "max_duration_ms": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "maxDurationMs": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "args_match": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "enum": ["exact", "ignore", "subset", "superset"]
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          },
+                                          "argsMatch": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "enum": ["exact", "ignore", "subset", "superset"]
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          }
+                                        },
+                                        "required": ["tool"],
+                                        "additionalProperties": false
+                                      }
+                                    },
+                                    "args_match": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string",
+                                          "enum": ["exact", "ignore", "subset", "superset"]
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    },
+                                    "argsMatch": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string",
+                                          "enum": ["exact", "ignore", "subset", "superset"]
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    }
+                                  },
+                                  "required": ["type", "mode"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["field-accuracy", "field_accuracy"]
+                                    },
+                                    "fields": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "path": {
+                                            "type": "string"
+                                          },
+                                          "match": {
+                                            "type": "string",
+                                            "enum": ["exact", "numeric_tolerance", "date"]
+                                          },
+                                          "required": {
+                                            "type": "boolean"
+                                          },
+                                          "weight": {
+                                            "type": "number"
+                                          },
+                                          "tolerance": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "relative": {
+                                            "type": "boolean"
+                                          },
+                                          "formats": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        },
+                                        "required": ["path", "match"],
+                                        "additionalProperties": false
+                                      },
+                                      "minItems": 1
+                                    },
+                                    "aggregation": {
+                                      "type": "string",
+                                      "enum": ["weighted_average", "all_or_nothing"]
+                                    }
+                                  },
+                                  "required": ["type", "fields"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "latency"
+                                    },
+                                    "threshold": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    }
+                                  },
+                                  "required": ["type", "threshold"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "cost"
+                                    },
+                                    "budget": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    }
+                                  },
+                                  "required": ["type", "budget"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["token-usage", "token_usage"]
+                                    },
+                                    "max_total": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_input": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_output": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["execution-metrics", "execution_metrics"]
+                                    },
+                                    "max_tool_calls": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_llm_calls": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_tokens": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_cost_usd": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "max_duration_ms": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "target_exploration_ratio": {
+                                      "type": "number",
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "exploration_tolerance": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "contains"
+                                    },
+                                    "value": {
+                                      "type": "string"
+                                    }
+                                  },
+                                  "required": ["type", "value"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "regex"
+                                    },
+                                    "value": {
+                                      "type": "string"
+                                    }
+                                  },
+                                  "required": ["type", "value"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "enum": ["is-json", "is_json"]
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "equals"
+                                    },
+                                    "value": {
+                                      "type": "string"
+                                    }
+                                  },
+                                  "required": ["type", "value"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string"
+                                    },
+                                    "weight": {
+                                      "type": "number",
+                                      "minimum": 0
+                                    },
+                                    "required": {
+                                      "anyOf": [
+                                        {
+                                          "type": "boolean"
+                                        },
+                                        {
+                                          "type": "number",
+                                          "exclusiveMinimum": true,
+                                          "minimum": 0,
+                                          "maximum": 1
+                                        }
+                                      ]
+                                    },
+                                    "min_score": {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    },
+                                    "negate": {
+                                      "type": "boolean"
+                                    },
+                                    "type": {
+                                      "type": "string",
+                                      "const": "rubrics"
+                                    },
+                                    "criteria": {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "id": {
+                                            "type": "string"
+                                          },
+                                          "outcome": {
+                                            "type": "string"
+                                          },
+                                          "operator": {
+                                            "type": "string",
+                                            "enum": ["correctness", "contradiction"]
+                                          },
+                                          "weight": {
+                                            "type": "number"
+                                          },
+                                          "required": {
+                                            "type": "boolean"
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "score_ranges": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "score_range": {
+                                                  "type": "array",
+                                                  "minItems": 2,
+                                                  "maxItems": 2,
+                                                  "items": [
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    },
+                                                    {
+                                                      "type": "integer",
+                                                      "minimum": 0,
+                                                      "maximum": 10
+                                                    }
+                                                  ]
+                                                },
+                                                "outcome": {
+                                                  "type": "string",
+                                                  "minLength": 1
+                                                }
+                                              },
+                                              "required": ["score_range", "outcome"],
+                                              "additionalProperties": false
+                                            }
+                                          }
+                                        },
+                                        "additionalProperties": false
+                                      },
+                                      "minItems": 1
+                                    }
+                                  },
+                                  "required": ["type", "criteria"],
+                                  "additionalProperties": false
+                                }
+                              ]
+                            }
+                          },
+                          "skip_defaults": {
+                            "type": "boolean"
+                          },
+                          "cache": {
+                            "type": "boolean"
+                          },
+                          "trials": {
+                            "not": {}
+                          },
+                          "budget_usd": {
+                            "type": "number",
+                            "minimum": 0
+                          },
+                          "budgetUsd": {
+                            "type": "number",
+                            "minimum": 0
+                          },
+                          "fail_on_error": {
+                            "type": "boolean"
+                          },
+                          "failOnError": {
+                            "type": "boolean"
+                          },
+                          "threshold": {
+                            "type": "number",
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        },
+                        "additionalProperties": false
+                      },
+                      "run": {
+                        "type": "object",
+                        "properties": {
+                          "threshold": {
+                            "type": "number",
+                            "minimum": 0,
+                            "maximum": 1
+                          },
+                          "repeat": {
+                            "type": "object",
+                            "properties": {
+                              "count": {
+                                "type": "integer",
+                                "minimum": 1
+                              },
+                              "strategy": {
+                                "type": "string",
+                                "enum": ["pass_at_k", "pass_all", "mean", "confidence_interval"]
+                              },
+                              "cost_limit_usd": {
+                                "type": "number",
+                                "minimum": 0
+                              },
+                              "costLimitUsd": {
+                                "type": "number",
+                                "minimum": 0
+                              }
+                            },
+                            "required": ["count"],
+                            "additionalProperties": false
+                          },
+                          "timeout_seconds": {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0
+                          },
+                          "budget_usd": {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0
+                          }
+                        },
+                        "additionalProperties": false
+                      },
+                      "workspace": {
+                        "type": "object",
+                        "properties": {
+                          "template": {
+                            "type": "string"
+                          },
+                          "isolation": {
+                            "type": "string",
+                            "enum": ["shared", "per_test"]
+                          },
+                          "repos": {
+                            "type": "array",
+                            "items": {
+                              "type": "object",
+                              "properties": {
+                                "path": {
+                                  "type": "string"
+                                },
+                                "repo": {
+                                  "type": "string",
+                                  "minLength": 1
+                                },
+                                "commit": {
+                                  "type": "string",
+                                  "minLength": 1
+                                },
+                                "base_commit": {
+                                  "type": "string",
+                                  "minLength": 1
+                                },
+                                "ancestor": {
+                                  "type": "integer",
+                                  "minimum": 0
+                                },
+                                "sparse": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              },
+                              "additionalProperties": false
+                            }
+                          },
+                          "hooks": {
+                            "type": "object",
+                            "properties": {
+                              "enabled": {
+                                "type": "boolean"
+                              },
+                              "before_all": {
+                                "type": "object",
+                                "properties": {
+                                  "command": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "script": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "timeout_ms": {
+                                    "type": "number"
+                                  },
+                                  "timeoutMs": {
+                                    "type": "number"
+                                  },
+                                  "cwd": {
+                                    "type": "string"
+                                  },
+                                  "reset": {
+                                    "type": "string",
+                                    "enum": ["none", "fast", "strict"]
+                                  }
+                                },
+                                "additionalProperties": false
+                              },
+                              "before_each": {
+                                "type": "object",
+                                "properties": {
+                                  "command": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "script": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "timeout_ms": {
+                                    "type": "number"
+                                  },
+                                  "timeoutMs": {
+                                    "type": "number"
+                                  },
+                                  "cwd": {
+                                    "type": "string"
+                                  },
+                                  "reset": {
+                                    "type": "string",
+                                    "enum": ["none", "fast", "strict"]
+                                  }
+                                },
+                                "additionalProperties": false
+                              },
+                              "after_each": {
+                                "type": "object",
+                                "properties": {
+                                  "command": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "script": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "timeout_ms": {
+                                    "type": "number"
+                                  },
+                                  "timeoutMs": {
+                                    "type": "number"
+                                  },
+                                  "cwd": {
+                                    "type": "string"
+                                  },
+                                  "reset": {
+                                    "type": "string",
+                                    "enum": ["none", "fast", "strict"]
+                                  }
+                                },
+                                "additionalProperties": false
+                              },
+                              "after_all": {
+                                "type": "object",
+                                "properties": {
+                                  "command": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "script": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "timeout_ms": {
+                                    "type": "number"
+                                  },
+                                  "timeoutMs": {
+                                    "type": "number"
+                                  },
+                                  "cwd": {
+                                    "type": "string"
+                                  },
+                                  "reset": {
+                                    "type": "string",
+                                    "enum": ["none", "fast", "strict"]
+                                  }
+                                },
+                                "additionalProperties": false
+                              }
+                            },
+                            "additionalProperties": false
+                          },
+                          "mode": {
+                            "type": "string",
+                            "enum": ["pooled", "temp", "static"]
+                          },
+                          "path": {
+                            "type": "string"
+                          },
+                          "docker": {
+                            "type": "object",
+                            "properties": {
+                              "image": {
+                                "type": "string"
+                              },
+                              "timeout": {
+                                "type": "integer",
+                                "minimum": 1
+                              },
+                              "memory": {
+                                "type": "string"
+                              },
+                              "cpus": {
+                                "type": "number",
+                                "minimum": 0.1
+                              }
+                            },
+                            "required": ["image"],
+                            "additionalProperties": false
+                          }
+                        },
+                        "additionalProperties": false
+                      },
+                      "metadata": {
+                        "type": "object",
+                        "additionalProperties": {}
+                      },
+                      "conversation_id": {
+                        "type": "string"
+                      },
+                      "suite": {
+                        "type": "string"
+                      },
+                      "depends_on": {
+                        "type": "array",
+                        "items": {
+                          "type": "string"
+                        }
+                      },
+                      "on_dependency_failure": {
+                        "type": "string",
+                        "enum": ["skip", "fail", "run"]
+                      },
+                      "mode": {
+                        "type": "string",
+                        "enum": ["conversation"]
+                      },
+                      "turns": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "input": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {},
+                                      "additionalProperties": {}
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["text", "file", "image"]
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      }
+                                    }
+                                  ]
+                                }
+                              ]
+                            },
+                            "expected_output": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {},
+                                      "additionalProperties": {}
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["text", "file", "image"]
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      }
+                                    }
+                                  ]
+                                }
+                              ]
+                            },
+                            "assertions": {
+                              "type": "array",
+                              "items": {
+                                "anyOf": [
+                                  {
+                                    "type": "string"
+                                  },
+                                  {
+                                    "anyOf": [
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["code-grader", "code_grader"]
+                                          },
+                                          "command": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          },
+                                          "script": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          },
+                                          "cwd": {
+                                            "type": "string"
+                                          },
+                                          "target": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "object",
+                                                "properties": {
+                                                  "max_calls": {
+                                                    "type": "number"
+                                                  }
+                                                },
+                                                "additionalProperties": false
+                                              }
+                                            ]
+                                          },
+                                          "config": {
+                                            "type": "object",
+                                            "additionalProperties": {}
+                                          },
+                                          "preprocessors": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "type": {
+                                                  "type": "string",
+                                                  "minLength": 1
+                                                },
+                                                "command": {
+                                                  "anyOf": [
+                                                    {
+                                                      "type": "string"
+                                                    },
+                                                    {
+                                                      "type": "array",
+                                                      "items": {
+                                                        "type": "string"
+                                                      }
+                                                    }
+                                                  ]
+                                                }
+                                              },
+                                              "required": ["type", "command"],
+                                              "additionalProperties": false
+                                            }
+                                          }
+                                        },
+                                        "required": ["type", "command"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["llm-grader", "llm_grader"]
+                                          },
+                                          "prompt": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string"
+                                              },
+                                              {
+                                                "type": "object",
+                                                "properties": {
+                                                  "command": {
+                                                    "anyOf": [
+                                                      {
+                                                        "type": "string"
+                                                      },
+                                                      {
+                                                        "type": "array",
+                                                        "items": {
+                                                          "type": "string"
+                                                        }
+                                                      }
+                                                    ]
+                                                  },
+                                                  "script": {
+                                                    "anyOf": [
+                                                      {
+                                                        "type": "string"
+                                                      },
+                                                      {
+                                                        "type": "array",
+                                                        "items": {
+                                                          "type": "string"
+                                                        }
+                                                      }
+                                                    ]
+                                                  },
+                                                  "config": {
+                                                    "type": "object",
+                                                    "additionalProperties": {}
+                                                  }
+                                                },
+                                                "additionalProperties": false
+                                              }
+                                            ]
+                                          },
+                                          "rubrics": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "id": {
+                                                  "type": "string"
+                                                },
+                                                "outcome": {
+                                                  "type": "string"
+                                                },
+                                                "operator": {
+                                                  "type": "string",
+                                                  "enum": ["correctness", "contradiction"]
+                                                },
+                                                "weight": {
+                                                  "type": "number"
+                                                },
+                                                "required": {
+                                                  "type": "boolean"
+                                                },
+                                                "min_score": {
+                                                  "type": "number",
+                                                  "exclusiveMinimum": true,
+                                                  "minimum": 0,
+                                                  "maximum": 1
+                                                },
+                                                "score_ranges": {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "object",
+                                                    "properties": {
+                                                      "score_range": {
+                                                        "type": "array",
+                                                        "minItems": 2,
+                                                        "maxItems": 2,
+                                                        "items": [
+                                                          {
+                                                            "type": "integer",
+                                                            "minimum": 0,
+                                                            "maximum": 10
+                                                          },
+                                                          {
+                                                            "type": "integer",
+                                                            "minimum": 0,
+                                                            "maximum": 10
+                                                          }
+                                                        ]
+                                                      },
+                                                      "outcome": {
+                                                        "type": "string",
+                                                        "minLength": 1
+                                                      }
+                                                    },
+                                                    "required": ["score_range", "outcome"],
+                                                    "additionalProperties": false
+                                                  }
+                                                }
+                                              },
+                                              "additionalProperties": false
+                                            }
+                                          },
+                                          "model": {
+                                            "type": "string"
+                                          },
+                                          "target": {
+                                            "type": "string"
+                                          },
+                                          "config": {
+                                            "type": "object",
+                                            "additionalProperties": {}
+                                          },
+                                          "max_steps": {
+                                            "type": "integer",
+                                            "minimum": 1,
+                                            "maximum": 50
+                                          },
+                                          "temperature": {
+                                            "type": "number",
+                                            "minimum": 0,
+                                            "maximum": 2
+                                          },
+                                          "preprocessors": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "type": {
+                                                  "type": "string",
+                                                  "minLength": 1
+                                                },
+                                                "command": {
+                                                  "anyOf": [
+                                                    {
+                                                      "type": "string"
+                                                    },
+                                                    {
+                                                      "type": "array",
+                                                      "items": {
+                                                        "type": "string"
+                                                      }
+                                                    }
+                                                  ]
+                                                }
+                                              },
+                                              "required": ["type", "command"],
+                                              "additionalProperties": false
+                                            }
+                                          }
+                                        },
+                                        "required": ["type"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "include": {
+                                            "type": "string",
+                                            "minLength": 1
+                                          }
+                                        },
+                                        "required": ["include"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "composite"
+                                          },
+                                          "assertions": {
+                                            "type": "array",
+                                            "items": {}
+                                          },
+                                          "evaluators": {
+                                            "type": "array",
+                                            "items": {}
+                                          },
+                                          "aggregator": {
+                                            "anyOf": [
+                                              {
+                                                "type": "object",
+                                                "properties": {
+                                                  "type": {
+                                                    "type": "string",
+                                                    "const": "weighted_average"
+                                                  },
+                                                  "weights": {
+                                                    "type": "object",
+                                                    "additionalProperties": {
+                                                      "type": "number"
+                                                    }
+                                                  }
+                                                },
+                                                "required": ["type"],
+                                                "additionalProperties": false
+                                              },
+                                              {
+                                                "type": "object",
+                                                "properties": {
+                                                  "type": {
+                                                    "type": "string",
+                                                    "const": "threshold"
+                                                  },
+                                                  "threshold": {
+                                                    "type": "number",
+                                                    "minimum": 0,
+                                                    "maximum": 1
+                                                  }
+                                                },
+                                                "required": ["type", "threshold"],
+                                                "additionalProperties": false
+                                              },
+                                              {
+                                                "type": "object",
+                                                "properties": {
+                                                  "type": {
+                                                    "type": "string",
+                                                    "const": "code-grader"
+                                                  },
+                                                  "path": {
+                                                    "type": "string"
+                                                  },
+                                                  "cwd": {
+                                                    "type": "string"
+                                                  }
+                                                },
+                                                "required": ["type", "path"],
+                                                "additionalProperties": false
+                                              },
+                                              {
+                                                "type": "object",
+                                                "properties": {
+                                                  "type": {
+                                                    "type": "string",
+                                                    "const": "llm-grader"
+                                                  },
+                                                  "prompt": {
+                                                    "type": "string"
+                                                  },
+                                                  "model": {
+                                                    "type": "string"
+                                                  }
+                                                },
+                                                "required": ["type"],
+                                                "additionalProperties": false
+                                              }
+                                            ]
+                                          }
+                                        },
+                                        "required": ["type", "aggregator"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["tool-trajectory", "tool_trajectory"]
+                                          },
+                                          "mode": {
+                                            "type": "string",
+                                            "enum": [
+                                              "any_order",
+                                              "in_order",
+                                              "exact",
+                                              "subset",
+                                              "superset"
+                                            ]
+                                          },
+                                          "minimums": {
+                                            "type": "object",
+                                            "additionalProperties": {
+                                              "type": "integer",
+                                              "minimum": 0
+                                            }
+                                          },
+                                          "expected": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "tool": {
+                                                  "type": "string"
+                                                },
+                                                "args": {
+                                                  "anyOf": [
+                                                    {
+                                                      "type": "string",
+                                                      "const": "any"
+                                                    },
+                                                    {
+                                                      "type": "object",
+                                                      "additionalProperties": {}
+                                                    }
+                                                  ]
+                                                },
+                                                "max_duration_ms": {
+                                                  "type": "number",
+                                                  "minimum": 0
+                                                },
+                                                "maxDurationMs": {
+                                                  "type": "number",
+                                                  "minimum": 0
+                                                },
+                                                "args_match": {
+                                                  "anyOf": [
+                                                    {
+                                                      "type": "string",
+                                                      "enum": [
+                                                        "exact",
+                                                        "ignore",
+                                                        "subset",
+                                                        "superset"
+                                                      ]
+                                                    },
+                                                    {
+                                                      "type": "array",
+                                                      "items": {
+                                                        "type": "string"
+                                                      }
+                                                    }
+                                                  ]
+                                                },
+                                                "argsMatch": {
+                                                  "anyOf": [
+                                                    {
+                                                      "type": "string",
+                                                      "enum": [
+                                                        "exact",
+                                                        "ignore",
+                                                        "subset",
+                                                        "superset"
+                                                      ]
+                                                    },
+                                                    {
+                                                      "type": "array",
+                                                      "items": {
+                                                        "type": "string"
+                                                      }
+                                                    }
+                                                  ]
+                                                }
+                                              },
+                                              "required": ["tool"],
+                                              "additionalProperties": false
+                                            }
+                                          },
+                                          "args_match": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "enum": ["exact", "ignore", "subset", "superset"]
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          },
+                                          "argsMatch": {
+                                            "anyOf": [
+                                              {
+                                                "type": "string",
+                                                "enum": ["exact", "ignore", "subset", "superset"]
+                                              },
+                                              {
+                                                "type": "array",
+                                                "items": {
+                                                  "type": "string"
+                                                }
+                                              }
+                                            ]
+                                          }
+                                        },
+                                        "required": ["type", "mode"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["field-accuracy", "field_accuracy"]
+                                          },
+                                          "fields": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "path": {
+                                                  "type": "string"
+                                                },
+                                                "match": {
+                                                  "type": "string",
+                                                  "enum": ["exact", "numeric_tolerance", "date"]
+                                                },
+                                                "required": {
+                                                  "type": "boolean"
+                                                },
+                                                "weight": {
+                                                  "type": "number"
+                                                },
+                                                "tolerance": {
+                                                  "type": "number",
+                                                  "minimum": 0
+                                                },
+                                                "relative": {
+                                                  "type": "boolean"
+                                                },
+                                                "formats": {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              },
+                                              "required": ["path", "match"],
+                                              "additionalProperties": false
+                                            },
+                                            "minItems": 1
+                                          },
+                                          "aggregation": {
+                                            "type": "string",
+                                            "enum": ["weighted_average", "all_or_nothing"]
+                                          }
+                                        },
+                                        "required": ["type", "fields"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "latency"
+                                          },
+                                          "threshold": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          }
+                                        },
+                                        "required": ["type", "threshold"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "cost"
+                                          },
+                                          "budget": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          }
+                                        },
+                                        "required": ["type", "budget"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["token-usage", "token_usage"]
+                                          },
+                                          "max_total": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "max_input": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "max_output": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          }
+                                        },
+                                        "required": ["type"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["execution-metrics", "execution_metrics"]
+                                          },
+                                          "max_tool_calls": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "max_llm_calls": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "max_tokens": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "max_cost_usd": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "max_duration_ms": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "target_exploration_ratio": {
+                                            "type": "number",
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "exploration_tolerance": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          }
+                                        },
+                                        "required": ["type"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "contains"
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "regex"
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "enum": ["is-json", "is_json"]
+                                          }
+                                        },
+                                        "required": ["type"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "equals"
+                                          },
+                                          "value": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "required": ["type", "value"],
+                                        "additionalProperties": false
+                                      },
+                                      {
+                                        "type": "object",
+                                        "properties": {
+                                          "name": {
+                                            "type": "string"
+                                          },
+                                          "weight": {
+                                            "type": "number",
+                                            "minimum": 0
+                                          },
+                                          "required": {
+                                            "anyOf": [
+                                              {
+                                                "type": "boolean"
+                                              },
+                                              {
+                                                "type": "number",
+                                                "exclusiveMinimum": true,
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            ]
+                                          },
+                                          "min_score": {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          },
+                                          "negate": {
+                                            "type": "boolean"
+                                          },
+                                          "type": {
+                                            "type": "string",
+                                            "const": "rubrics"
+                                          },
+                                          "criteria": {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "object",
+                                              "properties": {
+                                                "id": {
+                                                  "type": "string"
+                                                },
+                                                "outcome": {
+                                                  "type": "string"
+                                                },
+                                                "operator": {
+                                                  "type": "string",
+                                                  "enum": ["correctness", "contradiction"]
+                                                },
+                                                "weight": {
+                                                  "type": "number"
+                                                },
+                                                "required": {
+                                                  "type": "boolean"
+                                                },
+                                                "min_score": {
+                                                  "type": "number",
+                                                  "exclusiveMinimum": true,
+                                                  "minimum": 0,
+                                                  "maximum": 1
+                                                },
+                                                "score_ranges": {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "object",
+                                                    "properties": {
+                                                      "score_range": {
+                                                        "type": "array",
+                                                        "minItems": 2,
+                                                        "maxItems": 2,
+                                                        "items": [
+                                                          {
+                                                            "type": "integer",
+                                                            "minimum": 0,
+                                                            "maximum": 10
+                                                          },
+                                                          {
+                                                            "type": "integer",
+                                                            "minimum": 0,
+                                                            "maximum": 10
+                                                          }
+                                                        ]
+                                                      },
+                                                      "outcome": {
+                                                        "type": "string",
+                                                        "minLength": 1
+                                                      }
+                                                    },
+                                                    "required": ["score_range", "outcome"],
+                                                    "additionalProperties": false
+                                                  }
+                                                }
+                                              },
+                                              "additionalProperties": false
+                                            },
+                                            "minItems": 1
+                                          }
+                                        },
+                                        "required": ["type", "criteria"],
+                                        "additionalProperties": false
+                                      }
+                                    ]
+                                  }
+                                ]
+                              }
+                            }
+                          },
+                          "required": ["input"],
+                          "additionalProperties": false
+                        },
+                        "minItems": 1
+                      },
+                      "aggregation": {
+                        "type": "string",
+                        "enum": ["mean", "min", "max"]
+                      },
+                      "on_turn_failure": {
+                        "type": "string",
+                        "enum": ["continue", "stop"]
+                      },
+                      "window_size": {
+                        "type": "integer",
+                        "minimum": 1
+                      }
+                    },
+                    "required": ["id"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "include": {
+                        "type": "string",
+                        "minLength": 1
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["suite", "tests"]
+                      },
+                      "select": {
+                        "anyOf": [
+                          {
+                            "anyOf": [
+                              {
+                                "type": "string",
+                                "minLength": 1
+                              },
+                              {
+                                "type": "array",
+                                "items": {
+                                  "type": "string",
+                                  "minLength": 1
+                                },
+                                "minItems": 1
+                              }
+                            ]
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "test_ids": {
+                                "anyOf": [
+                                  {
+                                    "type": "string",
+                                    "minLength": 1
+                                  },
+                                  {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "string",
+                                      "minLength": 1
+                                    },
+                                    "minItems": 1
+                                  }
+                                ]
+                              },
+                              "tags": {
+                                "anyOf": [
+                                  {
+                                    "type": "string",
+                                    "minLength": 1
+                                  },
+                                  {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "string",
+                                      "minLength": 1
+                                    },
+                                    "minItems": 1
+                                  }
+                                ]
+                              },
+                              "metadata": {
+                                "type": "object",
+                                "additionalProperties": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "number"
+                                    },
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": ["string", "number", "boolean"]
+                                      },
+                                      "minItems": 1
+                                    }
+                                  ]
+                                }
+                              }
+                            },
+                            "additionalProperties": false
+                          }
+                        ]
+                      },
+                      "run": {
+                        "type": "object",
+                        "properties": {
+                          "threshold": {
+                            "type": "number",
+                            "minimum": 0,
+                            "maximum": 1
+                          },
+                          "repeat": {
+                            "type": "object",
+                            "properties": {
+                              "count": {
+                                "type": "integer",
+                                "minimum": 1
+                              },
+                              "strategy": {
+                                "type": "string",
+                                "enum": ["pass_at_k", "pass_all", "mean", "confidence_interval"]
+                              },
+                              "cost_limit_usd": {
+                                "type": "number",
+                                "minimum": 0
+                              },
+                              "costLimitUsd": {
+                                "type": "number",
+                                "minimum": 0
+                              }
+                            },
+                            "required": ["count"],
+                            "additionalProperties": false
+                          },
+                          "timeout_seconds": {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0
+                          },
+                          "budget_usd": {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0
+                          }
+                        },
+                        "additionalProperties": false
+                      }
+                    },
+                    "required": ["include", "type"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "string",
+                    "minLength": 1
+                  }
+                ]
+              }
+            },
+            {
+              "type": "string",
+              "minLength": 1
+            }
+          ]
+        },
+        "target": {
+          "type": "string"
+        },
+        "experiment": {
+          "type": "object",
+          "properties": {
+            "target": {
+              "type": "string"
+            },
+            "targets": {
+              "type": "array",
+              "items": {
+                "anyOf": [
+                  {
+                    "type": "string",
+                    "minLength": 1
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string",
+                        "minLength": 1
+                      },
+                      "use_target": {
+                        "type": "string",
+                        "minLength": 1
+                      },
+                      "hooks": {
+                        "type": "object",
+                        "properties": {},
+                        "additionalProperties": {}
+                      }
+                    },
+                    "required": ["name"],
+                    "additionalProperties": false
+                  }
+                ]
+              },
+              "minItems": 1
+            },
+            "workers": {
+              "type": "integer",
+              "minimum": 1,
+              "maximum": 50
+            },
+            "assertions": {
+              "type": "array",
+              "items": {
+                "anyOf": [
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["code-grader", "code_grader"]
+                      },
+                      "command": {
+                        "anyOf": [
+                          {
+                            "type": "string"
+                          },
+                          {
+                            "type": "array",
+                            "items": {
+                              "type": "string"
+                            }
+                          }
+                        ]
+                      },
+                      "script": {
+                        "anyOf": [
+                          {
+                            "type": "string"
+                          },
+                          {
+                            "type": "array",
+                            "items": {
+                              "type": "string"
+                            }
+                          }
+                        ]
+                      },
+                      "cwd": {
+                        "type": "string"
+                      },
+                      "target": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "max_calls": {
+                                "type": "number"
+                              }
+                            },
+                            "additionalProperties": false
+                          }
+                        ]
+                      },
+                      "config": {
+                        "type": "object",
+                        "additionalProperties": {}
+                      },
+                      "preprocessors": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "type": {
+                              "type": "string",
+                              "minLength": 1
+                            },
+                            "command": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            }
+                          },
+                          "required": ["type", "command"],
+                          "additionalProperties": false
+                        }
+                      }
+                    },
+                    "required": ["type", "command"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["llm-grader", "llm_grader"]
+                      },
+                      "prompt": {
+                        "anyOf": [
+                          {
+                            "type": "string"
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "command": {
+                                "anyOf": [
+                                  {
+                                    "type": "string"
+                                  },
+                                  {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "string"
+                                    }
+                                  }
+                                ]
+                              },
+                              "script": {
+                                "anyOf": [
+                                  {
+                                    "type": "string"
+                                  },
+                                  {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "string"
+                                    }
+                                  }
+                                ]
+                              },
+                              "config": {
+                                "type": "object",
+                                "additionalProperties": {}
+                              }
+                            },
+                            "additionalProperties": false
+                          }
+                        ]
+                      },
+                      "rubrics": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "id": {
+                              "type": "string"
+                            },
+                            "outcome": {
+                              "type": "string"
+                            },
+                            "operator": {
+                              "type": "string",
+                              "enum": ["correctness", "contradiction"]
+                            },
+                            "weight": {
+                              "type": "number"
+                            },
+                            "required": {
+                              "type": "boolean"
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "score_ranges": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "score_range": {
+                                    "type": "array",
+                                    "minItems": 2,
+                                    "maxItems": 2,
+                                    "items": [
+                                      {
+                                        "type": "integer",
+                                        "minimum": 0,
+                                        "maximum": 10
+                                      },
+                                      {
+                                        "type": "integer",
+                                        "minimum": 0,
+                                        "maximum": 10
+                                      }
+                                    ]
+                                  },
+                                  "outcome": {
+                                    "type": "string",
+                                    "minLength": 1
+                                  }
+                                },
+                                "required": ["score_range", "outcome"],
+                                "additionalProperties": false
+                              }
+                            }
+                          },
+                          "additionalProperties": false
+                        }
+                      },
+                      "model": {
+                        "type": "string"
+                      },
+                      "target": {
+                        "type": "string"
+                      },
+                      "config": {
+                        "type": "object",
+                        "additionalProperties": {}
+                      },
+                      "max_steps": {
+                        "type": "integer",
+                        "minimum": 1,
+                        "maximum": 50
+                      },
+                      "temperature": {
+                        "type": "number",
+                        "minimum": 0,
+                        "maximum": 2
+                      },
+                      "preprocessors": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "type": {
+                              "type": "string",
+                              "minLength": 1
+                            },
+                            "command": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            }
+                          },
+                          "required": ["type", "command"],
+                          "additionalProperties": false
+                        }
+                      }
+                    },
+                    "required": ["type"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "include": {
+                        "type": "string",
+                        "minLength": 1
+                      }
+                    },
+                    "required": ["include"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "composite"
+                      },
+                      "assertions": {
+                        "type": "array",
+                        "items": {}
+                      },
+                      "evaluators": {
+                        "type": "array",
+                        "items": {}
+                      },
+                      "aggregator": {
+                        "anyOf": [
+                          {
+                            "type": "object",
+                            "properties": {
+                              "type": {
+                                "type": "string",
+                                "const": "weighted_average"
+                              },
+                              "weights": {
+                                "type": "object",
+                                "additionalProperties": {
+                                  "type": "number"
+                                }
+                              }
+                            },
+                            "required": ["type"],
+                            "additionalProperties": false
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "type": {
+                                "type": "string",
+                                "const": "threshold"
+                              },
+                              "threshold": {
+                                "type": "number",
+                                "minimum": 0,
+                                "maximum": 1
+                              }
+                            },
+                            "required": ["type", "threshold"],
+                            "additionalProperties": false
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "type": {
+                                "type": "string",
+                                "const": "code-grader"
+                              },
+                              "path": {
+                                "type": "string"
+                              },
+                              "cwd": {
+                                "type": "string"
+                              }
+                            },
+                            "required": ["type", "path"],
+                            "additionalProperties": false
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "type": {
+                                "type": "string",
+                                "const": "llm-grader"
+                              },
+                              "prompt": {
+                                "type": "string"
+                              },
+                              "model": {
+                                "type": "string"
+                              }
+                            },
+                            "required": ["type"],
+                            "additionalProperties": false
+                          }
+                        ]
+                      }
+                    },
+                    "required": ["type", "aggregator"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["tool-trajectory", "tool_trajectory"]
+                      },
+                      "mode": {
+                        "type": "string",
+                        "enum": ["any_order", "in_order", "exact", "subset", "superset"]
+                      },
+                      "minimums": {
+                        "type": "object",
+                        "additionalProperties": {
+                          "type": "integer",
+                          "minimum": 0
+                        }
+                      },
+                      "expected": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "tool": {
+                              "type": "string"
+                            },
+                            "args": {
+                              "anyOf": [
+                                {
+                                  "type": "string",
+                                  "const": "any"
+                                },
+                                {
+                                  "type": "object",
+                                  "additionalProperties": {}
+                                }
+                              ]
+                            },
+                            "max_duration_ms": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "maxDurationMs": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "args_match": {
+                              "anyOf": [
+                                {
+                                  "type": "string",
+                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            },
+                            "argsMatch": {
+                              "anyOf": [
+                                {
+                                  "type": "string",
+                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            }
+                          },
+                          "required": ["tool"],
+                          "additionalProperties": false
+                        }
+                      },
+                      "args_match": {
+                        "anyOf": [
+                          {
+                            "type": "string",
+                            "enum": ["exact", "ignore", "subset", "superset"]
+                          },
+                          {
+                            "type": "array",
+                            "items": {
+                              "type": "string"
+                            }
+                          }
+                        ]
+                      },
+                      "argsMatch": {
+                        "anyOf": [
+                          {
+                            "type": "string",
+                            "enum": ["exact", "ignore", "subset", "superset"]
+                          },
+                          {
+                            "type": "array",
+                            "items": {
+                              "type": "string"
+                            }
+                          }
+                        ]
+                      }
+                    },
+                    "required": ["type", "mode"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["field-accuracy", "field_accuracy"]
+                      },
+                      "fields": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "path": {
+                              "type": "string"
+                            },
+                            "match": {
+                              "type": "string",
+                              "enum": ["exact", "numeric_tolerance", "date"]
+                            },
+                            "required": {
+                              "type": "boolean"
+                            },
+                            "weight": {
+                              "type": "number"
+                            },
+                            "tolerance": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "relative": {
+                              "type": "boolean"
+                            },
+                            "formats": {
+                              "type": "array",
+                              "items": {
+                                "type": "string"
+                              }
+                            }
+                          },
+                          "required": ["path", "match"],
+                          "additionalProperties": false
+                        },
+                        "minItems": 1
+                      },
+                      "aggregation": {
+                        "type": "string",
+                        "enum": ["weighted_average", "all_or_nothing"]
+                      }
+                    },
+                    "required": ["type", "fields"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "latency"
+                      },
+                      "threshold": {
+                        "type": "number",
+                        "minimum": 0
+                      }
+                    },
+                    "required": ["type", "threshold"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "cost"
+                      },
+                      "budget": {
+                        "type": "number",
+                        "minimum": 0
+                      }
+                    },
+                    "required": ["type", "budget"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["token-usage", "token_usage"]
+                      },
+                      "max_total": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "max_input": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "max_output": {
+                        "type": "number",
+                        "minimum": 0
+                      }
+                    },
+                    "required": ["type"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["execution-metrics", "execution_metrics"]
+                      },
+                      "max_tool_calls": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "max_llm_calls": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "max_tokens": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "max_cost_usd": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "max_duration_ms": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "target_exploration_ratio": {
+                        "type": "number",
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "exploration_tolerance": {
+                        "type": "number",
+                        "minimum": 0
+                      }
+                    },
+                    "required": ["type"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "contains"
+                      },
+                      "value": {
+                        "type": "string"
+                      }
+                    },
+                    "required": ["type", "value"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "regex"
+                      },
+                      "value": {
+                        "type": "string"
+                      }
+                    },
+                    "required": ["type", "value"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["is-json", "is_json"]
+                      }
+                    },
+                    "required": ["type"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "equals"
+                      },
+                      "value": {
+                        "type": "string"
+                      }
+                    },
+                    "required": ["type", "value"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "rubrics"
+                      },
+                      "criteria": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "id": {
+                              "type": "string"
+                            },
+                            "outcome": {
+                              "type": "string"
+                            },
+                            "operator": {
+                              "type": "string",
+                              "enum": ["correctness", "contradiction"]
+                            },
+                            "weight": {
+                              "type": "number"
+                            },
+                            "required": {
+                              "type": "boolean"
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "score_ranges": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "score_range": {
+                                    "type": "array",
+                                    "minItems": 2,
+                                    "maxItems": 2,
+                                    "items": [
+                                      {
+                                        "type": "integer",
+                                        "minimum": 0,
+                                        "maximum": 10
+                                      },
+                                      {
+                                        "type": "integer",
+                                        "minimum": 0,
+                                        "maximum": 10
+                                      }
+                                    ]
+                                  },
+                                  "outcome": {
+                                    "type": "string",
+                                    "minLength": 1
+                                  }
+                                },
+                                "required": ["score_range", "outcome"],
+                                "additionalProperties": false
+                              }
+                            }
+                          },
+                          "additionalProperties": false
+                        },
+                        "minItems": 1
+                      }
+                    },
+                    "required": ["type", "criteria"],
+                    "additionalProperties": false
+                  }
+                ]
+              }
+            },
+            "evaluators": {
+              "type": "array",
+              "items": {
+                "anyOf": [
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["code-grader", "code_grader"]
+                      },
+                      "command": {
+                        "anyOf": [
+                          {
+                            "type": "string"
+                          },
+                          {
+                            "type": "array",
+                            "items": {
+                              "type": "string"
+                            }
+                          }
+                        ]
+                      },
+                      "script": {
+                        "anyOf": [
+                          {
+                            "type": "string"
+                          },
+                          {
+                            "type": "array",
+                            "items": {
+                              "type": "string"
+                            }
+                          }
+                        ]
+                      },
+                      "cwd": {
+                        "type": "string"
+                      },
+                      "target": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "max_calls": {
+                                "type": "number"
+                              }
+                            },
+                            "additionalProperties": false
+                          }
+                        ]
+                      },
+                      "config": {
+                        "type": "object",
+                        "additionalProperties": {}
+                      },
+                      "preprocessors": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "type": {
+                              "type": "string",
+                              "minLength": 1
+                            },
+                            "command": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            }
+                          },
+                          "required": ["type", "command"],
+                          "additionalProperties": false
+                        }
+                      }
+                    },
+                    "required": ["type", "command"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["llm-grader", "llm_grader"]
+                      },
+                      "prompt": {
+                        "anyOf": [
+                          {
+                            "type": "string"
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "command": {
+                                "anyOf": [
+                                  {
+                                    "type": "string"
+                                  },
+                                  {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "string"
+                                    }
+                                  }
+                                ]
+                              },
+                              "script": {
+                                "anyOf": [
+                                  {
+                                    "type": "string"
+                                  },
+                                  {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "string"
+                                    }
+                                  }
+                                ]
+                              },
+                              "config": {
+                                "type": "object",
+                                "additionalProperties": {}
+                              }
+                            },
+                            "additionalProperties": false
+                          }
+                        ]
+                      },
+                      "rubrics": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "id": {
+                              "type": "string"
+                            },
+                            "outcome": {
+                              "type": "string"
+                            },
+                            "operator": {
+                              "type": "string",
+                              "enum": ["correctness", "contradiction"]
+                            },
+                            "weight": {
+                              "type": "number"
+                            },
+                            "required": {
+                              "type": "boolean"
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "score_ranges": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "score_range": {
+                                    "type": "array",
+                                    "minItems": 2,
+                                    "maxItems": 2,
+                                    "items": [
+                                      {
+                                        "type": "integer",
+                                        "minimum": 0,
+                                        "maximum": 10
+                                      },
+                                      {
+                                        "type": "integer",
+                                        "minimum": 0,
+                                        "maximum": 10
+                                      }
+                                    ]
+                                  },
+                                  "outcome": {
+                                    "type": "string",
+                                    "minLength": 1
+                                  }
+                                },
+                                "required": ["score_range", "outcome"],
+                                "additionalProperties": false
+                              }
+                            }
+                          },
+                          "additionalProperties": false
+                        }
+                      },
+                      "model": {
+                        "type": "string"
+                      },
+                      "target": {
+                        "type": "string"
+                      },
+                      "config": {
+                        "type": "object",
+                        "additionalProperties": {}
+                      },
+                      "max_steps": {
+                        "type": "integer",
+                        "minimum": 1,
+                        "maximum": 50
+                      },
+                      "temperature": {
+                        "type": "number",
+                        "minimum": 0,
+                        "maximum": 2
+                      },
+                      "preprocessors": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "type": {
+                              "type": "string",
+                              "minLength": 1
+                            },
+                            "command": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            }
+                          },
+                          "required": ["type", "command"],
+                          "additionalProperties": false
+                        }
+                      }
+                    },
+                    "required": ["type"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "include": {
+                        "type": "string",
+                        "minLength": 1
+                      }
+                    },
+                    "required": ["include"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "composite"
+                      },
+                      "assertions": {
+                        "type": "array",
+                        "items": {}
+                      },
+                      "evaluators": {
+                        "type": "array",
+                        "items": {}
+                      },
+                      "aggregator": {
+                        "anyOf": [
+                          {
+                            "type": "object",
+                            "properties": {
+                              "type": {
+                                "type": "string",
+                                "const": "weighted_average"
+                              },
+                              "weights": {
+                                "type": "object",
+                                "additionalProperties": {
+                                  "type": "number"
+                                }
+                              }
+                            },
+                            "required": ["type"],
+                            "additionalProperties": false
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "type": {
+                                "type": "string",
+                                "const": "threshold"
+                              },
+                              "threshold": {
+                                "type": "number",
+                                "minimum": 0,
+                                "maximum": 1
+                              }
+                            },
+                            "required": ["type", "threshold"],
+                            "additionalProperties": false
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "type": {
+                                "type": "string",
+                                "const": "code-grader"
+                              },
+                              "path": {
+                                "type": "string"
+                              },
+                              "cwd": {
+                                "type": "string"
+                              }
+                            },
+                            "required": ["type", "path"],
+                            "additionalProperties": false
+                          },
+                          {
+                            "type": "object",
+                            "properties": {
+                              "type": {
+                                "type": "string",
+                                "const": "llm-grader"
+                              },
+                              "prompt": {
+                                "type": "string"
+                              },
+                              "model": {
+                                "type": "string"
+                              }
+                            },
+                            "required": ["type"],
+                            "additionalProperties": false
+                          }
+                        ]
+                      }
+                    },
+                    "required": ["type", "aggregator"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["tool-trajectory", "tool_trajectory"]
+                      },
+                      "mode": {
+                        "type": "string",
+                        "enum": ["any_order", "in_order", "exact", "subset", "superset"]
+                      },
+                      "minimums": {
+                        "type": "object",
+                        "additionalProperties": {
+                          "type": "integer",
+                          "minimum": 0
+                        }
+                      },
+                      "expected": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "tool": {
+                              "type": "string"
+                            },
+                            "args": {
+                              "anyOf": [
+                                {
+                                  "type": "string",
+                                  "const": "any"
+                                },
+                                {
+                                  "type": "object",
+                                  "additionalProperties": {}
+                                }
+                              ]
+                            },
+                            "max_duration_ms": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "maxDurationMs": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "args_match": {
+                              "anyOf": [
+                                {
+                                  "type": "string",
+                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            },
+                            "argsMatch": {
+                              "anyOf": [
+                                {
+                                  "type": "string",
+                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            }
+                          },
+                          "required": ["tool"],
+                          "additionalProperties": false
+                        }
+                      },
+                      "args_match": {
+                        "anyOf": [
+                          {
+                            "type": "string",
+                            "enum": ["exact", "ignore", "subset", "superset"]
+                          },
+                          {
+                            "type": "array",
+                            "items": {
+                              "type": "string"
+                            }
+                          }
+                        ]
+                      },
+                      "argsMatch": {
+                        "anyOf": [
+                          {
+                            "type": "string",
+                            "enum": ["exact", "ignore", "subset", "superset"]
+                          },
+                          {
+                            "type": "array",
+                            "items": {
+                              "type": "string"
+                            }
+                          }
+                        ]
+                      }
+                    },
+                    "required": ["type", "mode"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["field-accuracy", "field_accuracy"]
+                      },
+                      "fields": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "path": {
+                              "type": "string"
+                            },
+                            "match": {
+                              "type": "string",
+                              "enum": ["exact", "numeric_tolerance", "date"]
+                            },
+                            "required": {
+                              "type": "boolean"
+                            },
+                            "weight": {
+                              "type": "number"
+                            },
+                            "tolerance": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "relative": {
+                              "type": "boolean"
+                            },
+                            "formats": {
+                              "type": "array",
+                              "items": {
+                                "type": "string"
+                              }
+                            }
+                          },
+                          "required": ["path", "match"],
+                          "additionalProperties": false
+                        },
+                        "minItems": 1
+                      },
+                      "aggregation": {
+                        "type": "string",
+                        "enum": ["weighted_average", "all_or_nothing"]
+                      }
+                    },
+                    "required": ["type", "fields"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "latency"
+                      },
+                      "threshold": {
+                        "type": "number",
+                        "minimum": 0
+                      }
+                    },
+                    "required": ["type", "threshold"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "cost"
+                      },
+                      "budget": {
+                        "type": "number",
+                        "minimum": 0
+                      }
+                    },
+                    "required": ["type", "budget"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["token-usage", "token_usage"]
+                      },
+                      "max_total": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "max_input": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "max_output": {
+                        "type": "number",
+                        "minimum": 0
+                      }
+                    },
+                    "required": ["type"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["execution-metrics", "execution_metrics"]
+                      },
+                      "max_tool_calls": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "max_llm_calls": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "max_tokens": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "max_cost_usd": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "max_duration_ms": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "target_exploration_ratio": {
+                        "type": "number",
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "exploration_tolerance": {
+                        "type": "number",
+                        "minimum": 0
+                      }
+                    },
+                    "required": ["type"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "contains"
+                      },
+                      "value": {
+                        "type": "string"
+                      }
+                    },
+                    "required": ["type", "value"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "regex"
+                      },
+                      "value": {
+                        "type": "string"
+                      }
+                    },
+                    "required": ["type", "value"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "enum": ["is-json", "is_json"]
+                      }
+                    },
+                    "required": ["type"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "equals"
+                      },
+                      "value": {
+                        "type": "string"
+                      }
+                    },
+                    "required": ["type", "value"],
+                    "additionalProperties": false
+                  },
+                  {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string"
+                      },
+                      "weight": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "required": {
+                        "anyOf": [
+                          {
+                            "type": "boolean"
+                          },
+                          {
+                            "type": "number",
+                            "exclusiveMinimum": true,
+                            "minimum": 0,
+                            "maximum": 1
+                          }
+                        ]
+                      },
+                      "min_score": {
+                        "type": "number",
+                        "exclusiveMinimum": true,
+                        "minimum": 0,
+                        "maximum": 1
+                      },
+                      "negate": {
+                        "type": "boolean"
+                      },
+                      "type": {
+                        "type": "string",
+                        "const": "rubrics"
+                      },
+                      "criteria": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "id": {
+                              "type": "string"
+                            },
+                            "outcome": {
+                              "type": "string"
+                            },
+                            "operator": {
+                              "type": "string",
+                              "enum": ["correctness", "contradiction"]
+                            },
+                            "weight": {
+                              "type": "number"
+                            },
+                            "required": {
+                              "type": "boolean"
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "score_ranges": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "score_range": {
+                                    "type": "array",
+                                    "minItems": 2,
+                                    "maxItems": 2,
+                                    "items": [
+                                      {
+                                        "type": "integer",
                                         "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "enum": ["is-json", "is_json"]
-                                      }
-                                    },
-                                    "required": ["type"],
-                                    "additionalProperties": false
-                                  },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
+                                        "maximum": 10
                                       },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
+                                      {
+                                        "type": "integer",
                                         "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "equals"
-                                      },
-                                      "value": {
-                                        "type": "string"
+                                        "maximum": 10
                                       }
-                                    },
-                                    "required": ["type", "value"],
-                                    "additionalProperties": false
+                                    ]
                                   },
-                                  {
-                                    "type": "object",
-                                    "properties": {
-                                      "name": {
-                                        "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number",
-                                        "minimum": 0
-                                      },
-                                      "required": {
-                                        "anyOf": [
-                                          {
-                                            "type": "boolean"
-                                          },
-                                          {
-                                            "type": "number",
-                                            "exclusiveMinimum": true,
-                                            "minimum": 0,
-                                            "maximum": 1
-                                          }
-                                        ]
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "negate": {
-                                        "type": "boolean"
-                                      },
-                                      "type": {
-                                        "type": "string",
-                                        "const": "rubrics"
-                                      },
-                                      "criteria": {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "object",
-                                          "properties": {
-                                            "id": {
-                                              "type": "string"
-                                            },
-                                            "outcome": {
-                                              "type": "string"
-                                            },
-                                            "operator": {
-                                              "type": "string",
-                                              "enum": ["correctness", "contradiction"]
-                                            },
-                                            "weight": {
-                                              "type": "number"
-                                            },
-                                            "required": {
-                                              "type": "boolean"
-                                            },
-                                            "min_score": {
-                                              "type": "number",
-                                              "exclusiveMinimum": true,
-                                              "minimum": 0,
-                                              "maximum": 1
-                                            },
-                                            "score_ranges": {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "object",
-                                                "properties": {
-                                                  "score_range": {
-                                                    "type": "array",
-                                                    "minItems": 2,
-                                                    "maxItems": 2,
-                                                    "items": [
-                                                      {
-                                                        "type": "integer",
-                                                        "minimum": 0,
-                                                        "maximum": 10
-                                                      },
-                                                      {
-                                                        "type": "integer",
-                                                        "minimum": 0,
-                                                        "maximum": 10
-                                                      }
-                                                    ]
-                                                  },
-                                                  "outcome": {
-                                                    "type": "string",
-                                                    "minLength": 1
-                                                  }
-                                                },
-                                                "required": ["score_range", "outcome"],
-                                                "additionalProperties": false
-                                              }
-                                            }
-                                          },
-                                          "additionalProperties": false
-                                        },
-                                        "minItems": 1
-                                      }
-                                    },
-                                    "required": ["type", "criteria"],
-                                    "additionalProperties": false
+                                  "outcome": {
+                                    "type": "string",
+                                    "minLength": 1
                                   }
-                                ]
+                                },
+                                "required": ["score_range", "outcome"],
+                                "additionalProperties": false
                               }
-                            ]
-                          }
-                        }
-                      },
-                      "required": ["input"],
-                      "additionalProperties": false
+                            }
+                          },
+                          "additionalProperties": false
+                        },
+                        "minItems": 1
+                      }
                     },
-                    "minItems": 1
-                  },
-                  "aggregation": {
-                    "type": "string",
-                    "enum": ["mean", "min", "max"]
-                  },
-                  "on_turn_failure": {
-                    "type": "string",
-                    "enum": ["continue", "stop"]
-                  },
-                  "window_size": {
-                    "type": "integer",
-                    "minimum": 1
+                    "required": ["type", "criteria"],
+                    "additionalProperties": false
                   }
+                ]
+              }
+            },
+            "skip_defaults": {
+              "type": "boolean"
+            },
+            "cache": {
+              "type": "boolean"
+            },
+            "trials": {
+              "not": {}
+            },
+            "budget_usd": {
+              "type": "number",
+              "exclusiveMinimum": true,
+              "minimum": 0
+            },
+            "budgetUsd": {
+              "type": "number",
+              "minimum": 0
+            },
+            "fail_on_error": {
+              "type": "boolean"
+            },
+            "failOnError": {
+              "type": "boolean"
+            },
+            "threshold": {
+              "type": "number",
+              "minimum": 0,
+              "maximum": 1
+            },
+            "agent": {
+              "type": "string",
+              "minLength": 1
+            },
+            "model": {
+              "type": "string",
+              "minLength": 1
+            },
+            "agent_options": {
+              "type": "object",
+              "properties": {},
+              "additionalProperties": {}
+            },
+            "scripts": {
+              "not": {}
+            },
+            "repeat": {
+              "type": "object",
+              "properties": {
+                "count": {
+                  "type": "integer",
+                  "minimum": 1
                 },
-                "required": ["id"],
-                "additionalProperties": false
-              }
+                "strategy": {
+                  "type": "string",
+                  "enum": ["pass_at_k", "pass_all", "mean", "confidence_interval"]
+                },
+                "cost_limit_usd": {
+                  "type": "number",
+                  "minimum": 0
+                },
+                "costLimitUsd": {
+                  "type": "number",
+                  "minimum": 0
+                }
+              },
+              "required": ["count"],
+              "additionalProperties": false
             },
-            {
-              "type": "string"
+            "runs": {
+              "type": "integer",
+              "minimum": 1
+            },
+            "early_exit": {
+              "type": "boolean"
+            },
+            "timeout_seconds": {
+              "type": "number",
+              "exclusiveMinimum": true,
+              "minimum": 0
+            },
+            "sandbox": {
+              "type": "string",
+              "enum": ["auto", "docker", "vercel"]
+            },
+            "workspace": {
+              "type": "object",
+              "properties": {},
+              "additionalProperties": {}
+            },
+            "setup": {
+              "not": {}
             }
-          ]
-        },
-        "target": {
-          "type": "string"
+          },
+          "additionalProperties": false
         },
         "execution": {
           "type": "object",
@@ -13307,7 +16169,8 @@
               "items": {
                 "anyOf": [
                   {
-                    "type": "string"
+                    "type": "string",
+                    "minLength": 1
                   },
                   {
                     "type": "object",
@@ -13317,200 +16180,21 @@
                         "minLength": 1
                       },
                       "use_target": {
-                        "type": "string"
+                        "type": "string",
+                        "minLength": 1
                       },
                       "hooks": {
                         "type": "object",
-                        "properties": {
-                          "before_all": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "script": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          },
-                          "before_each": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "script": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          },
-                          "after_each": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "script": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          },
-                          "after_all": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "script": {
-                                "anyOf": [
-                                  {
-                                    "type": "string"
-                                  },
-                                  {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
-                                ]
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          }
-                        },
-                        "additionalProperties": false
+                        "properties": {},
+                        "additionalProperties": {}
                       }
                     },
                     "required": ["name"],
                     "additionalProperties": false
                   }
                 ]
-              }
+              },
+              "minItems": 1
             },
             "workers": {
               "type": "integer",
@@ -15822,6 +18506,7 @@
             },
             "budget_usd": {
               "type": "number",
+              "exclusiveMinimum": true,
               "minimum": 0
             },
             "budgetUsd": {
@@ -15838,6 +18523,69 @@
               "type": "number",
               "minimum": 0,
               "maximum": 1
+            },
+            "agent": {
+              "type": "string",
+              "minLength": 1
+            },
+            "model": {
+              "type": "string",
+              "minLength": 1
+            },
+            "agent_options": {
+              "type": "object",
+              "properties": {},
+              "additionalProperties": {}
+            },
+            "scripts": {
+              "not": {}
+            },
+            "repeat": {
+              "type": "object",
+              "properties": {
+                "count": {
+                  "type": "integer",
+                  "minimum": 1
+                },
+                "strategy": {
+                  "type": "string",
+                  "enum": ["pass_at_k", "pass_all", "mean", "confidence_interval"]
+                },
+                "cost_limit_usd": {
+                  "type": "number",
+                  "minimum": 0
+                },
+                "costLimitUsd": {
+                  "type": "number",
+                  "minimum": 0
+                }
+              },
+              "required": ["count"],
+              "additionalProperties": false
+            },
+            "runs": {
+              "type": "integer",
+              "minimum": 1
+            },
+            "early_exit": {
+              "type": "boolean"
+            },
+            "timeout_seconds": {
+              "type": "number",
+              "exclusiveMinimum": true,
+              "minimum": 0
+            },
+            "sandbox": {
+              "type": "string",
+              "enum": ["auto", "docker", "vercel"]
+            },
+            "workspace": {
+              "type": "object",
+              "properties": {},
+              "additionalProperties": {}
+            },
+            "setup": {
+              "not": {}
             }
           },
           "additionalProperties": false
diff --git a/skills-data/agentv-eval-writer/references/experiment-schema.json b/skills-data/agentv-eval-writer/references/experiment-schema.json
deleted file mode 100644
index 86414774c..000000000
--- a/skills-data/agentv-eval-writer/references/experiment-schema.json
+++ /dev/null
@@ -1,278 +0,0 @@
-{
-  "$schema": "https://json-schema.org/draft/2019-09/schema#",
-  "title": "AgentV Experiment File",
-  "description": "Schema for AgentV experiment YAML files (experiments/*.yaml)",
-  "$ref": "#/definitions/ExperimentFile",
-  "definitions": {
-    "ExperimentFile": {
-      "type": "object",
-      "properties": {
-        "name": {
-          "type": "string",
-          "minLength": 1
-        },
-        "agent": {
-          "type": "string",
-          "minLength": 1
-        },
-        "target": {
-          "type": "string",
-          "minLength": 1
-        },
-        "targets": {
-          "type": "array",
-          "items": {
-            "anyOf": [
-              {
-                "type": "string",
-                "minLength": 1
-              },
-              {
-                "type": "object",
-                "properties": {
-                  "name": {
-                    "type": "string",
-                    "minLength": 1
-                  },
-                  "use_target": {
-                    "type": "string",
-                    "minLength": 1
-                  },
-                  "hooks": {
-                    "type": "object",
-                    "properties": {},
-                    "additionalProperties": {}
-                  }
-                },
-                "required": ["name"],
-                "additionalProperties": false
-              }
-            ]
-          },
-          "minItems": 1
-        },
-        "model": {
-          "type": "string",
-          "minLength": 1
-        },
-        "agent_options": {
-          "type": "object",
-          "properties": {},
-          "additionalProperties": {}
-        },
-        "suites": {
-          "type": "array",
-          "items": {
-            "type": "object",
-            "properties": {
-              "ref": {
-                "type": "string",
-                "minLength": 1
-              },
-              "select": {
-                "type": "object",
-                "properties": {
-                  "test_ids": {
-                    "type": "array",
-                    "items": {
-                      "type": "string",
-                      "minLength": 1
-                    },
-                    "minItems": 1
-                  }
-                },
-                "required": ["test_ids"],
-                "additionalProperties": false
-              }
-            },
-            "required": ["ref"],
-            "additionalProperties": false
-          },
-          "minItems": 1
-        },
-        "scripts": {
-          "type": "array",
-          "items": {
-            "anyOf": [
-              {
-                "type": "string",
-                "minLength": 1
-              },
-              {
-                "type": "object",
-                "properties": {
-                  "command": {
-                    "anyOf": [
-                      {
-                        "type": "string",
-                        "minLength": 1
-                      },
-                      {
-                        "type": "array",
-                        "items": {
-                          "type": "string",
-                          "minLength": 1
-                        },
-                        "minItems": 1
-                      }
-                    ]
-                  },
-                  "script": {
-                    "anyOf": [
-                      {
-                        "type": "string",
-                        "minLength": 1
-                      },
-                      {
-                        "type": "array",
-                        "items": {
-                          "type": "string",
-                          "minLength": 1
-                        },
-                        "minItems": 1
-                      }
-                    ]
-                  },
-                  "timeout_seconds": {
-                    "type": "number",
-                    "exclusiveMinimum": true,
-                    "minimum": 0
-                  },
-                  "cwd": {
-                    "type": "string",
-                    "minLength": 1
-                  },
-                  "env": {
-                    "type": "object",
-                    "additionalProperties": {
-                      "type": "string"
-                    }
-                  }
-                },
-                "additionalProperties": false
-              }
-            ]
-          }
-        },
-        "repeat": {
-          "type": "object",
-          "properties": {
-            "count": {
-              "type": "integer",
-              "minimum": 1
-            },
-            "strategy": {
-              "type": "string",
-              "enum": ["pass_at_k", "mean", "confidence_interval"]
-            },
-            "cost_limit_usd": {
-              "type": "number",
-              "minimum": 0
-            },
-            "costLimitUsd": {
-              "type": "number",
-              "minimum": 0
-            }
-          },
-          "required": ["count"],
-          "additionalProperties": false
-        },
-        "runs": {
-          "type": "integer",
-          "minimum": 1
-        },
-        "early_exit": {
-          "type": "boolean"
-        },
-        "timeout_seconds": {
-          "type": "number",
-          "exclusiveMinimum": true,
-          "minimum": 0
-        },
-        "workers": {
-          "type": "integer",
-          "minimum": 1
-        },
-        "budget_usd": {
-          "type": "number",
-          "exclusiveMinimum": true,
-          "minimum": 0
-        },
-        "sandbox": {
-          "type": "string",
-          "enum": ["auto", "docker", "vercel"]
-        },
-        "workspace": {
-          "type": "object",
-          "properties": {},
-          "additionalProperties": {}
-        },
-        "setup": {
-          "type": "array",
-          "items": {
-            "anyOf": [
-              {
-                "type": "string",
-                "minLength": 1
-              },
-              {
-                "type": "object",
-                "properties": {
-                  "command": {
-                    "anyOf": [
-                      {
-                        "type": "string",
-                        "minLength": 1
-                      },
-                      {
-                        "type": "array",
-                        "items": {
-                          "type": "string",
-                          "minLength": 1
-                        },
-                        "minItems": 1
-                      }
-                    ]
-                  },
-                  "script": {
-                    "anyOf": [
-                      {
-                        "type": "string",
-                        "minLength": 1
-                      },
-                      {
-                        "type": "array",
-                        "items": {
-                          "type": "string",
-                          "minLength": 1
-                        },
-                        "minItems": 1
-                      }
-                    ]
-                  },
-                  "timeout_seconds": {
-                    "type": "number",
-                    "exclusiveMinimum": true,
-                    "minimum": 0
-                  },
-                  "cwd": {
-                    "type": "string",
-                    "minLength": 1
-                  },
-                  "env": {
-                    "type": "object",
-                    "additionalProperties": {
-                      "type": "string"
-                    }
-                  }
-                },
-                "additionalProperties": false
-              }
-            ]
-          }
-        }
-      },
-      "additionalProperties": false
-    }
-  }
-}