diff --git a/apps/web/src/content/docs/docs/evaluation/sdk.mdx b/apps/web/src/content/docs/docs/evaluation/sdk.mdx index d240abd75..90c30f8d5 100644 --- a/apps/web/src/content/docs/docs/evaluation/sdk.mdx +++ b/apps/web/src/content/docs/docs/evaluation/sdk.mdx @@ -9,7 +9,7 @@ YAML remains AgentV's canonical, portable eval format. The SDK surfaces below ar AgentV currently provides two npm packages for programmatic use: -- **`@agentv/sdk`** — custom assertions and code graders +- **`@agentv/sdk`** — YAML-aligned eval authoring, custom assertions, and code graders - **`@agentv/core`** — programmatic evaluation API and typed configuration ## Installation @@ -27,12 +27,52 @@ npm install @agentv/core Use the simplest surface that matches the job: - **YAML / JSONL first** for portable eval specs you want to run from the CLI, check into a repo, or share across TypeScript and Python workflows. +- **`defineEval()` / `evalSuite()`** when you want a `.eval.ts` file that mirrors YAML concepts and lowers back to the canonical snake_case contract. - **`evaluate({ specFile })`** when you want library control around an existing YAML suite. - **Inline `evaluate({ tests })`** when the eval definition truly belongs inside application code. The programmatic API mirrors YAML, but uses current TypeScript naming such as `expectedOutput` and `assert`. - **`defineAssertion` / `defineCodeGrader`** when the grading logic itself must execute code. There is no separate first-party Python authoring SDK today. Python-facing workflows should either emit canonical YAML/JSONL or implement executable graders that consume the standard `snake_case` wire format. +## YAML-Aligned `.eval.ts` Authoring + +Use `defineEval()` from `@agentv/sdk` when you want TypeScript ergonomics without creating a second eval vocabulary. The helper keeps authoring in camelCase where TypeScript needs it, then lowers back to the canonical snake_case eval object contract when AgentV loads the file. 
+ +```typescript +// evals/greeting.eval.ts +import { defineEval } from '@agentv/sdk'; + +export default defineEval({ + name: 'hello-suite', + execution: { + targets: ['mock-sdk'], + }, + workspace: { + hooks: { + beforeAll: { + command: ['echo', 'suite-start'], + }, + }, + }, + tests: [ + { + id: 'hello', + input: 'Say hello', + inputFiles: ['../fixtures/per-test-note.md'], + expectedOutput: 'Hello from the mock target', + assertions: [{ type: 'contains', value: 'Hello' }], + }, + ], +}); +``` + +Useful companion helpers: + +- `toEvalYamlObject()` returns the canonical snake_case object. +- `serializeEvalYaml()` returns YAML text using the same canonical field names. + +The durable field remains `assertions`. This helper does not introduce a second YAML vocabulary. + ## Custom Assertions Use `defineAssertion` from `@agentv/sdk` to create reusable assertion types. Place them in `.agentv/assertions/` — they're auto-discovered by filename. diff --git a/bun.lock b/bun.lock index 94c258d5c..9c7492892 100644 --- a/bun.lock +++ b/bun.lock @@ -142,6 +142,7 @@ "name": "@agentv/sdk", "version": "4.41.1-next.1", "dependencies": { + "yaml": "^2.8.3", "zod": "^3.23.8", }, }, diff --git a/examples/README.md b/examples/README.md index f35579210..cf526ea23 100644 --- a/examples/README.md +++ b/examples/README.md @@ -54,6 +54,7 @@ Focused demonstrations of specific AgentV capabilities. 
Each example includes it - [code-grader-sdk](features/code-grader-sdk/) - TypeScript SDK for code graders using `defineCodeGrader()` - [sdk-custom-assertion](features/sdk-custom-assertion/) - Custom assertion types using `defineAssertion()` - [sdk-programmatic-api](features/sdk-programmatic-api/) - Programmatic evaluation using `evaluate()` +- [sdk-eval-authoring](features/sdk-eval-authoring/) - YAML-aligned `.eval.ts` authoring using `defineEval()` - [sdk-config-file](features/sdk-config-file/) - Typed configuration with `defineConfig()` - [prompt-template-sdk](features/prompt-template-sdk/) - Custom LLM grader prompts using `definePromptTemplate()` diff --git a/examples/features/README.md b/examples/features/README.md index e70a44672..f51f36c34 100644 --- a/examples/features/README.md +++ b/examples/features/README.md @@ -122,6 +122,7 @@ Focused examples for specific AgentV capabilities. Find your use case below, the | Example | Description | |---------|-------------| | [sdk-custom-assertion](sdk-custom-assertion/) | Custom assertion types using `defineAssertion()` | +| [sdk-eval-authoring](sdk-eval-authoring/) | YAML-aligned `.eval.ts` authoring using `defineEval()` | | [sdk-programmatic-api](sdk-programmatic-api/) | Programmatic evaluation using `evaluate()` | | [sdk-config-file](sdk-config-file/) | Typed configuration with `defineConfig()` | | [prompt-template-sdk](prompt-template-sdk/) | Custom LLM grader prompts using `definePromptTemplate()` | @@ -167,6 +168,7 @@ Focused examples for specific AgentV capabilities. 
Find your use case below, the | [rubric](rubric/) | LLM grading | | [sdk-config-file](sdk-config-file/) | TypeScript SDK | | [sdk-custom-assertion](sdk-custom-assertion/) | TypeScript SDK | +| [sdk-eval-authoring](sdk-eval-authoring/) | TypeScript SDK | | [sdk-programmatic-api](sdk-programmatic-api/) | TypeScript SDK | | [suite-level-input](suite-level-input/) | Dataset & input | | [suite-level-input-files](suite-level-input-files/) | Dataset & input | diff --git a/examples/features/sdk-eval-authoring/.agentv/targets.yaml b/examples/features/sdk-eval-authoring/.agentv/targets.yaml new file mode 100644 index 000000000..4142ab434 --- /dev/null +++ b/examples/features/sdk-eval-authoring/.agentv/targets.yaml @@ -0,0 +1,4 @@ +targets: + - name: mock-sdk + provider: mock + response: Hello from the mock target diff --git a/examples/features/sdk-eval-authoring/README.md b/examples/features/sdk-eval-authoring/README.md new file mode 100644 index 000000000..926d44683 --- /dev/null +++ b/examples/features/sdk-eval-authoring/README.md @@ -0,0 +1,27 @@ +# SDK Example: YAML-Aligned Eval Authoring + +Demonstrates authoring a `.eval.ts` suite with `defineEval()` from `@agentv/sdk` while still lowering to AgentV's canonical snake_case YAML/runtime contract. + +## What It Shows + +1. `defineEval()` brands a TypeScript suite for the `.eval.ts` loader. +2. CamelCase authoring fields such as `inputFiles`, `expectedOutput`, `beforeAll`, and `beforeEach` lower to the canonical YAML/runtime keys. +3. The suite still runs through the standard CLI and YAML parser path instead of a separate SDK runner. 
+ +## Files + +- `evals/greeting.eval.ts` — the YAML-aligned TypeScript suite +- `.agentv/targets.yaml` — local mock target for a zero-credential run +- `fixtures/*.md` — attached input files used by the suite + +## How to Run + +```bash +# From repository root +cd examples/features/sdk-eval-authoring +bun install + +bun ../../../apps/cli/src/cli.ts eval evals/greeting.eval.ts +``` + +The example uses a local `mock` target, so it does not require API credentials. diff --git a/examples/features/sdk-eval-authoring/evals/greeting.eval.ts b/examples/features/sdk-eval-authoring/evals/greeting.eval.ts new file mode 100644 index 000000000..ea4f8324f --- /dev/null +++ b/examples/features/sdk-eval-authoring/evals/greeting.eval.ts @@ -0,0 +1,34 @@ +import { defineEval } from '@agentv/sdk'; + +export default defineEval({ + name: 'sdk-eval-authoring', + description: 'YAML-aligned TypeScript eval authoring with @agentv/sdk', + inputFiles: ['../fixtures/shared-context.md'], + execution: { + targets: ['mock-sdk'], + }, + workspace: { + hooks: { + beforeAll: { + command: ['echo', 'suite-start'], + }, + }, + }, + tests: [ + { + id: 'hello-from-typescript', + input: 'Use the attached notes and say hello.', + inputFiles: ['../fixtures/per-test-note.md'], + expectedOutput: 'Hello from the mock target', + assertions: [{ type: 'contains', value: 'Hello' }], + workspace: { + hooks: { + beforeEach: { + command: ['echo', 'per-test-setup'], + timeoutMs: 1_000, + }, + }, + }, + }, + ], +}); diff --git a/examples/features/sdk-eval-authoring/fixtures/per-test-note.md b/examples/features/sdk-eval-authoring/fixtures/per-test-note.md new file mode 100644 index 000000000..0b30fa4cf --- /dev/null +++ b/examples/features/sdk-eval-authoring/fixtures/per-test-note.md @@ -0,0 +1 @@ +Include the word "hello" in the response. 
diff --git a/examples/features/sdk-eval-authoring/fixtures/shared-context.md b/examples/features/sdk-eval-authoring/fixtures/shared-context.md new file mode 100644 index 000000000..d77029741 --- /dev/null +++ b/examples/features/sdk-eval-authoring/fixtures/shared-context.md @@ -0,0 +1 @@ +Use a friendly tone. diff --git a/examples/features/sdk-eval-authoring/package.json b/examples/features/sdk-eval-authoring/package.json new file mode 100644 index 000000000..4c3ba8acb --- /dev/null +++ b/examples/features/sdk-eval-authoring/package.json @@ -0,0 +1,8 @@ +{ + "name": "agentv-example-sdk-eval-authoring", + "private": true, + "type": "module", + "dependencies": { + "@agentv/sdk": "file:../../../packages/sdk" + } +} diff --git a/packages/core/src/evaluation/loaders/ts-eval-loader.ts b/packages/core/src/evaluation/loaders/ts-eval-loader.ts index 3556c8d8a..b5e20e119 100644 --- a/packages/core/src/evaluation/loaders/ts-eval-loader.ts +++ b/packages/core/src/evaluation/loaders/ts-eval-loader.ts @@ -15,12 +15,19 @@ import { type EvalConfig, materializeEvalConfig } from '../evaluate.js'; import { createFunctionProvider } from '../providers/function-provider.js'; import type { ProviderFactoryFn } from '../providers/provider-registry.js'; import type { TargetDefinition } from '../providers/types.js'; -import type { EvalSuiteResult } from '../yaml-parser.js'; +import { type EvalSuiteResult, loadTestSuiteFromYamlObject } from '../yaml-parser.js'; const EXPORT_NAMES = ['default', 'config', 'evalConfig'] as const; +const SDK_EVAL_SUITE_SYMBOL = Symbol.for('@agentv/sdk/eval-suite'); +const SDK_TO_EVAL_YAML_OBJECT_SYMBOL = Symbol.for('@agentv/sdk/to-eval-yaml-object'); + +type SdkEvalSuiteExport = Record & { + readonly [SDK_EVAL_SUITE_SYMBOL]: true; + readonly [SDK_TO_EVAL_YAML_OBJECT_SYMBOL]: () => Record; +}; export interface TsEvalResult { - readonly config: EvalConfig; + readonly config: EvalConfig | SdkEvalSuiteExport; readonly filePath: string; } @@ -38,10 +45,10 @@ export 
async function loadTsEvalFile(filePath: string): Promise { const moduleUrl = pathToFileURL(absolutePath).href; const module = await import(moduleUrl); - let config: EvalConfig | undefined; + let config: EvalConfig | SdkEvalSuiteExport | undefined; for (const name of EXPORT_NAMES) { const candidate = module[name]; - if (isEvalConfigLike(candidate)) { + if (isSupportedTsEvalExport(candidate)) { config = candidate; break; } @@ -49,7 +56,7 @@ export async function loadTsEvalFile(filePath: string): Promise { if (!config) { throw new Error( - `${filePath}: no EvalConfig export found. Export an EvalConfig as default, 'config', or 'evalConfig'.`, + `${filePath}: no supported eval export found. Export defineEval(...) or an EvalConfig as default, 'config', or 'evalConfig'.`, ); } @@ -66,6 +73,16 @@ export async function loadTsEvalSuite( }, ): Promise { const { config, filePath: absolutePath } = await loadTsEvalFile(filePath); + + if (isSdkEvalSuiteExport(config)) { + return loadTestSuiteFromYamlObject( + absolutePath, + config[SDK_TO_EVAL_YAML_OBJECT_SYMBOL](), + repoRoot, + options, + ); + } + const materialized = await materializeEvalConfig(config, { repoRoot, baseDir: path.dirname(absolutePath), @@ -98,6 +115,19 @@ export async function loadTsEvalSuite( }; } +function isSdkEvalSuiteExport(value: unknown): value is SdkEvalSuiteExport { + return ( + !!value && + typeof value === 'object' && + (value as SdkEvalSuiteExport)[SDK_EVAL_SUITE_SYMBOL] === true && + typeof (value as SdkEvalSuiteExport)[SDK_TO_EVAL_YAML_OBJECT_SYMBOL] === 'function' + ); +} + +function isSupportedTsEvalExport(value: unknown): value is EvalConfig | SdkEvalSuiteExport { + return isSdkEvalSuiteExport(value) || isEvalConfigLike(value); +} + /** * Duck-type check for EvalConfig-like objects. * An EvalConfig must have at least one of: tests, specFile, or target. 
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index c82592992..0c9efccf9 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -315,27 +315,28 @@ export async function loadTestSuite( repoRoot, options, ); - const metadata = parseMetadata(parsed); - const failOnError = extractFailOnError(parsed); - const threshold = extractThreshold(parsed); - return { - tests, - trials: extractTrialsConfig(parsed), - targets: extractTargetsFromSuite(parsed), - targetRefs: extractTargetRefsFromSuite(parsed), - workers: extractWorkersFromSuite(parsed), - cacheConfig: extractCacheConfig(parsed), - budgetUsd: extractBudgetUsd(parsed), - ...(metadata !== undefined && { metadata }), - ...(failOnError !== undefined && { failOnError }), - ...(threshold !== undefined && { threshold }), - ...(suiteWorkspacePath !== undefined && { workspacePath: suiteWorkspacePath }), - }; + return buildEvalSuiteResult(parsed, tests, suiteWorkspacePath); } /** @deprecated Use `loadTestSuite` instead */ export const loadEvalSuite = loadTestSuite; +export async function loadTestSuiteFromYamlObject( + evalFilePath: string, + suiteObject: unknown, + repoRoot: URL | string, + options?: LoadOptions, +): Promise { + const { tests, parsed, suiteWorkspacePath } = await loadTestsFromParsedYamlValue( + suiteObject, + evalFilePath, + repoRoot, + options, + ); + + return buildEvalSuiteResult(parsed, tests, suiteWorkspacePath); +} + export async function loadTests( evalFilePath: string, repoRoot: URL | string, @@ -366,7 +367,18 @@ async function loadTestsFromYaml( repoRoot: URL | string, options?: LoadOptions, ): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject; suiteWorkspacePath?: string }> { - // YAML parsing (existing implementation) + const absoluteTestPath = path.resolve(evalFilePath); + const rawFile = await readFile(absoluteTestPath, 'utf8'); + + return 
loadTestsFromParsedYamlValue(parseYamlValue(rawFile), evalFilePath, repoRoot, options); +} + +async function loadTestsFromParsedYamlValue( + rawParsed: unknown, + evalFilePath: string, + repoRoot: URL | string, + options?: LoadOptions, +): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject; suiteWorkspacePath?: string }> { const verbose = options?.verbose ?? false; const filterPattern = options?.filter; const absoluteTestPath = path.resolve(evalFilePath); @@ -377,8 +389,6 @@ async function loadTestsFromYaml( // Load configuration (walks up directory tree to repo root) const config = await loadConfig(absoluteTestPath, repoRootPath); - const rawFile = await readFile(absoluteTestPath, 'utf8'); - const rawParsed = parseYamlValue(rawFile) as unknown; const rawCaseSnapshots = buildRawInlineTestSnapshots(rawParsed); const interpolated = interpolateEnv(rawParsed, process.env) as unknown; if (!isJsonObject(interpolated)) { @@ -715,6 +725,30 @@ async function loadTestsFromYaml( return { tests: results, parsed: suite, suiteWorkspacePath: suiteWorkspace?.path }; } +function buildEvalSuiteResult( + parsed: JsonObject, + tests: readonly EvalTest[], + suiteWorkspacePath?: string, +): EvalSuiteResult { + const metadata = parseMetadata(parsed); + const failOnError = extractFailOnError(parsed); + const threshold = extractThreshold(parsed); + + return { + tests, + trials: extractTrialsConfig(parsed), + targets: extractTargetsFromSuite(parsed), + targetRefs: extractTargetRefsFromSuite(parsed), + workers: extractWorkersFromSuite(parsed), + cacheConfig: extractCacheConfig(parsed), + budgetUsd: extractBudgetUsd(parsed), + ...(metadata !== undefined && { metadata }), + ...(failOnError !== undefined && { failOnError }), + ...(threshold !== undefined && { threshold }), + ...(suiteWorkspacePath !== undefined && { workspacePath: suiteWorkspacePath }), + }; +} + const SOURCE_SECRET_KEY_PATTERN = /(api[_-]?key|authorization|bearer|credential|password|private[_-]?key|secret|token)/i; const 
REDACTED_SOURCE_VALUE = '[redacted]'; diff --git a/packages/core/test/evaluation/loaders/fixtures/sdk-define-eval.eval.ts b/packages/core/test/evaluation/loaders/fixtures/sdk-define-eval.eval.ts new file mode 100644 index 000000000..60f95f8db --- /dev/null +++ b/packages/core/test/evaluation/loaders/fixtures/sdk-define-eval.eval.ts @@ -0,0 +1,37 @@ +import { defineEval } from '../../../../../sdk/src/index.ts'; + +export default defineEval({ + name: 'sdk-define-eval-suite', + description: 'YAML-aligned TypeScript suite authored with @agentv/sdk', + tags: ['sdk', 'typescript', 'yaml'], + execution: { + targets: ['mock-target'], + workers: 2, + skipDefaults: true, + budgetUsd: 2, + threshold: 0.75, + }, + workspace: { + hooks: { + beforeAll: { + command: ['echo', 'suite-setup'], + }, + }, + }, + tests: [ + { + id: 'sdk-define-eval', + input: 'Say hello', + expectedOutput: 'hello there', + assertions: [{ type: 'contains', value: 'hello' }], + workspace: { + hooks: { + beforeEach: { + command: ['echo', 'case-setup'], + timeoutMs: 1_000, + }, + }, + }, + }, + ], +}); diff --git a/packages/core/test/evaluation/loaders/ts-eval-loader.test.ts b/packages/core/test/evaluation/loaders/ts-eval-loader.test.ts index 7a453ab3c..2298ec0f7 100644 --- a/packages/core/test/evaluation/loaders/ts-eval-loader.test.ts +++ b/packages/core/test/evaluation/loaders/ts-eval-loader.test.ts @@ -9,26 +9,36 @@ const fixtureDir = path.join(import.meta.dir, 'fixtures'); describe('loadTsEvalFile', () => { it('loads default export', async () => { const result = await loadTsEvalFile(path.join(fixtureDir, 'default-export.eval.ts')); + const tests = (result.config as { tests?: Array<{ id?: string }> }).tests; expect(result.config).toBeDefined(); - expect(result.config.tests).toHaveLength(1); - expect(result.config.tests?.[0].id).toBe('greeting'); + expect(tests).toHaveLength(1); + expect(tests?.[0]?.id).toBe('greeting'); }); it('loads named "config" export', async () => { const result = await 
loadTsEvalFile(path.join(fixtureDir, 'named-config.eval.ts')); + const tests = (result.config as { tests?: Array<{ id?: string }> }).tests; expect(result.config).toBeDefined(); - expect(result.config.tests?.[0].id).toBe('named-config'); + expect(tests?.[0]?.id).toBe('named-config'); }); it('loads named "evalConfig" export', async () => { const result = await loadTsEvalFile(path.join(fixtureDir, 'eval-config-named.eval.ts')); + const tests = (result.config as { tests?: Array<{ id?: string }> }).tests; expect(result.config).toBeDefined(); - expect(result.config.tests?.[0].id).toBe('eval-config-named'); + expect(tests?.[0]?.id).toBe('eval-config-named'); }); - it('throws when no EvalConfig export found', async () => { + it('loads YAML-aligned sdk eval exports', async () => { + const result = await loadTsEvalFile(path.join(fixtureDir, 'sdk-define-eval.eval.ts')); + const tests = (result.config as { tests?: Array<{ id?: string }> }).tests; + expect(result.config).toBeDefined(); + expect(tests?.[0]?.id).toBe('sdk-define-eval'); + }); + + it('throws when no supported eval export is found', async () => { await expect(loadTsEvalFile(path.join(fixtureDir, 'no-config.eval.ts'))).rejects.toThrow( - 'no EvalConfig export found', + 'no supported eval export found', ); }); @@ -54,6 +64,27 @@ describe('loadTsEvalFile', () => { expect(suite.inlineTarget?.name).toBe('inline-target'); }); + it('materializes a YAML-aligned sdk eval through loadTestSuite', async () => { + const suite = await loadTestSuite( + path.join(fixtureDir, 'sdk-define-eval.eval.ts'), + fixtureDir, + { + category: 'sdk', + }, + ); + + expect(suite.tests).toHaveLength(1); + expect(suite.tests[0].suite).toBe('sdk-define-eval-suite'); + expect(suite.tests[0].workspace?.hooks?.before_all?.command).toEqual(['echo', 'suite-setup']); + expect(suite.tests[0].workspace?.hooks?.before_each?.command).toEqual(['echo', 'case-setup']); + expect(suite.tests[0].workspace?.hooks?.before_each?.timeout_ms).toBe(1_000); + 
expect(suite.targets).toEqual(['mock-target']); + expect(suite.workers).toBe(2); + expect(suite.budgetUsd).toBe(2); + expect(suite.threshold).toBe(0.75); + expect(suite.metadata?.tags).toEqual(['sdk', 'typescript', 'yaml']); + }); + it('routes TypeScript evals through loadTests', async () => { const tests = await loadTests(path.join(fixtureDir, 'default-export.eval.ts'), fixtureDir, { category: 'sdk', diff --git a/packages/sdk/README.md b/packages/sdk/README.md index 93f514b2e..2b4f6f210 100644 --- a/packages/sdk/README.md +++ b/packages/sdk/README.md @@ -1,6 +1,6 @@ # @agentv/sdk -Evaluation SDK for AgentV - build custom graders and prompt templates around the canonical AgentV eval model. +Evaluation SDK for AgentV - build YAML-aligned eval suites, custom graders, and prompt templates around the canonical AgentV eval model. ## Installation @@ -41,11 +41,37 @@ export default defineCodeGrader(({ output, traceSummary }) => ({ Both functions handle stdin/stdout parsing, snake_case conversion, Zod validation, and error handling automatically. +### defineEval (YAML-aligned `.eval.ts` authoring) + +```typescript +#!/usr/bin/env bun +import { defineEval } from '@agentv/sdk'; + +export default defineEval({ + name: 'hello-suite', + execution: { + targets: ['mock-sdk'], + }, + tests: [ + { + id: 'hello', + input: 'Say hello', + expectedOutput: 'Hello from the mock target', + assertions: [{ type: 'contains', value: 'Hello' }], + }, + ], +}); +``` + +`defineEval()` keeps TypeScript authoring in camelCase and lowers to the canonical snake_case YAML/runtime contract when AgentV loads the `.eval.ts` file. 
+ ## Exports - `defineAssertion(handler)` - Define a custom assertion (pass/fail + optional score) - `defineCodeGrader(handler)` - Define a code grader grader (full score control) - `definePromptTemplate(handler)` - Define a dynamic prompt template +- `defineEval(definition)` / `evalSuite(definition)` - Define a YAML-aligned `.eval.ts` suite +- `toEvalYamlObject(definition)` / `serializeEvalYaml(definition)` - Lower or serialize canonical eval YAML - `AssertionContext`, `AssertionScore` - Assertion types - `CodeGraderInput`, `CodeGraderResult` - Code grader types - `TraceSummary`, `Message`, `ToolCall` - Trace data types diff --git a/packages/sdk/package.json b/packages/sdk/package.json index 707ce18b7..68d947f96 100644 --- a/packages/sdk/package.json +++ b/packages/sdk/package.json @@ -32,6 +32,7 @@ }, "files": ["dist", "README.md"], "dependencies": { + "yaml": "^2.8.3", "zod": "^3.23.8" } } diff --git a/packages/sdk/src/eval.ts b/packages/sdk/src/eval.ts new file mode 100644 index 000000000..e38141154 --- /dev/null +++ b/packages/sdk/src/eval.ts @@ -0,0 +1,286 @@ +import { stringify as stringifyYaml } from 'yaml'; + +const EVAL_SUITE_SYMBOL = Symbol.for('@agentv/sdk/eval-suite'); +const TO_EVAL_YAML_OBJECT_SYMBOL = Symbol.for('@agentv/sdk/to-eval-yaml-object'); + +const KNOWN_SNAKE_CASE_KEYS = { + afterAll: 'after_all', + afterEach: 'after_each', + argsMatch: 'args_match', + baseCommit: 'base_commit', + beforeAll: 'before_all', + beforeEach: 'before_each', + budgetUsd: 'budget_usd', + conversationId: 'conversation_id', + costLimitUsd: 'cost_limit_usd', + dependsOn: 'depends_on', + expectedOutput: 'expected_output', + explorationTolerance: 'exploration_tolerance', + failOnError: 'fail_on_error', + inputFiles: 'input_files', + keepWorkspaces: 'keep_workspaces', + maxCostUsd: 'max_cost_usd', + maxDurationMs: 'max_duration_ms', + maxInput: 'max_input', + maxLlmCalls: 'max_llm_calls', + maxOutput: 'max_output', + maxTokens: 'max_tokens', + maxToolCalls: 
'max_tool_calls', + minScore: 'min_score', + onDependencyFailure: 'on_dependency_failure', + onTurnFailure: 'on_turn_failure', + outputPath: 'output_path', + scoreRanges: 'score_ranges', + skipDefaults: 'skip_defaults', + targetExplorationRatio: 'target_exploration_ratio', + timeoutMs: 'timeout_ms', + useTarget: 'use_target', + windowSize: 'window_size', +} as const; + +type KnownSnakeCaseKeyMap = typeof KNOWN_SNAKE_CASE_KEYS; + +type LowerEvalKey = Key extends keyof KnownSnakeCaseKeyMap + ? KnownSnakeCaseKeyMap[Key] + : Key; + +export type LowerEvalYamlValue = Value extends readonly (infer Item)[] + ? LowerEvalYamlValue[] + : Value extends object + ? { + [Key in keyof Value as Key extends string ? LowerEvalKey : never]: LowerEvalYamlValue< + Value[Key] + >; + } + : Value; + +export type EvalMessageContent = + | string + | Readonly> + | readonly (string | Readonly>)[]; + +export interface EvalMessage { + readonly role: 'system' | 'user' | 'assistant' | 'tool'; + readonly content: EvalMessageContent; + readonly [key: string]: unknown; +} + +export interface EvalAssertionConfig { + readonly type: string; + readonly [key: string]: unknown; +} + +export interface EvalPreprocessor { + readonly type: string; + readonly command: string | readonly string[]; + readonly [key: string]: unknown; +} + +export interface EvalWorkspaceHook { + readonly command?: string | readonly string[]; + readonly script?: string | readonly string[]; + readonly timeoutMs?: number; + readonly cwd?: string; + readonly reset?: 'none' | 'fast' | 'strict'; + readonly [key: string]: unknown; +} + +export interface EvalWorkspaceHooks { + readonly enabled?: boolean; + readonly beforeAll?: EvalWorkspaceHook; + readonly beforeEach?: EvalWorkspaceHook; + readonly afterEach?: EvalWorkspaceHook; + readonly afterAll?: EvalWorkspaceHook; +} + +export interface EvalWorkspaceRepo { + readonly path?: string; + readonly repo?: string; + readonly commit?: string; + readonly baseCommit?: string; + readonly 
ancestor?: number; + readonly sparse?: readonly string[]; +} + +export interface EvalDockerWorkspace { + readonly image: string; + readonly timeout?: number; + readonly memory?: string; + readonly cpus?: number; +} + +export interface EvalWorkspace { + readonly template?: string; + readonly isolation?: 'shared' | 'per_test'; + readonly repos?: readonly EvalWorkspaceRepo[]; + readonly hooks?: EvalWorkspaceHooks; + readonly mode?: 'pooled' | 'temp' | 'static'; + readonly path?: string; + readonly docker?: EvalDockerWorkspace; +} + +export interface EvalTargetRef { + readonly name: string; + readonly useTarget?: string; + readonly hooks?: EvalWorkspaceHooks; +} + +export interface EvalTrials { + readonly count: number; + readonly strategy?: 'pass_at_k' | 'mean' | 'confidence_interval'; + readonly costLimitUsd?: number; +} + +export interface EvalExecution { + readonly target?: string; + readonly targets?: readonly (string | EvalTargetRef)[]; + readonly workers?: number; + readonly assertions?: readonly EvalAssertionConfig[]; + readonly skipDefaults?: boolean; + readonly cache?: boolean; + readonly trials?: EvalTrials; + readonly budgetUsd?: number; + readonly failOnError?: boolean; + readonly threshold?: number; + readonly [key: string]: unknown; +} + +export interface EvalTurn { + readonly input: EvalMessageContent; + readonly expectedOutput?: EvalMessageContent; + readonly assertions?: readonly (string | EvalAssertionConfig)[]; +} + +export interface EvalTest { + readonly id: string; + readonly vars?: Readonly>; + readonly criteria?: string; + readonly input?: string | readonly EvalMessage[]; + readonly inputFiles?: readonly string[]; + readonly expectedOutput?: string | Readonly> | readonly EvalMessage[]; + readonly assertions?: readonly EvalAssertionConfig[]; + readonly execution?: EvalExecution; + readonly workspace?: EvalWorkspace; + readonly metadata?: Readonly>; + readonly conversationId?: string; + readonly suite?: string; + readonly dependsOn?: readonly 
string[]; + readonly onDependencyFailure?: 'skip' | 'fail' | 'run'; + readonly mode?: 'conversation'; + readonly turns?: readonly EvalTurn[]; + readonly aggregation?: 'mean' | 'min' | 'max'; + readonly onTurnFailure?: 'continue' | 'stop'; + readonly windowSize?: number; +} + +export interface EvalRequires { + readonly agentv?: string; + readonly [key: string]: unknown; +} + +export interface EvalDefinition { + readonly $schema?: string; + readonly name?: string; + readonly description?: string; + readonly category?: string; + readonly version?: string; + readonly author?: string; + readonly tags?: readonly string[]; + readonly license?: string; + readonly requires?: EvalRequires; + readonly input?: string | readonly EvalMessage[]; + readonly inputFiles?: readonly string[]; + readonly tests: readonly EvalTest[] | string; + readonly target?: string; + readonly execution?: EvalExecution; + readonly assertions?: readonly EvalAssertionConfig[]; + readonly preprocessors?: readonly EvalPreprocessor[]; + readonly workspace?: EvalWorkspace | string; +} + +export interface DefinedEvalSuite { + readonly [EVAL_SUITE_SYMBOL]: true; + readonly [TO_EVAL_YAML_OBJECT_SYMBOL]: () => Record; +} + +function lowerEvalYamlValue(value: unknown): unknown { + if (Array.isArray(value)) { + return value.map((item) => lowerEvalYamlValue(item)); + } + + if (value && typeof value === 'object') { + const result: Record = {}; + for (const [key, nestedValue] of Object.entries(value)) { + const loweredKey = KNOWN_SNAKE_CASE_KEYS[key as keyof KnownSnakeCaseKeyMap] ?? 
key; + result[loweredKey] = lowerEvalYamlValue(nestedValue); + } + return result; + } + + return value; +} + +function attachEvalSuiteBrand(definition: T): T & DefinedEvalSuite { + const branded = definition as T & Partial; + + if (branded[EVAL_SUITE_SYMBOL] === true) { + return branded as T & DefinedEvalSuite; + } + + Object.defineProperties(branded, { + [EVAL_SUITE_SYMBOL]: { + value: true, + enumerable: false, + configurable: false, + writable: false, + }, + [TO_EVAL_YAML_OBJECT_SYMBOL]: { + value: () => toEvalYamlObject(definition), + enumerable: false, + configurable: false, + writable: false, + }, + }); + + return branded as T & DefinedEvalSuite; +} + +/** + * Define a YAML-aligned eval suite in TypeScript. + * + * The returned object preserves the TypeScript authoring shape and carries a + * non-enumerable lowering hook so AgentV can materialize the canonical + * snake_case eval contract when the suite is loaded from a `.eval.ts` file. + */ +export function defineEval(definition: T): T & DefinedEvalSuite { + return attachEvalSuiteBrand(definition); +} + +/** + * Alias for `defineEval()` when a suite reads more clearly as a plain object. + */ +export function evalSuite(definition: T): T & DefinedEvalSuite { + return defineEval(definition); +} + +/** + * Lower a TypeScript-authored eval suite into the canonical snake_case object + * contract used by YAML files and the runtime loader. + * + * Only known AgentV wire keys are converted. Unknown keys are preserved as-is + * so opaque assertion, provider, and metadata payloads are not corrupted. + */ +export function toEvalYamlObject( + definition: T, +): LowerEvalYamlValue { + return lowerEvalYamlValue(definition) as LowerEvalYamlValue; +} + +/** + * Serialize an eval suite to canonical YAML. 
+ */ +export function serializeEvalYaml( + definition: T, +): string { + return stringifyYaml(toEvalYamlObject(definition), { lineWidth: 0 }).trimEnd(); +} diff --git a/packages/sdk/src/index.ts b/packages/sdk/src/index.ts index dd4d5b5e5..634f933cf 100644 --- a/packages/sdk/src/index.ts +++ b/packages/sdk/src/index.ts @@ -92,6 +92,32 @@ export { type Content, } from './schemas.js'; +// Re-export YAML-aligned eval authoring helpers +export { + defineEval, + evalSuite, + serializeEvalYaml, + toEvalYamlObject, + type DefinedEvalSuite, + type EvalAssertionConfig, + type EvalDefinition, + type EvalDockerWorkspace, + type EvalExecution, + type EvalMessage, + type EvalMessageContent, + type EvalPreprocessor, + type EvalRequires, + type EvalTargetRef, + type EvalTest, + type EvalTrials, + type EvalTurn, + type EvalWorkspace, + type EvalWorkspaceHook, + type EvalWorkspaceHooks, + type EvalWorkspaceRepo, + type LowerEvalYamlValue, +} from './eval.js'; + // Re-export target client export { createTargetClient, diff --git a/packages/sdk/test/eval-authoring.test.ts b/packages/sdk/test/eval-authoring.test.ts new file mode 100644 index 000000000..822c137b5 --- /dev/null +++ b/packages/sdk/test/eval-authoring.test.ts @@ -0,0 +1,192 @@ +import { describe, expect, it } from 'bun:test'; + +import { defineEval, evalSuite, serializeEvalYaml, toEvalYamlObject } from '../src/eval.js'; + +describe('YAML-aligned eval authoring helpers', () => { + it('lowers known AgentV fields to canonical snake_case without broad key rewriting', () => { + const suite = defineEval({ + name: 'sdk-yaml-suite', + inputFiles: ['fixtures/shared-system.md'], + execution: { + targets: [ + { + name: 'mock-target', + useTarget: 'mock_base', + hooks: { + beforeAll: { + command: ['bun', 'run', 'scripts/setup.ts'], + timeoutMs: 30_000, + }, + }, + }, + ], + workers: 2, + skipDefaults: true, + budgetUsd: 1.5, + failOnError: true, + keepWorkspaces: true, + }, + assertions: [ + { + type: 'execution-metrics', + 
maxToolCalls: 3, + maxCostUsd: 0.25, + customThresholdLabel: 'leave-me-alone', + }, + ], + tests: [ + { + id: 'reply-with-hello', + input: 'Say hello.', + inputFiles: ['fixtures/prompt.md'], + expectedOutput: 'Hello there', + workspace: { + hooks: { + beforeEach: { + script: 'git reset --hard', + timeoutMs: 5_000, + }, + afterEach: { + command: ['git', 'status'], + }, + afterAll: { + script: ['echo', 'done'], + }, + }, + }, + mode: 'conversation', + turns: [ + { + input: 'hello?', + expectedOutput: 'hi', + assertions: [ + 'mentions hi', + { + type: 'tool-trajectory', + expected: [ + { + tool: 'Read', + maxDurationMs: 500, + argsMatch: ['path'], + }, + ], + outputPath: 'artifacts/tool-trace.json', + customCamelKey: 'preserve me', + }, + ], + }, + ], + dependsOn: ['setup'], + onDependencyFailure: 'run', + onTurnFailure: 'stop', + windowSize: 2, + }, + ], + }); + + const lowered = toEvalYamlObject(suite); + + expect(suite.execution?.skipDefaults).toBe(true); + expect(lowered).toEqual({ + name: 'sdk-yaml-suite', + input_files: ['fixtures/shared-system.md'], + execution: { + targets: [ + { + name: 'mock-target', + use_target: 'mock_base', + hooks: { + before_all: { + command: ['bun', 'run', 'scripts/setup.ts'], + timeout_ms: 30_000, + }, + }, + }, + ], + workers: 2, + skip_defaults: true, + budget_usd: 1.5, + fail_on_error: true, + keep_workspaces: true, + }, + assertions: [ + { + type: 'execution-metrics', + max_tool_calls: 3, + max_cost_usd: 0.25, + customThresholdLabel: 'leave-me-alone', + }, + ], + tests: [ + { + id: 'reply-with-hello', + input: 'Say hello.', + input_files: ['fixtures/prompt.md'], + expected_output: 'Hello there', + workspace: { + hooks: { + before_each: { + script: 'git reset --hard', + timeout_ms: 5_000, + }, + after_each: { + command: ['git', 'status'], + }, + after_all: { + script: ['echo', 'done'], + }, + }, + }, + mode: 'conversation', + turns: [ + { + input: 'hello?', + expected_output: 'hi', + assertions: [ + 'mentions hi', + { + type: 
'tool-trajectory', + expected: [ + { + tool: 'Read', + max_duration_ms: 500, + args_match: ['path'], + }, + ], + output_path: 'artifacts/tool-trace.json', + customCamelKey: 'preserve me', + }, + ], + }, + ], + depends_on: ['setup'], + on_dependency_failure: 'run', + on_turn_failure: 'stop', + window_size: 2, + }, + ], + }); + }); + + it('serializes canonical YAML and keeps assertions as the durable field', () => { + const suite = evalSuite({ + name: 'yaml-round-trip', + tests: [ + { + id: 'hello', + input: 'Say hello', + expectedOutput: 'Hello', + assertions: [{ type: 'contains', value: 'Hello' }], + }, + ], + }); + + const yaml = serializeEvalYaml(suite); + + expect(yaml).toContain('name: yaml-round-trip'); + expect(yaml).toContain('expected_output: Hello'); + expect(yaml).toContain('assertions:'); + expect(yaml).not.toContain('expectedOutput'); + expect(yaml).not.toContain('inputFiles'); + }); +}); diff --git a/scripts/validate-eval-dirs.ts b/scripts/validate-eval-dirs.ts index 94e4c197d..c435ab2d2 100644 --- a/scripts/validate-eval-dirs.ts +++ b/scripts/validate-eval-dirs.ts @@ -1,8 +1,8 @@ #!/usr/bin/env bun /** * Validates that each feature directory under examples/features/ that has an - * evals/ subdirectory contains at least one *.eval.yaml or *.EVAL.yaml file - * (either inside evals/ or at the feature root). + * evals/ subdirectory contains at least one eval file (either inside evals/ + * or at the feature root). * * Directories without an evals/ subdirectory are skipped — they may be SDK * examples or other non-eval feature demos. @@ -34,9 +34,10 @@ for (const entry of entries) { } // Look for eval files in evals/ (recursive) and at feature root. - // Matches: *.eval.yaml, *.EVAL.yaml, eval.yaml, dataset*.yaml (config default patterns) + // Matches config default patterns, including YAML and TypeScript eval suites. 
const evalPatterns = [ '**/*.{eval.yaml,eval.yml,EVAL.yaml,EVAL.yml}', + '**/*.{eval.ts,eval.mts}', '**/eval.{yaml,yml}', '**/dataset*.{yaml,yml}', ]; @@ -52,7 +53,7 @@ for (const entry of entries) { if (errors.length > 0) { console.error( - 'The following evals/ directories contain no eval files (*.eval.yaml or *.EVAL.yaml):', + 'The following evals/ directories contain no eval files (*.eval.yaml, *.EVAL.yaml, or *.eval.ts):', ); for (const e of errors) console.error(` - ${e}`); process.exit(1);