diff --git a/README.md b/README.md index 4300bd9d1..87e3b0ceb 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ cat ./run/index.jsonl # JSONL results for scripts/CI Use AgentV programmatically: ```typescript -import { evaluate } from '@agentv/core'; +import { evaluate } from '@agentv/sdk'; const { results, summary } = await evaluate({ tests: [ diff --git a/apps/web/src/content/docs/docs/evaluation/sdk.mdx b/apps/web/src/content/docs/docs/evaluation/sdk.mdx index fdd7a34e5..b846b2f5c 100644 --- a/apps/web/src/content/docs/docs/evaluation/sdk.mdx +++ b/apps/web/src/content/docs/docs/evaluation/sdk.mdx @@ -9,16 +9,16 @@ YAML remains AgentV's canonical, portable eval format. The SDK surfaces below ar AgentV currently provides two npm packages for programmatic use: -- **`@agentv/sdk`** — YAML-aligned eval authoring, custom assertions, and code graders -- **`@agentv/core`** — programmatic evaluation API and typed configuration +- **`@agentv/sdk`** — user-facing SDK for `evaluate()`, YAML-aligned eval authoring, custom assertions, and code graders +- **`@agentv/core`** — core implementation package and typed configuration ## Installation ```bash -# Lightweight SDK (defineEval, graders, defineAssertion, defineCodeGrader) +# User-facing SDK (evaluate, defineEval, graders, defineAssertion, defineCodeGrader) npm install @agentv/sdk -# Programmatic API (evaluate, defineConfig) +# Core configuration helpers (defineConfig) npm install @agentv/core ``` @@ -335,12 +335,12 @@ Raw grader stdin uses `snake_case` because it crosses a process boundary and may ## Programmatic API -Use `evaluate()` from `@agentv/core` to run evaluations as a library. The most portable pattern is still to keep the suite in YAML and point `specFile` at it; inline tests are best when the eval is tightly coupled to application code. +Use `evaluate()` from `@agentv/sdk` to run evaluations as a library. 
The implementation is owned by `@agentv/core`, but the SDK re-exports it as the user-facing entrypoint. The most portable pattern is still to keep the suite in YAML and point `specFile` at it; inline tests are best when the eval is tightly coupled to application code. ### Inline Test Definitions ```typescript -import { evaluate } from '@agentv/core'; +import { evaluate } from '@agentv/sdk'; const { results, summary } = await evaluate({ tests: [ @@ -363,7 +363,7 @@ Auto-discovers the `default` target from `.agentv/targets.yaml` and `.env` crede Point to an existing YAML eval instead of inlining tests: ```typescript -import { evaluate } from '@agentv/core'; +import { evaluate } from '@agentv/sdk'; const { results, summary } = await evaluate({ specFile: './evals/my-eval.eval.yaml', diff --git a/bun.lock b/bun.lock index 3e197e5be..6f032ef7c 100644 --- a/bun.lock +++ b/bun.lock @@ -137,6 +137,7 @@ "name": "@agentv/sdk", "version": "4.42.4", "dependencies": { + "@agentv/core": "workspace:*", "yaml": "^2.8.3", "zod": "^3.23.8", }, diff --git a/examples/features/sdk-programmatic-api-advanced/evaluate.ts b/examples/features/sdk-programmatic-api-advanced/evaluate.ts index d1e3ac64f..7fdf05338 100644 --- a/examples/features/sdk-programmatic-api-advanced/evaluate.ts +++ b/examples/features/sdk-programmatic-api-advanced/evaluate.ts @@ -6,7 +6,7 @@ * * Run: bun run evaluate.ts */ -import { evaluate } from '@agentv/core'; +import { evaluate } from '@agentv/sdk'; const { results, summary } = await evaluate({ // Run a setup command before the suite starts diff --git a/examples/features/sdk-programmatic-api-advanced/package.json b/examples/features/sdk-programmatic-api-advanced/package.json index 8311e3fee..e222f311d 100644 --- a/examples/features/sdk-programmatic-api-advanced/package.json +++ b/examples/features/sdk-programmatic-api-advanced/package.json @@ -3,6 +3,7 @@ "private": true, "type": "module", "dependencies": { - "@agentv/core": "file:../../../packages/core" + 
"@agentv/core": "file:../../../packages/core", + "@agentv/sdk": "file:../../../packages/sdk" } } diff --git a/examples/features/sdk-programmatic-api/README.md b/examples/features/sdk-programmatic-api/README.md index d1e725801..304f9293d 100644 --- a/examples/features/sdk-programmatic-api/README.md +++ b/examples/features/sdk-programmatic-api/README.md @@ -1,10 +1,10 @@ # SDK Example: Programmatic API -Demonstrates using `evaluate()` from `@agentv/core` to run evaluations as a library when the eval definition belongs in TypeScript. The config mirrors the canonical YAML surface, but uses programmatic names such as `expectedOutput` and `assert`. +Demonstrates using `evaluate()` from `@agentv/sdk` to run evaluations as a library when the eval definition belongs in TypeScript. The config mirrors the canonical YAML surface, but uses programmatic names such as `expectedOutput` and `assert`. ## What It Does -1. Imports `evaluate()` from `@agentv/core` +1. Imports `evaluate()` from `@agentv/sdk` 2. Defines tests inline with `assert` 3. Runs the evaluation and prints summary statistics 4. Writes canonical AgentV run artifacts under `.agentv/results/runs/...` diff --git a/examples/features/sdk-programmatic-api/evaluate.ts b/examples/features/sdk-programmatic-api/evaluate.ts index 1ae5b0e98..107b77ff9 100644 --- a/examples/features/sdk-programmatic-api/evaluate.ts +++ b/examples/features/sdk-programmatic-api/evaluate.ts @@ -1,13 +1,13 @@ /** * Programmatic API Example * - * Uses evaluate() from @agentv/core to run evaluations as a library. + * Uses evaluate() from @agentv/sdk to run evaluations as a library. * The inline config mirrors the canonical YAML surface with TypeScript-friendly names. 
* * Run: bun run evaluate.ts * (Uses 'default' target from .agentv/targets.yaml and .env credentials) */ -import { evaluate } from '@agentv/core'; +import { evaluate } from '@agentv/sdk'; const { results, summary } = await evaluate({ tests: [ diff --git a/examples/features/sdk-programmatic-api/package.json b/examples/features/sdk-programmatic-api/package.json index be1bd6fa7..f0040259d 100644 --- a/examples/features/sdk-programmatic-api/package.json +++ b/examples/features/sdk-programmatic-api/package.json @@ -3,6 +3,7 @@ "private": true, "type": "module", "dependencies": { - "@agentv/core": "file:../../../packages/core" + "@agentv/core": "file:../../../packages/core", + "@agentv/sdk": "file:../../../packages/sdk" } } diff --git a/packages/core/test/evaluation/graders.test.ts b/packages/core/test/evaluation/graders.test.ts index 1db1b9c3f..713e32030 100644 --- a/packages/core/test/evaluation/graders.test.ts +++ b/packages/core/test/evaluation/graders.test.ts @@ -1038,7 +1038,7 @@ describe('CodeGrader', () => { expect(failedAssertions[0].text).toContain('test-error'); }); - it('works with defineCodeGrader-based code grader', async () => { + it('works with code grader stdin/stdout contract', async () => { const graderProvider = new StubProvider(textResponse('Logging improvements applied')); const __dirname = dirname(fileURLToPath(import.meta.url)); diff --git a/packages/core/test/evaluation/graders/prompt-resolution.test.ts b/packages/core/test/evaluation/graders/prompt-resolution.test.ts index dcc20da61..d5464e7a3 100644 --- a/packages/core/test/evaluation/graders/prompt-resolution.test.ts +++ b/packages/core/test/evaluation/graders/prompt-resolution.test.ts @@ -2,7 +2,7 @@ import { describe, expect, it } from 'bun:test'; import { mkdtempSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; -import { fileURLToPath, pathToFileURL } from 'node:url'; +import { fileURLToPath } from 'node:url'; import { 
containsTemplateVariables, @@ -90,15 +90,12 @@ describe('resolveCustomPrompt', () => { it('passes final answer as output and transcript through messages/trace to executable prompts', async () => { const tmpDir = mkdtempSync(path.join(tmpdir(), 'prompt-template-contract-')); const promptPath = path.join(tmpDir, 'prompt-template.ts'); - const promptTemplateRuntime = pathToFileURL( - path.resolve(__dirname, '../../../../sdk/src/prompt-template.ts'), - ).href; writeFileSync( promptPath, - `import { definePromptTemplate } from ${JSON.stringify(promptTemplateRuntime)}; + `import { readFileSync } from 'node:fs'; -definePromptTemplate((ctx) => { +const ctx = JSON.parse(readFileSync(0, 'utf8')); if (typeof ctx.output !== 'string') { throw new Error('expected output to be the final answer string'); } @@ -115,8 +112,7 @@ definePromptTemplate((ctx) => { throw new Error('expected full trace with transcript messages'); } - return \`Final: \${ctx.output}; messages: \${ctx.messages.length}; trace: \${ctx.trace.messages.length}\`; -}); +console.log(\`Final: \${ctx.output}; messages: \${ctx.messages.length}; trace: \${ctx.trace.messages.length}\`); `, ); diff --git a/packages/core/test/evaluation/loaders/fixtures/sdk-define-eval.eval.ts b/packages/core/test/evaluation/loaders/fixtures/sdk-define-eval.eval.ts index 60f95f8db..ab6a7f0ab 100644 --- a/packages/core/test/evaluation/loaders/fixtures/sdk-define-eval.eval.ts +++ b/packages/core/test/evaluation/loaders/fixtures/sdk-define-eval.eval.ts @@ -1,6 +1,7 @@ -import { defineEval } from '../../../../../sdk/src/index.ts'; +const EVAL_SUITE_SYMBOL = Symbol.for('@agentv/sdk/eval-suite'); +const TO_EVAL_YAML_OBJECT_SYMBOL = Symbol.for('@agentv/sdk/to-eval-yaml-object'); -export default defineEval({ +const suite = { name: 'sdk-define-eval-suite', description: 'YAML-aligned TypeScript suite authored with @agentv/sdk', tags: ['sdk', 'typescript', 'yaml'], @@ -34,4 +35,49 @@ export default defineEval({ }, }, ], +}; + +export default 
Object.defineProperties(suite, { + [EVAL_SUITE_SYMBOL]: { + value: true, + enumerable: false, + }, + [TO_EVAL_YAML_OBJECT_SYMBOL]: { + value: () => ({ + name: suite.name, + description: suite.description, + tags: suite.tags, + execution: { + targets: ['mock-target'], + workers: 2, + skip_defaults: true, + budget_usd: 2, + threshold: 0.75, + }, + workspace: { + hooks: { + before_all: { + command: ['echo', 'suite-setup'], + }, + }, + }, + tests: [ + { + id: 'sdk-define-eval', + input: 'Say hello', + expected_output: 'hello there', + assertions: [{ type: 'contains', value: 'hello' }], + workspace: { + hooks: { + before_each: { + command: ['echo', 'case-setup'], + timeout_ms: 1_000, + }, + }, + }, + }, + ], + }), + enumerable: false, + }, }); diff --git a/packages/core/test/fixtures/test-define-grader.ts b/packages/core/test/fixtures/test-define-grader.ts index 6ba916c25..09b74f280 100644 --- a/packages/core/test/fixtures/test-define-grader.ts +++ b/packages/core/test/fixtures/test-define-grader.ts @@ -1,34 +1,35 @@ #!/usr/bin/env bun /** - * Test fixture for defineCodeGrader integration test. + * Test fixture for the code-grader stdin/stdout contract. */ -import { defineCodeGrader } from '../../../sdk/src/index.js'; +import { readFileSync } from 'node:fs'; -export default defineCodeGrader(({ output, criteria }) => { - const assertions: { text: string; passed: boolean }[] = []; +const input = JSON.parse(readFileSync(0, 'utf8')) as { + readonly output?: string | null; + readonly criteria?: string; +}; - // `output` is the final answer/scored result. Transcript-aware graders should - // use messages/trace instead. - const candidateText = output ?? ''; +const assertions: { text: string; passed: boolean }[] = []; - // Simple check: does candidate mention the criteria keywords? - const outcomeWords = criteria.toLowerCase().split(/\s+/); - const candidateWords = candidateText.toLowerCase().split(/\s+/); +// `output` is the final answer/scored result. 
Transcript-aware graders should +// use messages/trace instead. +const candidateText = input.output ?? ''; +const criteria = input.criteria ?? ''; - for (const word of outcomeWords) { - if (word.length > 3 && candidateWords.includes(word)) { - assertions.push({ text: `Contains keyword: ${word}`, passed: true }); - } - } +// Simple check: does candidate mention the criteria keywords? +const outcomeWords = criteria.toLowerCase().split(/\s+/); +const candidateWords = candidateText.toLowerCase().split(/\s+/); - if (assertions.length === 0) { - assertions.push({ text: 'No matching keywords found', passed: false }); +for (const word of outcomeWords) { + if (word.length > 3 && candidateWords.includes(word)) { + assertions.push({ text: `Contains keyword: ${word}`, passed: true }); } +} + +if (assertions.length === 0) { + assertions.push({ text: 'No matching keywords found', passed: false }); +} - const score = assertions.some((a) => a.passed) ? 1.0 : 0.0; +const score = assertions.some((a) => a.passed) ? 1.0 : 0.0; - return { - score, - assertions, - }; -}); +console.log(JSON.stringify({ score, assertions }, null, 2)); diff --git a/packages/sdk/README.md b/packages/sdk/README.md index dc4b67a03..3b2e7aca3 100644 --- a/packages/sdk/README.md +++ b/packages/sdk/README.md @@ -1,6 +1,6 @@ # @agentv/sdk -Public lightweight SDK for AgentV - build YAML-aligned eval suites, custom graders, and prompt templates around the canonical AgentV eval model. +Public lightweight SDK for AgentV - run evaluations programmatically, build YAML-aligned eval suites, and write custom graders and prompt templates around the canonical AgentV eval model. 
## Installation @@ -25,6 +25,38 @@ import { defineCodeGrader } from '@agentv/sdk'; ## Quick Start +### evaluate (programmatic runs) + +```typescript +import { evaluate } from '@agentv/sdk'; + +const { results, summary } = await evaluate({ + tests: [ + { + id: 'greeting', + input: 'Say hello', + expectedOutput: 'Hello there!', + assert: [{ type: 'contains', value: 'Hello' }], + }, + ], + task: async (input) => `Hello from: ${input}`, +}); + +console.log(`${summary.passed}/${summary.total} passed`); +``` + +Use `specFile` when you want library control around an existing YAML suite: + +```typescript +import { evaluate } from '@agentv/sdk'; + +const { summary } = await evaluate({ + specFile: './evals/my-eval.eval.yaml', +}); +``` + +The `evaluate()` implementation is owned by `@agentv/core`; `@agentv/sdk` re-exports it as the user-facing SDK entrypoint. + ### defineAssertion (simplest way) ```typescript @@ -197,6 +229,7 @@ Python workflows should emit canonical YAML/JSONL or implement code graders over ## Exports +- `evaluate(config)` - Run evaluations programmatically from inline tests or an eval spec file - `defineAssertion(handler)` - Define a custom assertion (pass/fail + optional score) - `defineCodeGrader(handler)` - Define a code grader (full score control) - `defineVitestWorkspaceGrader(options)` - Embed the Vitest workspace verifier adapter in a custom script @@ -206,6 +239,7 @@ Python workflows should emit canonical YAML/JSONL or implement code graders over - `graders` - Catalog of built-in AgentV grader config helpers - `containsGrader`, `equalsGrader`, `exactGrader`, `regexGrader`, `isJsonGrader`, `jsonGrader`, `rubricsGrader`, `llmGrader`, `codeGrader` - Named grader helper functions - `toEvalYamlObject(definition)` / `serializeEvalYaml(definition)` - Lower or serialize canonical eval YAML +- `EvalConfig`, `EvalRunResult`, `EvalSummary`, `EvalTestInput`, `EvalAssertionInput` - Programmatic evaluation types - `AssertionContext`, `AssertionScore` - Assertion 
types - `CodeGraderInput`, `CodeGraderResult`, `Workspace`, `WorkspaceAssertion` - Grader types - `TraceSummary`, `Message`, `ToolCall` - Trace data types diff --git a/packages/sdk/package.json b/packages/sdk/package.json index db9734198..0df0924d7 100644 --- a/packages/sdk/package.json +++ b/packages/sdk/package.json @@ -22,16 +22,18 @@ }, "scripts": { "prepublishOnly": "node -e \"if(process.env.ALLOW_PUBLISH!=='1'){console.error('ERROR: Use bun run publish:next, then bun run promote:latest');process.exit(1)}\"", - "build": "tsup", + "build:deps": "bun --cwd ../.. --filter @agentv/core build", + "build": "bun run build:deps && tsup", "dev": "tsup --watch", - "typecheck": "tsc --noEmit", + "typecheck": "bun run build:deps && tsc --noEmit", "lint": "biome check .", "format": "biome format --write .", "fix": "biome check --write .", - "test": "bun test" + "test": "bun run build:deps && bun test" }, "files": ["dist", "README.md"], "dependencies": { + "@agentv/core": "4.42.4", "yaml": "^2.8.3", "zod": "^3.23.8" } diff --git a/packages/sdk/src/index.ts b/packages/sdk/src/index.ts index b9b4c15dc..c752a5ed7 100644 --- a/packages/sdk/src/index.ts +++ b/packages/sdk/src/index.ts @@ -117,6 +117,18 @@ export { } from './schemas.js'; // Re-export YAML-aligned eval authoring helpers +export { + evaluate, + type AssertEntry, + type ConversationTurnInput, + type EvalAssertionInput, + type EvalConfig, + type EvalRunArtifacts, + type EvalRunResult, + type EvalSummary, + type EvalTestInput, +} from '@agentv/core'; + export { defineEval, evalSuite, diff --git a/packages/sdk/test/evaluate-export.test.ts b/packages/sdk/test/evaluate-export.test.ts new file mode 100644 index 000000000..d4dad0f65 --- /dev/null +++ b/packages/sdk/test/evaluate-export.test.ts @@ -0,0 +1,22 @@ +import { describe, expect, it } from 'bun:test'; + +import { evaluate } from '../src/index.js'; + +describe('evaluate export', () => { + it('runs the core programmatic evaluate API through @agentv/sdk', async () => 
{ + const { results, summary } = await evaluate({ + tests: [ + { + id: 'sdk-evaluate-export', + input: 'Say hello', + assert: [{ type: 'contains', value: 'hello' }], + }, + ], + task: async (input) => `hello: ${input}`, + }); + + expect(summary.total).toBe(1); + expect(summary.passed).toBe(1); + expect(results[0].testId).toBe('sdk-evaluate-export'); + }); +}); diff --git a/packages/sdk/test/package-graph.test.ts b/packages/sdk/test/package-graph.test.ts new file mode 100644 index 000000000..55177694b --- /dev/null +++ b/packages/sdk/test/package-graph.test.ts @@ -0,0 +1,53 @@ +import { describe, expect, it } from 'bun:test'; +import { readFileSync, readdirSync, statSync } from 'node:fs'; +import path from 'node:path'; + +const repoRoot = path.resolve(import.meta.dir, '../../..'); + +/** Reads a repo-relative package.json, typed to only the fields these tests inspect. */ +function readJson(filePath: string) { + return JSON.parse(readFileSync(path.join(repoRoot, filePath), 'utf8')) as { + readonly version?: string; + readonly dependencies?: Record<string, string>; + readonly devDependencies?: Record<string, string>; + }; +} + +function listFiles(dir: string): string[] { + const absoluteDir = path.join(repoRoot, dir); + return readdirSync(absoluteDir).flatMap((entry) => { + const absolutePath = path.join(absoluteDir, entry); + const relativePath = path.relative(repoRoot, absolutePath); + return statSync(absolutePath).isDirectory() ?
listFiles(relativePath) : [relativePath]; + }); +} + +function importSpecifiers(source: string): string[] { + return [...source.matchAll(/\b(?:import|export)\b[\s\S]*?\bfrom\s+['"]([^'"]+)['"]/g)].map( + (match) => match[1], + ); +} + +describe('core/sdk package graph', () => { + it('keeps runtime package dependencies acyclic', () => { + const sdkPackage = readJson('packages/sdk/package.json'); + const corePackage = readJson('packages/core/package.json'); + + expect(sdkPackage.dependencies?.['@agentv/core']).toBe(corePackage.version); + expect(corePackage.dependencies?.['@agentv/sdk']).toBeUndefined(); + expect(corePackage.devDependencies?.['@agentv/sdk']).toBeUndefined(); + }); + + it('keeps core source and tests from importing sdk source', () => { + const offenders = [...listFiles('packages/core/src'), ...listFiles('packages/core/test')] + .filter((filePath) => /\.(?:ts|tsx|js|mjs|cjs)$/.test(filePath)) + .filter((filePath) => { + const source = readFileSync(path.join(repoRoot, filePath), 'utf8'); + return importSpecifiers(source).some( + (specifier) => specifier === '@agentv/sdk' || specifier.includes('/sdk/src/'), + ); + }); + + expect(offenders).toEqual([]); + }); +}); diff --git a/scripts/release.ts b/scripts/release.ts index 29bf93175..df7df8106 100644 --- a/scripts/release.ts +++ b/scripts/release.ts @@ -44,6 +44,7 @@ const PRIMARY_PACKAGE = 'apps/cli/package.json'; interface PackageJson { name: string; version: string; + dependencies?: Record<string, string>; // package name -> version specifier [key: string]: unknown; } @@ -69,6 +70,12 @@ function writePackageJson(path: string, pkg: PackageJson): void { writeFileSync(path, `${JSON.stringify(pkg, null, 2)}\n`, 'utf-8'); } +function syncInternalRuntimeDependencies(pkg: PackageJson, version: string): void { + if (pkg.name === '@agentv/sdk' && pkg.dependencies?.['@agentv/core']) { + pkg.dependencies['@agentv/core'] = version; + } +} + function bumpVersion(currentVersion: string, bumpType: BumpType): string { const stablePart = currentVersion.split('-')[0];
const parts = stablePart.split('.').map(Number); @@ -401,6 +408,7 @@ async function main() { const pkg = readPackageJson(fullPath); const oldVersion = pkg.version; pkg.version = newVersion; + syncInternalRuntimeDependencies(pkg, newVersion); writePackageJson(fullPath, pkg); console.log(` ✓ ${pkg.name}: ${oldVersion} → ${newVersion}`); }