EntityProcess · christso · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/README.md b/README.md
@@ -88,7 +88,7 @@ cat ./run/index.jsonl                         # JSONL results for scripts/CI
 Use AgentV programmatically:
 
 ```typescript
-import { evaluate } from '@agentv/core';
+import { evaluate } from '@agentv/sdk';
 
 const { results, summary } = await evaluate({
   tests: [

diff --git a/apps/web/src/content/docs/docs/evaluation/sdk.mdx b/apps/web/src/content/docs/docs/evaluation/sdk.mdx
@@ -9,16 +9,16 @@ YAML remains AgentV's canonical, portable eval format. The SDK surfaces below ar
 
 AgentV currently provides two npm packages for programmatic use:
 
-- **`@agentv/sdk`** — YAML-aligned eval authoring, custom assertions, and code graders
-- **`@agentv/core`** — programmatic evaluation API and typed configuration
+- **`@agentv/sdk`** — user-facing SDK for `evaluate()`, YAML-aligned eval authoring, custom assertions, and code graders
+- **`@agentv/core`** — core implementation package and typed configuration helpers (`defineConfig`)
 
 ## Installation
 
 ```bash
-# Lightweight SDK (defineEval, graders, defineAssertion, defineCodeGrader)
+# User-facing SDK (evaluate, defineEval, graders, defineAssertion, defineCodeGrader)
 npm install @agentv/sdk
 
-# Programmatic API (evaluate, defineConfig)
+# Core configuration helpers (defineConfig)
 npm install @agentv/core
 ```
 
@@ -335,12 +335,12 @@ Raw grader stdin uses `snake_case` because it crosses a process boundary and may
 
 ## Programmatic API
 
-Use `evaluate()` from `@agentv/core` to run evaluations as a library. The most portable pattern is still to keep the suite in YAML and point `specFile` at it; inline tests are best when the eval is tightly coupled to application code.
+Use `evaluate()` from `@agentv/sdk` to run evaluations as a library. The implementation is owned by `@agentv/core`, but the SDK re-exports it as the user-facing entrypoint. The most portable pattern is still to keep the suite in YAML and point `specFile` at it; inline tests are best when the eval is tightly coupled to application code.
 
 ### Inline Test Definitions
 
 ```typescript
-import { evaluate } from '@agentv/core';
+import { evaluate } from '@agentv/sdk';
 
 const { results, summary } = await evaluate({
   tests: [
@@ -363,7 +363,7 @@ Auto-discovers the `default` target from `.agentv/targets.yaml` and `.env` crede
 Point to an existing YAML eval instead of inlining tests:
 
 ```typescript
-import { evaluate } from '@agentv/core';
+import { evaluate } from '@agentv/sdk';
 
 const { results, summary } = await evaluate({
   specFile: './evals/my-eval.eval.yaml',

diff --git a/bun.lock b/bun.lock
diff --git a/examples/features/sdk-programmatic-api-advanced/evaluate.ts b/examples/features/sdk-programmatic-api-advanced/evaluate.ts
@@ -6,7 +6,7 @@
  *
  * Run: bun run evaluate.ts
  */
-import { evaluate } from '@agentv/core';
+import { evaluate } from '@agentv/sdk';
 
 const { results, summary } = await evaluate({
   // Run a setup command before the suite starts

diff --git a/examples/features/sdk-programmatic-api-advanced/package.json b/examples/features/sdk-programmatic-api-advanced/package.json
@@ -3,6 +3,7 @@
   "private": true,
   "type": "module",
   "dependencies": {
-    "@agentv/core": "file:../../../packages/core"
+    "@agentv/core": "file:../../../packages/core",
+    "@agentv/sdk": "file:../../../packages/sdk"
   }
 }
diff --git a/examples/features/sdk-programmatic-api/README.md b/examples/features/sdk-programmatic-api/README.md
@@ -1,10 +1,10 @@
 # SDK Example: Programmatic API
 
-Demonstrates using `evaluate()` from `@agentv/core` to run evaluations as a library when the eval definition belongs in TypeScript. The config mirrors the canonical YAML surface, but uses programmatic names such as `expectedOutput` and `assert`.
+Demonstrates using `evaluate()` from `@agentv/sdk` to run evaluations as a library when the eval definition belongs in TypeScript. The config mirrors the canonical YAML surface, but uses programmatic names such as `expectedOutput` and `assert`.
 
 ## What It Does
 
-1. Imports `evaluate()` from `@agentv/core`
+1. Imports `evaluate()` from `@agentv/sdk`
 2. Defines tests inline with `assert`
 3. Runs the evaluation and prints summary statistics
 4. Writes canonical AgentV run artifacts under `.agentv/results/runs/...`

diff --git a/examples/features/sdk-programmatic-api/evaluate.ts b/examples/features/sdk-programmatic-api/evaluate.ts
@@ -1,13 +1,13 @@
 /**
  * Programmatic API Example
  *
- * Uses evaluate() from @agentv/core to run evaluations as a library.
+ * Uses evaluate() from @agentv/sdk to run evaluations as a library.
  * The inline config mirrors the canonical YAML surface with TypeScript-friendly names.
  *
  * Run: bun run evaluate.ts
  * (Uses 'default' target from .agentv/targets.yaml and .env credentials)
  */
-import { evaluate } from '@agentv/core';
+import { evaluate } from '@agentv/sdk';
 
 const { results, summary } = await evaluate({
   tests: [

diff --git a/examples/features/sdk-programmatic-api/package.json b/examples/features/sdk-programmatic-api/package.json
@@ -3,6 +3,7 @@
   "private": true,
   "type": "module",
   "dependencies": {
-    "@agentv/core": "file:../../../packages/core"
+    "@agentv/core": "file:../../../packages/core",
+    "@agentv/sdk": "file:../../../packages/sdk"
   }
 }
diff --git a/packages/core/test/evaluation/graders.test.ts b/packages/core/test/evaluation/graders.test.ts
@@ -1038,7 +1038,7 @@ describe('CodeGrader', () => {
     expect(failedAssertions[0].text).toContain('test-error');
   });
 
-  it('works with defineCodeGrader-based code grader', async () => {
+  it('works with code grader stdin/stdout contract', async () => {
     const graderProvider = new StubProvider(textResponse('Logging improvements applied'));
 
     const __dirname = dirname(fileURLToPath(import.meta.url));

diff --git a/packages/core/test/evaluation/graders/prompt-resolution.test.ts b/packages/core/test/evaluation/graders/prompt-resolution.test.ts
@@ -2,7 +2,7 @@ import { describe, expect, it } from 'bun:test';
 import { mkdtempSync, writeFileSync } from 'node:fs';
 import { tmpdir } from 'node:os';
 import path from 'node:path';
-import { fileURLToPath, pathToFileURL } from 'node:url';
+import { fileURLToPath } from 'node:url';
 
 import {
   containsTemplateVariables,
@@ -90,15 +90,12 @@ describe('resolveCustomPrompt', () => {
   it('passes final answer as output and transcript through messages/trace to executable prompts', async () => {
     const tmpDir = mkdtempSync(path.join(tmpdir(), 'prompt-template-contract-'));
     const promptPath = path.join(tmpDir, 'prompt-template.ts');
-    const promptTemplateRuntime = pathToFileURL(
-      path.resolve(__dirname, '../../../../sdk/src/prompt-template.ts'),
-    ).href;
 
     writeFileSync(
       promptPath,
-      `import { definePromptTemplate } from ${JSON.stringify(promptTemplateRuntime)};
+      `import { readFileSync } from 'node:fs';
 
-definePromptTemplate((ctx) => {
+const ctx = JSON.parse(readFileSync(0, 'utf8'));
   if (typeof ctx.output !== 'string') {
     throw new Error('expected output to be the final answer string');
   }
@@ -115,8 +112,7 @@ definePromptTemplate((ctx) => {
     throw new Error('expected full trace with transcript messages');
   }
 
-  return \`Final: \${ctx.output}; messages: \${ctx.messages.length}; trace: \${ctx.trace.messages.length}\`;
-});
+console.log(\`Final: \${ctx.output}; messages: \${ctx.messages.length}; trace: \${ctx.trace.messages.length}\`);
 `,
     );
 

diff --git a/packages/core/test/evaluation/loaders/fixtures/sdk-define-eval.eval.ts b/packages/core/test/evaluation/loaders/fixtures/sdk-define-eval.eval.ts
@@ -1,6 +1,7 @@
-import { defineEval } from '../../../../../sdk/src/index.ts';
+const EVAL_SUITE_SYMBOL = Symbol.for('@agentv/sdk/eval-suite');
+const TO_EVAL_YAML_OBJECT_SYMBOL = Symbol.for('@agentv/sdk/to-eval-yaml-object');
 
-export default defineEval({
+const suite = {
   name: 'sdk-define-eval-suite',
   description: 'YAML-aligned TypeScript suite authored with @agentv/sdk',
   tags: ['sdk', 'typescript', 'yaml'],
@@ -34,4 +35,49 @@ export default defineEval({
       },
     },
   ],
+};
+
+export default Object.defineProperties(suite, {
+  [EVAL_SUITE_SYMBOL]: {
+    value: true,
+    enumerable: false,
+  },
+  [TO_EVAL_YAML_OBJECT_SYMBOL]: {
+    value: () => ({
+      name: suite.name,
+      description: suite.description,
+      tags: suite.tags,
+      execution: {
+        targets: ['mock-target'],
+        workers: 2,
+        skip_defaults: true,
+        budget_usd: 2,
+        threshold: 0.75,
+      },
+      workspace: {
+        hooks: {
+          before_all: {
+            command: ['echo', 'suite-setup'],
+          },
+        },
+      },
+      tests: [
+        {
+          id: 'sdk-define-eval',
+          input: 'Say hello',
+          expected_output: 'hello there',
+          assertions: [{ type: 'contains', value: 'hello' }],
+          workspace: {
+            hooks: {
+              before_each: {
+                command: ['echo', 'case-setup'],
+                timeout_ms: 1_000,
+              },
+            },
+          },
+        },
+      ],
+    }),
+    enumerable: false,
+  },
 });
diff --git a/packages/core/test/fixtures/test-define-grader.ts b/packages/core/test/fixtures/test-define-grader.ts
@@ -1,34 +1,35 @@
 #!/usr/bin/env bun
 /**
- * Test fixture for defineCodeGrader integration test.
+ * Test fixture for the code-grader stdin/stdout contract.
  */
-import { defineCodeGrader } from '../../../sdk/src/index.js';
+import { readFileSync } from 'node:fs';
 
-export default defineCodeGrader(({ output, criteria }) => {
-  const assertions: { text: string; passed: boolean }[] = [];
+const input = JSON.parse(readFileSync(0, 'utf8')) as {
+  readonly output?: string | null;
+  readonly criteria?: string;
+};
 
-  // `output` is the final answer/scored result. Transcript-aware graders should
-  // use messages/trace instead.
-  const candidateText = output ?? '';
+const assertions: { text: string; passed: boolean }[] = [];
 
-  // Simple check: does candidate mention the criteria keywords?
-  const outcomeWords = criteria.toLowerCase().split(/\s+/);
-  const candidateWords = candidateText.toLowerCase().split(/\s+/);
+// `output` is the final answer/scored result. Transcript-aware graders should
+// use messages/trace instead.
+const candidateText = input.output ?? '';
+const criteria = input.criteria ?? '';
 
-  for (const word of outcomeWords) {
-    if (word.length > 3 && candidateWords.includes(word)) {
-      assertions.push({ text: `Contains keyword: ${word}`, passed: true });
-    }
-  }
+// Simple check: does candidate mention the criteria keywords?
+const outcomeWords = criteria.toLowerCase().split(/\s+/);
+const candidateWords = candidateText.toLowerCase().split(/\s+/);
 
-  if (assertions.length === 0) {
-    assertions.push({ text: 'No matching keywords found', passed: false });
+for (const word of outcomeWords) {
+  if (word.length > 3 && candidateWords.includes(word)) {
+    assertions.push({ text: `Contains keyword: ${word}`, passed: true });
   }
+}
+
+if (assertions.length === 0) {
+  assertions.push({ text: 'No matching keywords found', passed: false });
+}
 
-  const score = assertions.some((a) => a.passed) ? 1.0 : 0.0;
+const score = assertions.some((a) => a.passed) ? 1.0 : 0.0;
 
-  return {
-    score,
-    assertions,
-  };
-});
+console.log(JSON.stringify({ score, assertions }, null, 2));
diff --git a/packages/sdk/README.md b/packages/sdk/README.md
@@ -1,6 +1,6 @@
 # @agentv/sdk
 
-Public lightweight SDK for AgentV - build YAML-aligned eval suites, custom graders, and prompt templates around the canonical AgentV eval model.
+Public lightweight SDK for AgentV - run evaluations programmatically, build YAML-aligned eval suites, and write custom graders and prompt templates around the canonical AgentV eval model.
 
 ## Installation
 
@@ -25,6 +25,38 @@ import { defineCodeGrader } from '@agentv/sdk';
 
 ## Quick Start
 
+### evaluate (programmatic runs)
+
+```typescript
+import { evaluate } from '@agentv/sdk';
+
+const { results, summary } = await evaluate({
+  tests: [
+    {
+      id: 'greeting',
+      input: 'Say hello',
+      expectedOutput: 'Hello there!',
+      assert: [{ type: 'contains', value: 'Hello' }],
+    },
+  ],
+  task: async (input) => `Hello from: ${input}`,
+});
+
+console.log(`${summary.passed}/${summary.total} passed`);
+```
+
+Use `specFile` when you want library control around an existing YAML suite:
+
+```typescript
+import { evaluate } from '@agentv/sdk';
+
+const { summary } = await evaluate({
+  specFile: './evals/my-eval.eval.yaml',
+});
+```
+
+The `evaluate()` implementation is owned by `@agentv/core`; `@agentv/sdk` re-exports it as the user-facing SDK entrypoint.
+
 ### defineAssertion (simplest way)
 
 ```typescript
@@ -197,6 +229,7 @@ Python workflows should emit canonical YAML/JSONL or implement code graders over
 
 ## Exports
 
+- `evaluate(config)` - Run evaluations programmatically from inline tests or an eval spec file
 - `defineAssertion(handler)` - Define a custom assertion (pass/fail + optional score)
 - `defineCodeGrader(handler)` - Define a code grader (full score control)
 - `defineVitestWorkspaceGrader(options)` - Embed the Vitest workspace verifier adapter in a custom script
@@ -206,6 +239,7 @@ Python workflows should emit canonical YAML/JSONL or implement code graders over
 - `graders` - Catalog of built-in AgentV grader config helpers
 - `containsGrader`, `equalsGrader`, `exactGrader`, `regexGrader`, `isJsonGrader`, `jsonGrader`, `rubricsGrader`, `llmGrader`, `codeGrader` - Named grader helper functions
 - `toEvalYamlObject(definition)` / `serializeEvalYaml(definition)` - Lower or serialize canonical eval YAML
+- `EvalConfig`, `EvalRunResult`, `EvalRunArtifacts`, `EvalSummary`, `EvalTestInput`, `EvalAssertionInput`, `AssertEntry`, `ConversationTurnInput` - Programmatic evaluation types
 - `AssertionContext`, `AssertionScore` - Assertion types
 - `CodeGraderInput`, `CodeGraderResult`, `Workspace`, `WorkspaceAssertion` - Grader types
 - `TraceSummary`, `Message`, `ToolCall` - Trace data types

diff --git a/packages/sdk/package.json b/packages/sdk/package.json
@@ -22,16 +22,18 @@
   },
   "scripts": {
     "prepublishOnly": "node -e \"if(process.env.ALLOW_PUBLISH!=='1'){console.error('ERROR: Use bun run publish:next, then bun run promote:latest');process.exit(1)}\"",
-    "build": "tsup",
+    "build:deps": "bun --cwd ../.. --filter @agentv/core build",
+    "build": "bun run build:deps && tsup",
     "dev": "tsup --watch",
-    "typecheck": "tsc --noEmit",
+    "typecheck": "bun run build:deps && tsc --noEmit",
     "lint": "biome check .",
     "format": "biome format --write .",
     "fix": "biome check --write .",
-    "test": "bun test"
+    "test": "bun run build:deps && bun test"
   },
   "files": ["dist", "README.md"],
   "dependencies": {
+    "@agentv/core": "4.42.4",
     "yaml": "^2.8.3",
     "zod": "^3.23.8"
   }

diff --git a/packages/sdk/src/index.ts b/packages/sdk/src/index.ts
@@ -117,6 +117,18 @@ export {
 } from './schemas.js';
 
 // Re-export YAML-aligned eval authoring helpers
+export {
+  evaluate,
+  type AssertEntry,
+  type ConversationTurnInput,
+  type EvalAssertionInput,
+  type EvalConfig,
+  type EvalRunArtifacts,
+  type EvalRunResult,
+  type EvalSummary,
+  type EvalTestInput,
+} from '@agentv/core';
+
 export {
   defineEval,
   evalSuite,