diff --git a/apps/cli/package.json b/apps/cli/package.json index 6572d7193..b3c1f83a1 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -17,14 +17,14 @@ "files": ["dist", "README.md"], "scripts": { "dev": "bun src/cli.ts", - "build": "tsup && bun run copy-readme", + "build": "(cd ../../packages/sdk && bun run build) && tsup && bun run copy-readme", "copy-readme": "bun -e \"import { cpSync } from 'fs'; cpSync('../../README.md', 'README.md')\"", "prepublishOnly": "node -e \"if(process.env.ALLOW_PUBLISH!=='1'){console.error('ERROR: Use bun run publish:next, then bun run promote:latest');process.exit(1)}\"", - "typecheck": "tsc --noEmit", + "typecheck": "(cd ../../packages/sdk && bun run build) && tsc --noEmit", "lint": "biome check .", "format": "biome format --write .", "fix": "biome check --write .", - "test": "bun test", + "test": "(cd ../../packages/sdk && bun run build) && bun test", "test:watch": "bun test --watch" }, "dependencies": { @@ -53,6 +53,7 @@ }, "devDependencies": { "@agentv/core": "workspace:*", + "@agentv/sdk": "workspace:*", "@types/semver": "^7.7.1", "execa": "^9.3.0" } diff --git a/apps/cli/src/commands/eval/commands/vitest.ts b/apps/cli/src/commands/eval/commands/vitest.ts new file mode 100644 index 000000000..524dfd8eb --- /dev/null +++ b/apps/cli/src/commands/eval/commands/vitest.ts @@ -0,0 +1,64 @@ +import { command, flag, number, option, optional, restPositionals, string } from 'cmd-ts'; + +import { runCodeGrader, runVitestWorkspaceGrader } from '@agentv/sdk'; + +function parseCommand(value: string | undefined): readonly string[] | undefined { + const trimmed = value?.trim(); + return trimmed ? 
trimmed.split(/\s+/) : undefined; +} + +export const evalVitestCommand = command({ + name: 'vitest', + description: 'Run Vitest workspace verifier files as an AgentV code-grader protocol adapter', + args: { + testFiles: restPositionals({ + type: string, + displayName: 'test-files', + description: 'Vitest verifier file(s) to run', + }), + cwd: option({ + type: optional(string), + long: 'cwd', + description: 'Workspace-relative directory where Vitest should run', + }), + vitestCommand: option({ + type: optional(string), + long: 'vitest-command', + description: 'Vitest command to execute, defaults to "bunx vitest run"', + }), + timeoutMs: option({ + type: optional(number), + long: 'timeout-ms', + description: 'Timeout for the Vitest command in milliseconds', + }), + inWorkspace: flag({ + long: 'in-workspace', + description: + 'Treat test files as already present in the prepared workspace instead of copying them from the current directory', + }), + passWithNoTests: flag({ + long: 'pass-with-no-tests', + description: 'Return score 1 when Vitest reports zero tests', + }), + }, + handler: async ({ testFiles, cwd, vitestCommand, timeoutMs, inWorkspace, passWithNoTests }) => { + await runCodeGrader((input) => { + if (testFiles.length === 0) { + throw new Error('Provide at least one Vitest verifier file.'); + } + + return runVitestWorkspaceGrader( + { + testFile: testFiles, + cwd, + vitestCommand: parseCommand(vitestCommand), + timeoutMs, + passWithNoTests, + copyTestFilesToWorkspace: !inWorkspace, + testFileRoot: process.cwd(), + }, + input, + ); + }); + }, +}); diff --git a/apps/cli/src/commands/eval/index.ts b/apps/cli/src/commands/eval/index.ts index 0ec0e9412..9b65fd6f5 100644 --- a/apps/cli/src/commands/eval/index.ts +++ b/apps/cli/src/commands/eval/index.ts @@ -4,6 +4,7 @@ import { evalAggregateCommand } from './commands/aggregate.js'; import { evalAssertCommand } from './commands/assert.js'; import { evalBundleCommand } from './commands/bundle.js'; import { 
evalRunCommand } from './commands/run.js'; +import { evalVitestCommand } from './commands/vitest.js'; export const evalCommand = subcommands({ name: 'eval', @@ -14,5 +15,6 @@ export const evalCommand = subcommands({ assert: evalAssertCommand, aggregate: evalAggregateCommand, bundle: evalBundleCommand, + vitest: evalVitestCommand, }, }); diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index 48e4124c8..e18ecfa2f 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -61,7 +61,7 @@ export const app = subcommands({ * Known eval subcommand names — used to decide whether to inject the * implicit `run` subcommand for backward-compatible `agentv eval <file>`. */ -const EVAL_SUBCOMMANDS = new Set(['run', 'assert', 'aggregate', 'bundle']); +const EVAL_SUBCOMMANDS = new Set(['run', 'assert', 'aggregate', 'bundle', 'vitest']); /** * Top-level CLI command names (excluding `eval` itself). @@ -96,6 +96,10 @@ export function usesDeprecatedStudioAlias(argv: string[]): boolean { return argv[2] === 'studio'; } +export function shouldRunBeforeSessionHook(argv: string[]): boolean { + return !(argv[2] === 'eval' && argv[3] === 'vitest'); +} + /** * Preprocess argv for convenience aliases: * - `--eval-id` → `--test-id` @@ -162,14 +166,16 @@ export async function runCli(argv: string[] = process.argv): Promise<void> { ); } - // Run before_session hook once at startup, before any command executes. - // Uses cwd as the search root for .agentv/config.yaml. - const cwd = process.cwd(); - const repoRoot = await findRepoRoot(cwd); - const sessionConfig = await loadConfig(path.join(cwd, '_'), repoRoot); - const beforeSessionCommand = sessionConfig?.hooks?.before_session; - if (beforeSessionCommand) { - runBeforeSessionHook(beforeSessionCommand); + if (shouldRunBeforeSessionHook(processedArgv)) { + // Run before_session hook once at startup, before any command executes. + // Uses cwd as the search root for .agentv/config.yaml. 
+ const cwd = process.cwd(); + const repoRoot = await findRepoRoot(cwd); + const sessionConfig = await loadConfig(path.join(cwd, '_'), repoRoot); + const beforeSessionCommand = sessionConfig?.hooks?.before_session; + if (beforeSessionCommand) { + runBeforeSessionHook(beforeSessionCommand); + } } await run(binary(app), processedArgv); diff --git a/apps/cli/test/commands/eval/vitest.test.ts b/apps/cli/test/commands/eval/vitest.test.ts new file mode 100644 index 000000000..a71fc9915 --- /dev/null +++ b/apps/cli/test/commands/eval/vitest.test.ts @@ -0,0 +1,130 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { mkdir, mkdtemp, readFile, readdir, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { execa } from 'execa'; +import { assertCoreBuild } from '../../setup-core-build.js'; + +assertCoreBuild(); + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); +const projectRoot = path.resolve(__dirname, '../../../../..'); +const CLI_ENTRY = path.join(projectRoot, 'apps/cli/src/cli.ts'); + +const report = { + success: false, + numTotalTests: 2, + numPassedTests: 1, + numFailedTests: 1, + numPendingTests: 0, + numTodoTests: 0, + testResults: [ + { + name: '/workspace/.agentv-vitest/example.test.ts', + assertionResults: [ + { + fullName: 'welcome banner includes ready status', + status: 'passed', + failureMessages: [], + }, + { + fullName: 'welcome banner links to dashboard', + status: 'failed', + failureMessages: ['AssertionError: expected link to point at /dashboard'], + }, + ], + }, + ], +}; + +async function runCli(args: readonly string[], cwd: string, input: string) { + return execa('bun', ['--no-env-file', CLI_ENTRY, ...args], { + cwd, + input, + env: { + AGENTV_HOME: path.join(cwd, '.agentv-home'), + AGENTV_NO_UPDATE_CHECK: '1', + }, + }); +} + +describe('agentv eval vitest', () => { + 
let tempDir: string; + + beforeEach(async () => { + tempDir = await mkdtemp(path.join(tmpdir(), 'agentv-eval-vitest-test-')); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it('runs external verifier files through the code-grader protocol', async () => { + const workspacePath = path.join(tempDir, 'workspace'); + const gradersPath = path.join(tempDir, 'graders'); + const fakeVitest = path.join(tempDir, 'fake-vitest.ts'); + await mkdir(workspacePath, { recursive: true }); + await mkdir(gradersPath, { recursive: true }); + await writeFile( + path.join(gradersPath, 'welcome-banner.test.ts'), + 'import { expect, it } from "vitest";\n', + 'utf8', + ); + await writeFile( + fakeVitest, + `import { writeFileSync } from 'node:fs'; + +const args = process.argv.slice(2); +writeFileSync('vitest-args.json', JSON.stringify(args)); +const outputArg = args.find((arg) => arg.startsWith('--outputFile=')); +if (!outputArg) throw new Error('missing outputFile arg'); +writeFileSync(outputArg.slice('--outputFile='.length), JSON.stringify(${JSON.stringify(report)})); +process.exit(1); +`, + 'utf8', + ); + + const payload = JSON.stringify({ + criteria: 'Verify the workspace', + expected_output: [], + input_files: [], + input: [{ role: 'user', content: 'Update the welcome banner' }], + workspace_path: workspacePath, + }); + + const result = await runCli( + ['eval', 'vitest', '--vitest-command', `bun ${fakeVitest}`, 'graders/welcome-banner.test.ts'], + tempDir, + payload, + ); + + const output = JSON.parse(result.stdout); + expect(output.score).toBe(0.5); + expect(output.assertions).toEqual([ + { text: 'welcome banner includes ready status', passed: true }, + { + text: 'welcome banner links to dashboard', + passed: false, + evidence: 'AssertionError: expected link to point at /dashboard', + }, + ]); + expect(output.details).toMatchObject({ + vitest_success: false, + num_total_tests: 2, + num_passed_tests: 1, + num_failed_tests: 1, + }); + + 
const vitestArgs = JSON.parse( + await readFile(path.join(workspacePath, 'vitest-args.json'), 'utf8'), + ) as string[]; + expect(vitestArgs[0]).toMatch(/^\.agentv-vitest-.+\/0-welcome-banner\.test\.ts$/); + expect(vitestArgs).toContain('--reporter=json'); + expect(vitestArgs.some((arg) => arg.startsWith('--outputFile='))).toBe(true); + + const workspaceEntries = await readdir(workspacePath); + expect(workspaceEntries.some((entry) => entry.startsWith('.agentv-vitest-'))).toBe(false); + }); +}); diff --git a/apps/cli/test/setup-core-build.ts b/apps/cli/test/setup-core-build.ts index 40c1ae46d..3c0a87b53 100644 --- a/apps/cli/test/setup-core-build.ts +++ b/apps/cli/test/setup-core-build.ts @@ -1,15 +1,15 @@ /** * Pre-flight check for CLI integration tests. * - * CLI integration tests depend on @agentv/core being built (they import - * from the dist output). Rather than building core inside the test — which - * is slow and hides staleness issues — we simply verify dist exists and + * CLI integration tests depend on @agentv/core and @agentv/sdk being built + * (they import from the dist output). Rather than building packages inside + * the test — which is slow and hides staleness issues — we simply verify dist exists and * fail fast with a clear message if it doesn't. * * CI runs `bun run build` before `bun run test`, so dist is available in * the normal merge gate. 
For ad-hoc local runs, build first: * - * bun run --filter @agentv/core build && bun --filter agentv test + * bun --filter @agentv/core build && bun --filter @agentv/sdk build && bun --filter agentv test */ import { constants, accessSync } from 'node:fs'; @@ -19,12 +19,19 @@ import { fileURLToPath } from 'node:url'; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); const projectRoot = path.resolve(__dirname, '../../..'); -const coreDistEntry = path.join(projectRoot, 'packages/core/dist/index.js'); +const distEntries = [ + ['@agentv/core', path.join(projectRoot, 'packages/core/dist/index.js')], + ['@agentv/sdk', path.join(projectRoot, 'packages/sdk/dist/index.js')], +] as const; export function assertCoreBuild(): void { - try { - accessSync(coreDistEntry, constants.R_OK); - } catch { - throw new Error('@agentv/core is not built. Run `bun run --filter @agentv/core build` first.'); + for (const [packageName, distEntry] of distEntries) { + try { + accessSync(distEntry, constants.R_OK); + } catch { + throw new Error( + `${packageName} is not built. 
Run \`bun --filter @agentv/core build && bun --filter @agentv/sdk build\` first.`, + ); + } } } diff --git a/apps/cli/test/unit/preprocess-argv.test.ts b/apps/cli/test/unit/preprocess-argv.test.ts index 2c4963e70..e58f8c331 100644 --- a/apps/cli/test/unit/preprocess-argv.test.ts +++ b/apps/cli/test/unit/preprocess-argv.test.ts @@ -1,6 +1,10 @@ import { describe, expect, it } from 'bun:test'; -import { preprocessArgv, usesDeprecatedStudioAlias } from '../../src/index.js'; +import { + preprocessArgv, + shouldRunBeforeSessionHook, + usesDeprecatedStudioAlias, +} from '../../src/index.js'; describe('preprocessArgv', () => { describe('--eval-id convenience alias', () => { @@ -31,6 +35,11 @@ describe('preprocessArgv', () => { expect(preprocessArgv(argv)).toEqual(argv); }); + it('does not insert `run` for eval vitest', () => { + const argv = ['node', 'agentv', 'eval', 'vitest', 'graders/welcome-banner.test.ts']; + expect(preprocessArgv(argv)).toEqual(argv); + }); + it('does not insert `run` when eval is followed by --help', () => { const argv = ['node', 'agentv', 'eval', '--help']; expect(preprocessArgv(argv)).toEqual(argv); @@ -85,4 +94,24 @@ describe('preprocessArgv', () => { expect(preprocessArgv(argv)).toEqual(argv); }); }); + + describe('before_session hook guard', () => { + it('skips before_session hooks for the Vitest protocol adapter', () => { + expect( + shouldRunBeforeSessionHook([ + 'node', + 'agentv', + 'eval', + 'vitest', + 'graders/welcome-banner.test.ts', + ]), + ).toBe(false); + }); + + it('keeps before_session hooks for normal eval runs', () => { + expect(shouldRunBeforeSessionHook(['node', 'agentv', 'eval', 'run', 'evals/demo.yaml'])).toBe( + true, + ); + }); + }); }); diff --git a/apps/web/src/content/docs/docs/evaluation/sdk.mdx b/apps/web/src/content/docs/docs/evaluation/sdk.mdx index 7f9db1758..2e97a3123 100644 --- a/apps/web/src/content/docs/docs/evaluation/sdk.mdx +++ b/apps/web/src/content/docs/docs/evaluation/sdk.mdx @@ -46,6 +46,7 @@ Use the 
simplest surface that matches the job: - **`evaluate({ specFile })`** when you want library control around an existing YAML suite. - **Inline `evaluate({ tests })`** when the eval definition truly belongs inside application code. The programmatic API mirrors YAML, but uses current TypeScript naming such as `expectedOutput` and `assert`. - **`defineAssertion` / `defineCodeGrader`** when the grading logic itself must execute code. +- **`agentv eval vitest`** for deterministic workspace checks that fit normal Vitest `expect(...)` tests. There is no separate first-party Python authoring SDK today. Python-facing workflows should either emit canonical YAML/JSONL or implement executable graders that consume the standard `snake_case` wire format. @@ -279,7 +280,40 @@ export default defineCodeGrader(({ output, traceSummary }) => ({ })); ``` -`defineCodeGrader` graders are referenced in YAML with `type: code-grader` and `command: [bun, run, grader.ts]`. `defineAssertion` uses convention-based discovery instead — just place in `.agentv/assertions/` and reference by name. 
+For deterministic workspace verifiers, prefer normal Vitest tests plus AgentV's built-in Vitest adapter command: + +```typescript +// graders/welcome-banner.test.ts +import { readFileSync } from 'node:fs'; +import { expect, it } from 'vitest'; + +it('links to the dashboard', () => { + const page = readFileSync('app/page.tsx', 'utf8'); + expect(page).toMatch(/href=["']\/dashboard["']/); +}); +``` + +```yaml +assertions: + - name: vitest-welcome-banner + type: code-grader + command: [agentv, eval, vitest, graders/welcome-banner.test.ts] +``` + +Use `defineWorkspaceGrader` only for tiny one-off file checks or custom score shaping: + +```typescript +import { defineWorkspaceGrader } from '@agentv/sdk'; + +export default defineWorkspaceGrader(async ({ workspace }) => [ + await workspace.file('app/page.tsx').contains('Status: All systems ready'), + await workspace.file('app/page.tsx').contains('Open dashboard'), + await workspace.file('app/page.tsx').matches(/href=["']\/dashboard["']/), + await workspace.file('app/page.tsx').notMatches(/TODO/i), +]); +``` + +`defineCodeGrader`, `defineVitestWorkspaceGrader`, and `defineWorkspaceGrader` custom scripts are referenced in YAML with `type: code-grader` and `command: [bun, run, grader.ts]`. Plain Vitest verifier files can use `command: [agentv, eval, vitest, ...]` without a custom wrapper. `defineAssertion` uses convention-based discovery instead — just place the assertion file in `.agentv/assertions/` and reference it by name. For detailed patterns, input/output contracts, and language-agnostic examples, see [Code Graders](/docs/graders/code-graders/). 
diff --git a/apps/web/src/content/docs/docs/graders/code-graders.mdx b/apps/web/src/content/docs/docs/graders/code-graders.mdx index 1b2ee04f4..73b9a8ffd 100644 --- a/apps/web/src/content/docs/docs/graders/code-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/code-graders.mdx @@ -187,7 +187,7 @@ assertions: ## TypeScript SDK -The `@agentv/sdk` package provides a declarative API with automatic stdin/stdout handling. Use `defineCodeGrader` to skip boilerplate: +The `@agentv/sdk` package provides a declarative API with automatic stdin/stdout handling. Use `defineCodeGrader` to skip protocol boilerplate: ```typescript #!/usr/bin/env bun @@ -211,7 +211,63 @@ export default defineCodeGrader(({ output, criteria }) => { }); ``` -**SDK exports:** `defineCodeGrader`, `Message`, `ToolCall`, `Trace`, `TraceSummary`, `CodeGraderInput`, `CodeGraderResult` +### Vitest Workspace Verifiers + +For deterministic workspace checks, prefer a normal Vitest verifier file. This matches the common hidden-verifier pattern: read files from the prepared workspace and use `expect(...)`. + +```typescript +// graders/welcome-banner.test.ts +import { readFileSync } from 'node:fs'; +import { join } from 'node:path'; +import { describe, expect, it } from 'vitest'; + +function readWorkspaceFile(relativePath: string) { + return readFileSync(join(process.env.AGENTV_WORKSPACE_PATH ?? process.cwd(), relativePath), 'utf8'); +} + +describe('welcome banner', () => { + const page = () => readWorkspaceFile('app/page.tsx'); + + it('shows ready status text', () => { + expect(page()).toContain('Status: All systems ready'); + }); + + it('links the call to action to /dashboard', () => { + expect(page()).toMatch(/href=["']\/dashboard["']/); + }); +}); +``` + +Then use AgentV's built-in Vitest adapter as the `code-grader` command. 
The adapter copies verifier files into a temporary workspace-local path when needed, runs Vitest in `workspace_path`, reads the JSON reporter output, and maps each test outcome to an AgentV assertion: + +```yaml +assertions: + - name: vitest-welcome-banner + type: code-grader + command: [agentv, eval, vitest, graders/welcome-banner.test.ts] +``` + +Use `agentv eval vitest --in-workspace verifiers/welcome-banner.test.ts` when the verifier file is already materialized inside the prepared workspace. Use the SDK's `defineVitestWorkspaceGrader()` only when embedding the adapter in a custom script or custom command. See `examples/features/vitest-workspace-grader/` for a runnable example. + +### Lower-Level Workspace Helpers + +For tiny one-off file checks, `defineWorkspaceGrader` can resolve the workspace path, read files relative to the workspace, build assertions, and aggregate the score: + +```typescript +#!/usr/bin/env bun +import { defineWorkspaceGrader } from '@agentv/sdk'; + +export default defineWorkspaceGrader(async ({ workspace }) => [ + await workspace.file('app/page.tsx').contains('Status: All systems ready'), + await workspace.file('app/page.tsx').contains('Open dashboard'), + await workspace.file('app/page.tsx').matches(/href=["']\/dashboard["']/), + await workspace.file('app/page.tsx').notMatches(/TODO/i), +]); +``` + +Prefer Vitest verifiers when the checks naturally fit `expect(...)`. Use `defineWorkspaceGrader` when you need a very small custom script, custom weighting, or details that do not map cleanly to individual test outcomes. 
+ +**SDK exports:** `defineCodeGrader`, `defineVitestWorkspaceGrader`, `defineWorkspaceGrader`, `Message`, `ToolCall`, `Trace`, `TraceSummary`, `CodeGraderInput`, `CodeGraderResult`, `Workspace`, `WorkspaceAssertion` ## Target Access diff --git a/bun.lock b/bun.lock index 366f850c9..3e197e5be 100644 --- a/bun.lock +++ b/bun.lock @@ -42,6 +42,7 @@ }, "devDependencies": { "@agentv/core": "workspace:*", + "@agentv/sdk": "workspace:*", "@types/semver": "^7.7.1", "execa": "^9.3.0", }, diff --git a/examples/README.md b/examples/README.md index cf526ea23..93f662ee9 100644 --- a/examples/README.md +++ b/examples/README.md @@ -47,11 +47,13 @@ Focused demonstrations of specific AgentV capabilities. Each example includes it - [local-cli](features/local-cli/) - Local CLI targets - [compare](features/compare/) - Baseline comparison - [deterministic-graders](features/deterministic-graders/) - Deterministic assertions (contains, regex, JSON validation) +- [vitest-workspace-grader](features/vitest-workspace-grader/) - Vitest-style deterministic workspace verifiers - [workspace-setup-script](features/workspace-setup-script/) - Multi-step workspace setup with `before_all` lifecycle hook ### SDK - [code-grader-sdk](features/code-grader-sdk/) - TypeScript SDK for code graders using `defineCodeGrader()` +- [vitest-workspace-grader](features/vitest-workspace-grader/) - Built-in AgentV adapter for Vitest workspace verifier files - [sdk-custom-assertion](features/sdk-custom-assertion/) - Custom assertion types using `defineAssertion()` - [sdk-programmatic-api](features/sdk-programmatic-api/) - Programmatic evaluation using `evaluate()` - [sdk-eval-authoring](features/sdk-eval-authoring/) - YAML-aligned `.eval.ts` authoring using `defineEval()` diff --git a/examples/features/vitest-workspace-grader/.agentv/targets.yaml b/examples/features/vitest-workspace-grader/.agentv/targets.yaml new file mode 100644 index 000000000..11d4049d8 --- /dev/null +++ 
b/examples/features/vitest-workspace-grader/.agentv/targets.yaml @@ -0,0 +1,20 @@ +targets: + - name: mock_agent + provider: cli + command: | + bash -c ' + mkdir -p app && + cat > app/page.tsx <<'"'"'EOF'"'"' + export default function Page() { + return ( + <main>
+ <section>
+ <p>Status: All systems ready</p>
+ <a href="/dashboard">Open dashboard</a>
+ </section>
+ </main>
+ ); + } + EOF + echo "Updated the welcome banner." > {OUTPUT_FILE} + ' diff --git a/examples/features/vitest-workspace-grader/README.md b/examples/features/vitest-workspace-grader/README.md new file mode 100644 index 000000000..aaf5782a8 --- /dev/null +++ b/examples/features/vitest-workspace-grader/README.md @@ -0,0 +1,46 @@ +# Vitest Workspace Grader + +Demonstrates the preferred deterministic workspace grader path: write normal Vitest tests with `expect(...)`, then let AgentV run the verifier and map test results into AgentV assertions. + +## Files + +- `graders/welcome-banner.test.ts`: plain Vitest verifier that reads `app/page.tsx` +- `evals/dataset.eval.yaml`: eval case that runs the verifier through `agentv eval vitest` +- `.agentv/targets.yaml`: mock CLI target that updates the workspace + +## Run + +From this example directory: + +```bash +bun install +cd ../../.. +bun apps/cli/src/cli.ts eval examples/features/vitest-workspace-grader/evals/dataset.eval.yaml --target mock_agent +``` + +## Pattern + +Use Vitest verifiers when deterministic workspace checks can be expressed as normal tests: + +```ts +import { readFileSync } from 'node:fs'; +import { expect, it } from 'vitest'; + +it('links to the dashboard', () => { + const page = readFileSync('app/page.tsx', 'utf8'); + expect(page).toMatch(/href=["']\/dashboard["']/); +}); +``` + +The eval YAML calls AgentV's built-in adapter directly: + +```yaml +assertions: + - name: vitest-welcome-banner + type: code-grader + command: [agentv, eval, vitest, graders/welcome-banner.test.ts] +``` + +The local example uses a source-relative CLI path so it can run before the next AgentV package release. In a normal project, use the installed `agentv` binary form above. + +Use lower-level `defineCodeGrader` scripts when the grader needs custom scoring, multi-stage setup, external commands beyond a test runner, or structured `details` that do not map cleanly to individual test outcomes. 
diff --git a/examples/features/vitest-workspace-grader/bun.lock b/examples/features/vitest-workspace-grader/bun.lock new file mode 100644 index 000000000..6a055e088 --- /dev/null +++ b/examples/features/vitest-workspace-grader/bun.lock @@ -0,0 +1,163 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "agentv-example-vitest-workspace-grader", + "devDependencies": { + "vitest": "^4.0.0", + }, + }, + }, + "packages": { + "@emnapi/core": ["@emnapi/core@1.10.0", "", { "dependencies": { "@emnapi/wasi-threads": "1.2.1", "tslib": "^2.4.0" } }, "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw=="], + + "@emnapi/runtime": ["@emnapi/runtime@1.10.0", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA=="], + + "@emnapi/wasi-threads": ["@emnapi/wasi-threads@1.2.1", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-uTII7OYF+/Mes/MrcIOYp5yOtSMLBWSIoLPpcgwipoiKbli6k322tcoFsxoIIxPDqW01SQGAgko4EzZi2BNv2w=="], + + "@jridgewell/sourcemap-codec": ["@jridgewell/sourcemap-codec@1.5.5", "", {}, "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og=="], + + "@napi-rs/wasm-runtime": ["@napi-rs/wasm-runtime@1.1.5", "", { "dependencies": { "@tybys/wasm-util": "^0.10.2" }, "peerDependencies": { "@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1" } }, "sha512-AWPoBRJ9tsnVhor4sjO7rkni+7p+2IAEFj6cx06UgP10jkQHqay/36uRV/bFkgrh18D9vb4cr8Q0Pthskgzy+Q=="], + + "@oxc-project/types": ["@oxc-project/types@0.133.0", "", {}, "sha512-KzkdCd6Uxqnf6l3HOw1xfatAlUURA0g14cvBYFyJ5SaNOQbOUvBr9PKArcPcrNIeRsBdgcUzOGrhKveVpvOIGA=="], + + "@rolldown/binding-android-arm64": ["@rolldown/binding-android-arm64@1.0.3", "", { "os": "android", "cpu": "arm64" }, "sha512-454rs7jHngixp/NMxd5srYD57OnzSlZ/eFTETjORQHLwJG1lRtmNOJcBerZlfu4GjKqeq8aCCIQrMdHyhI51Hw=="], + + "@rolldown/binding-darwin-arm64": 
["@rolldown/binding-darwin-arm64@1.0.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-PcAhP+ynjURNyy8SKGl5DQP94aGuB/7JrXJb/t7P+hanXvQVMWzUvRRhBAcg/lNRadBhoUPqSoP4xw5tR/KBEA=="], + + "@rolldown/binding-darwin-x64": ["@rolldown/binding-darwin-x64@1.0.3", "", { "os": "darwin", "cpu": "x64" }, "sha512-9YpfeUvSE2RS7wysJ81uOZkXJz7f7Q55H2Gvp3VEw/EsahqDtrphrZ0EwDLK5vvKOzaCrBsjF8JmnMLcUt78Gg=="], + + "@rolldown/binding-freebsd-x64": ["@rolldown/binding-freebsd-x64@1.0.3", "", { "os": "freebsd", "cpu": "x64" }, "sha512-yB1IlAsSNHncV6SCTL27/MVGR5htvQsoGxIv5KMGXALp+Ll1wYsn+x98M9MW7qa+NdSbvrrY7ANI4wLJ0n1e6g=="], + + "@rolldown/binding-linux-arm-gnueabihf": ["@rolldown/binding-linux-arm-gnueabihf@1.0.3", "", { "os": "linux", "cpu": "arm" }, "sha512-Yi30IVAAfLUCy2MseFjbB1jAMDl1VMCAas5StnYp8da9+CKvMd2H2cbEjWcw5NPaPqzvYkVIaF1nNUG+b7u/sw=="], + + "@rolldown/binding-linux-arm64-gnu": ["@rolldown/binding-linux-arm64-gnu@1.0.3", "", { "os": "linux", "cpu": "arm64" }, "sha512-jsO7R8To+AdlYgUmN5sHSCZbfhtMBkO0WUx8iORQnPcMMdgr7qM2DQmMwgabs3GhNztdmoKkMKQFHD6DTMCIQw=="], + + "@rolldown/binding-linux-arm64-musl": ["@rolldown/binding-linux-arm64-musl@1.0.3", "", { "os": "linux", "cpu": "arm64" }, "sha512-VWkUHwWriDciit80wleYwKILoR/KMvxh/IdwS/paX+ZgpuRpCrKLUdadJbc0NpBEiyhpYawsJ73j9aCvOH+f7Q=="], + + "@rolldown/binding-linux-ppc64-gnu": ["@rolldown/binding-linux-ppc64-gnu@1.0.3", "", { "os": "linux", "cpu": "ppc64" }, "sha512-5f1laC0SlIR0yDbFCd8acUhvJIag6N3zC5P7oUPN6wX0aOma+uKJ0wBDH5aq7I1PVI2ttTlhJwzwRIBnLiSGEg=="], + + "@rolldown/binding-linux-s390x-gnu": ["@rolldown/binding-linux-s390x-gnu@1.0.3", "", { "os": "linux", "cpu": "s390x" }, "sha512-Iq4ko0r4XsgbrF/LunNgHtAGLRRVE2kXonAXQ/MV0mC6jQpMOhW1SvtZja2EhC/kd05++bP78dsqBeIQyYJ6Yg=="], + + "@rolldown/binding-linux-x64-gnu": ["@rolldown/binding-linux-x64-gnu@1.0.3", "", { "os": "linux", "cpu": "x64" }, "sha512-B8m6tD5+/N5FeNQFbKlLA/2yVq9ycQP1SeedyEYYKWBNR3ZQbkvIUcNnDNM03lO1l5F2roiiFJGgvoLLyZXtSg=="], + + "@rolldown/binding-linux-x64-musl": 
["@rolldown/binding-linux-x64-musl@1.0.3", "", { "os": "linux", "cpu": "x64" }, "sha512-pSdpdUJHkuCxun9LE7jvgUB9qsRgaiyNNCX7m/AvHTcq67AiT/Yhoxvw5zPfhrM8k/BfP8ce/hMOpthKDpEUow=="], + + "@rolldown/binding-openharmony-arm64": ["@rolldown/binding-openharmony-arm64@1.0.3", "", { "os": "none", "cpu": "arm64" }, "sha512-OXXS3RKJgX2uLwM+gYyuH5omcH8fL1LJs96pZGgtetVCahON57+d4SJHzTgZiOjxgGkSnpXpOsWuPDGAKAigEg=="], + + "@rolldown/binding-wasm32-wasi": ["@rolldown/binding-wasm32-wasi@1.0.3", "", { "dependencies": { "@emnapi/core": "1.10.0", "@emnapi/runtime": "1.10.0", "@napi-rs/wasm-runtime": "^1.1.4" }, "cpu": "none" }, "sha512-JTtb8BWFynicNSoPrehsCzBtOKjZ6jhMiPFEmOiuXg1Fl8dn2KHQob+GuPSGR0dryQa1PQJbzjF3dqO/whhjLg=="], + + "@rolldown/binding-win32-arm64-msvc": ["@rolldown/binding-win32-arm64-msvc@1.0.3", "", { "os": "win32", "cpu": "arm64" }, "sha512-gEdFFEN70A/jxb2svrWsN3aDL7OUtmvlOy+6fa2jxG8K0wQ1ZbdeLGnidov6Yu5/733dI5ySfzFlQ/cb0bSz1g=="], + + "@rolldown/binding-win32-x64-msvc": ["@rolldown/binding-win32-x64-msvc@1.0.3", "", { "os": "win32", "cpu": "x64" }, "sha512-eXB7CHuaQdqmJcc3koCNtNPmT/bj2gc999kUFgBxG8Ac0NdgXc4rkCHhqrgrhN3zddvvvrgzj1e90SuSfmyIXA=="], + + "@rolldown/pluginutils": ["@rolldown/pluginutils@1.0.1", "", {}, "sha512-2j9bGt5Jh8hj+vPtgzPtl72j0yRxHAyumoo6TNfAjsLB04UtpSvPbPcDcBMxz7n+9CYB0c1GxQFxYRg2jimqGw=="], + + "@standard-schema/spec": ["@standard-schema/spec@1.1.0", "", {}, "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w=="], + + "@tybys/wasm-util": ["@tybys/wasm-util@0.10.2", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-RoBvJ2X0wuKlWFIjrwffGw1IqZHKQqzIchKaadZZfnNpsAYp2mM0h36JtPCjNDAHGgYez/15uMBpfGwchhiMgg=="], + + "@types/chai": ["@types/chai@5.2.3", "", { "dependencies": { "@types/deep-eql": "*", "assertion-error": "^2.0.1" } }, "sha512-Mw558oeA9fFbv65/y4mHtXDs9bPnFMZAL/jxdPFUpOHHIXX91mcgEHbS5Lahr+pwZFR8A7GQleRWeI6cGFC2UA=="], + + "@types/deep-eql": ["@types/deep-eql@4.0.2", "", {}, 
"sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw=="], + + "@types/estree": ["@types/estree@1.0.9", "", {}, "sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg=="], + + "@vitest/expect": ["@vitest/expect@4.1.9", "", { "dependencies": { "@standard-schema/spec": "^1.1.0", "@types/chai": "^5.2.2", "@vitest/spy": "4.1.9", "@vitest/utils": "4.1.9", "chai": "^6.2.2", "tinyrainbow": "^3.1.0" } }, "sha512-vl/rYsUKcBr3SnQn166+XR5ZQcgMx3DQhFWdfli/cWpLnLUmbxZvyrJZotLFUryib+LtArYMSTJ5RbQ57ZqrlA=="], + + "@vitest/mocker": ["@vitest/mocker@4.1.9", "", { "dependencies": { "@vitest/spy": "4.1.9", "estree-walker": "^3.0.3", "magic-string": "^0.30.21" }, "peerDependencies": { "msw": "^2.4.9", "vite": "^6.0.0 || ^7.0.0 || ^8.0.0" }, "optionalPeers": ["msw", "vite"] }, "sha512-EVkXzBjrPGM+cK8/ANWgBrkUCfJfb38/EfTSO8h7pWvKkyPkpWxvR7BkD2MyItMF62C97zAEoqdpUixwR/e+Rw=="], + + "@vitest/pretty-format": ["@vitest/pretty-format@4.1.9", "", { "dependencies": { "tinyrainbow": "^3.1.0" } }, "sha512-s0iufns3iIFitdgm+YR7g1whCAaGtXz459VS9/PqyKDEEFgYIhsHOQmXgIgDuYCt7DeQmiZT0Qe2OA2p4ZPu5A=="], + + "@vitest/runner": ["@vitest/runner@4.1.9", "", { "dependencies": { "@vitest/utils": "4.1.9", "pathe": "^2.0.3" } }, "sha512-KXLMDtc7oe70+3mJfGrPUWPesswH+3sTxAMAMl8DG7I8IUQT4XW718dY5ID3vPUcmlu27CcKfY4P3h3I29SLJg=="], + + "@vitest/snapshot": ["@vitest/snapshot@4.1.9", "", { "dependencies": { "@vitest/pretty-format": "4.1.9", "@vitest/utils": "4.1.9", "magic-string": "^0.30.21", "pathe": "^2.0.3" } }, "sha512-Jc7RKGNBo8Z28WYIm0Niej4xdSPByRf6mU58VpHQkd6Zh05rlnA+twjbK5HyeIGHxrzsc3mJgS43uM0CZKzaIA=="], + + "@vitest/spy": ["@vitest/spy@4.1.9", "", {}, "sha512-fHpsS6mIi+PiEW+vcRVOMkX1oSaPKne3VOclSFICPcGOmfKgXPU5iAah+wcNcj2xPrCCmfq99IDGf+EojhhvhA=="], + + "@vitest/utils": ["@vitest/utils@4.1.9", "", { "dependencies": { "@vitest/pretty-format": "4.1.9", "convert-source-map": "^2.0.0", "tinyrainbow": "^3.1.0" } }, 
"sha512-A51o8ymO5PpqlWNnBP9ZHPXDIpuMtTLlGSjN7la4US+LJzoUMyhwjA5QXlm39JexgwHKW4Xjs8Z2d3dLCXOeuA=="], + + "assertion-error": ["assertion-error@2.0.1", "", {}, "sha512-Izi8RQcffqCeNVgFigKli1ssklIbpHnCYc6AknXGYoB6grJqyeby7jv12JUQgmTAnIDnbck1uxksT4dzN3PWBA=="], + + "chai": ["chai@6.2.2", "", {}, "sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg=="], + + "convert-source-map": ["convert-source-map@2.0.0", "", {}, "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg=="], + + "detect-libc": ["detect-libc@2.1.2", "", {}, "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ=="], + + "es-module-lexer": ["es-module-lexer@2.1.0", "", {}, "sha512-n27zTYMjYu1aj4MjCWzSP7G9r75utsaoc8m61weK+W8JMBGGQybd43GstCXZ3WNmSFtGT9wi59qQTW6mhTR5LQ=="], + + "estree-walker": ["estree-walker@3.0.3", "", { "dependencies": { "@types/estree": "^1.0.0" } }, "sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g=="], + + "expect-type": ["expect-type@1.3.0", "", {}, "sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA=="], + + "fdir": ["fdir@6.5.0", "", { "peerDependencies": { "picomatch": "^3 || ^4" }, "optionalPeers": ["picomatch"] }, "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg=="], + + "fsevents": ["fsevents@2.3.3", "", { "os": "darwin" }, "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw=="], + + "lightningcss": ["lightningcss@1.32.0", "", { "dependencies": { "detect-libc": "^2.0.3" }, "optionalDependencies": { "lightningcss-android-arm64": "1.32.0", "lightningcss-darwin-arm64": "1.32.0", "lightningcss-darwin-x64": "1.32.0", "lightningcss-freebsd-x64": "1.32.0", "lightningcss-linux-arm-gnueabihf": "1.32.0", "lightningcss-linux-arm64-gnu": "1.32.0", "lightningcss-linux-arm64-musl": "1.32.0", 
"lightningcss-linux-x64-gnu": "1.32.0", "lightningcss-linux-x64-musl": "1.32.0", "lightningcss-win32-arm64-msvc": "1.32.0", "lightningcss-win32-x64-msvc": "1.32.0" } }, "sha512-NXYBzinNrblfraPGyrbPoD19C1h9lfI/1mzgWYvXUTe414Gz/X1FD2XBZSZM7rRTrMA8JL3OtAaGifrIKhQ5yQ=="], + + "lightningcss-android-arm64": ["lightningcss-android-arm64@1.32.0", "", { "os": "android", "cpu": "arm64" }, "sha512-YK7/ClTt4kAK0vo6w3X+Pnm0D2cf2vPHbhOXdoNti1Ga0al1P4TBZhwjATvjNwLEBCnKvjJc2jQgHXH0NEwlAg=="], + + "lightningcss-darwin-arm64": ["lightningcss-darwin-arm64@1.32.0", "", { "os": "darwin", "cpu": "arm64" }, "sha512-RzeG9Ju5bag2Bv1/lwlVJvBE3q6TtXskdZLLCyfg5pt+HLz9BqlICO7LZM7VHNTTn/5PRhHFBSjk5lc4cmscPQ=="], + + "lightningcss-darwin-x64": ["lightningcss-darwin-x64@1.32.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-U+QsBp2m/s2wqpUYT/6wnlagdZbtZdndSmut/NJqlCcMLTWp5muCrID+K5UJ6jqD2BFshejCYXniPDbNh73V8w=="], + + "lightningcss-freebsd-x64": ["lightningcss-freebsd-x64@1.32.0", "", { "os": "freebsd", "cpu": "x64" }, "sha512-JCTigedEksZk3tHTTthnMdVfGf61Fky8Ji2E4YjUTEQX14xiy/lTzXnu1vwiZe3bYe0q+SpsSH/CTeDXK6WHig=="], + + "lightningcss-linux-arm-gnueabihf": ["lightningcss-linux-arm-gnueabihf@1.32.0", "", { "os": "linux", "cpu": "arm" }, "sha512-x6rnnpRa2GL0zQOkt6rts3YDPzduLpWvwAF6EMhXFVZXD4tPrBkEFqzGowzCsIWsPjqSK+tyNEODUBXeeVHSkw=="], + + "lightningcss-linux-arm64-gnu": ["lightningcss-linux-arm64-gnu@1.32.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-0nnMyoyOLRJXfbMOilaSRcLH3Jw5z9HDNGfT/gwCPgaDjnx0i8w7vBzFLFR1f6CMLKF8gVbebmkUN3fa/kQJpQ=="], + + "lightningcss-linux-arm64-musl": ["lightningcss-linux-arm64-musl@1.32.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-UpQkoenr4UJEzgVIYpI80lDFvRmPVg6oqboNHfoH4CQIfNA+HOrZ7Mo7KZP02dC6LjghPQJeBsvXhJod/wnIBg=="], + + "lightningcss-linux-x64-gnu": ["lightningcss-linux-x64-gnu@1.32.0", "", { "os": "linux", "cpu": "x64" }, "sha512-V7Qr52IhZmdKPVr+Vtw8o+WLsQJYCTd8loIfpDaMRWGUZfBOYEJeyJIkqGIDMZPwPx24pUMfwSxxI8phr/MbOA=="], + + "lightningcss-linux-x64-musl": 
["lightningcss-linux-x64-musl@1.32.0", "", { "os": "linux", "cpu": "x64" }, "sha512-bYcLp+Vb0awsiXg/80uCRezCYHNg1/l3mt0gzHnWV9XP1W5sKa5/TCdGWaR/zBM2PeF/HbsQv/j2URNOiVuxWg=="], + + "lightningcss-win32-arm64-msvc": ["lightningcss-win32-arm64-msvc@1.32.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-8SbC8BR40pS6baCM8sbtYDSwEVQd4JlFTOlaD3gWGHfThTcABnNDBda6eTZeqbofalIJhFx0qKzgHJmcPTnGdw=="], + + "lightningcss-win32-x64-msvc": ["lightningcss-win32-x64-msvc@1.32.0", "", { "os": "win32", "cpu": "x64" }, "sha512-Amq9B/SoZYdDi1kFrojnoqPLxYhQ4Wo5XiL8EVJrVsB8ARoC1PWW6VGtT0WKCemjy8aC+louJnjS7U18x3b06Q=="], + + "magic-string": ["magic-string@0.30.21", "", { "dependencies": { "@jridgewell/sourcemap-codec": "^1.5.5" } }, "sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ=="], + + "nanoid": ["nanoid@3.3.15", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-y7Wygv/7mEOvxTuEQDB8StXdMRBWf1kR/tlhAzBRUFkB2jfcLOAxO/SHmOO2zgz1pVgK29/kyupn059/bCHdjA=="], + + "obug": ["obug@2.1.3", "", {}, "sha512-9miFgM2OFba7hB+pRgvtV84pYTBaoTHohvmIgiRt6dRIzbwEOIaNaP+dIlGs2fNFoB0SeISs0Jz5WFVRid6Xyg=="], + + "pathe": ["pathe@2.0.3", "", {}, "sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w=="], + + "picocolors": ["picocolors@1.1.1", "", {}, "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA=="], + + "picomatch": ["picomatch@4.0.4", "", {}, "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A=="], + + "postcss": ["postcss@8.5.15", "", { "dependencies": { "nanoid": "^3.3.12", "picocolors": "^1.1.1", "source-map-js": "^1.2.1" } }, "sha512-FfR8sjd4em2T6fb3I2MwAJU7HWVMr9zba+enmQeeWFfCbm+UOC/0X4DS8XtpUTMwWMGbjKYP7xjfNekzyGmB3A=="], + + "rolldown": ["rolldown@1.0.3", "", { "dependencies": { "@oxc-project/types": "=0.133.0", "@rolldown/pluginutils": "^1.0.0" }, "optionalDependencies": { "@rolldown/binding-android-arm64": "1.0.3", 
"@rolldown/binding-darwin-arm64": "1.0.3", "@rolldown/binding-darwin-x64": "1.0.3", "@rolldown/binding-freebsd-x64": "1.0.3", "@rolldown/binding-linux-arm-gnueabihf": "1.0.3", "@rolldown/binding-linux-arm64-gnu": "1.0.3", "@rolldown/binding-linux-arm64-musl": "1.0.3", "@rolldown/binding-linux-ppc64-gnu": "1.0.3", "@rolldown/binding-linux-s390x-gnu": "1.0.3", "@rolldown/binding-linux-x64-gnu": "1.0.3", "@rolldown/binding-linux-x64-musl": "1.0.3", "@rolldown/binding-openharmony-arm64": "1.0.3", "@rolldown/binding-wasm32-wasi": "1.0.3", "@rolldown/binding-win32-arm64-msvc": "1.0.3", "@rolldown/binding-win32-x64-msvc": "1.0.3" }, "bin": { "rolldown": "./bin/cli.mjs" } }, "sha512-i00lAJ2ks1BYr7rjNjKC7BcqAS7nVfiT3QX1SI5aY+AFHblCmaUf9OE9dbdzDvW6dJxbi2ZCZiy9v3CcwOiX3g=="], + + "siginfo": ["siginfo@2.0.0", "", {}, "sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g=="], + + "source-map-js": ["source-map-js@1.2.1", "", {}, "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA=="], + + "stackback": ["stackback@0.0.2", "", {}, "sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw=="], + + "std-env": ["std-env@4.1.0", "", {}, "sha512-Rq7ybcX2RuC55r9oaPVEW7/xu3tj8u4GeBYHBWCychFtzMIr86A7e3PPEBPT37sHStKX3+TiX/Fr/ACmJLVlLQ=="], + + "tinybench": ["tinybench@2.9.0", "", {}, "sha512-0+DUvqWMValLmha6lr4kD8iAMK1HzV0/aKnCtWb9v9641TnP/MFb7Pc2bxoxQjTXAErryXVgUOfv2YqNllqGeg=="], + + "tinyexec": ["tinyexec@1.2.4", "", {}, "sha512-SHf/r48b7vOrjve9PxJo3MN5v5yuyjHvdUcrQffT3WXMUfnGmHDVbC4k3sHJaJTgZCwpUplIaAo5ANtMyp3YHg=="], + + "tinyglobby": ["tinyglobby@0.2.17", "", { "dependencies": { "fdir": "^6.5.0", "picomatch": "^4.0.4" } }, "sha512-wXR/dYpcqKmfWpEdZjiKJOwCNFndD0DMnrW/cYjVGttEkBfVgcLFHoNrlj47mjOVic9yyNu65alsgF4NQyTa2g=="], + + "tinyrainbow": ["tinyrainbow@3.1.0", "", {}, 
"sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw=="], + + "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], + + "vite": ["vite@8.0.16", "", { "dependencies": { "lightningcss": "^1.32.0", "picomatch": "^4.0.4", "postcss": "^8.5.15", "rolldown": "1.0.3", "tinyglobby": "^0.2.17" }, "optionalDependencies": { "fsevents": "~2.3.3" }, "peerDependencies": { "@types/node": "^20.19.0 || >=22.12.0", "@vitejs/devtools": "^0.1.18", "esbuild": "^0.27.0 || ^0.28.0", "jiti": ">=1.21.0", "less": "^4.0.0", "sass": "^1.70.0", "sass-embedded": "^1.70.0", "stylus": ">=0.54.8", "sugarss": "^5.0.0", "terser": "^5.16.0", "tsx": "^4.8.1", "yaml": "^2.4.2" }, "optionalPeers": ["@types/node", "@vitejs/devtools", "esbuild", "jiti", "less", "sass", "sass-embedded", "stylus", "sugarss", "terser", "tsx", "yaml"], "bin": { "vite": "bin/vite.js" } }, "sha512-h9bXPmJichP5fLmVQo3PyaGSDE2n3aPuomeAlVRm0JLmt4rY6zmPKd59HYI4LNW8oTK7tlTsuC7l/m7awx9Jcw=="], + + "vitest": ["vitest@4.1.9", "", { "dependencies": { "@vitest/expect": "4.1.9", "@vitest/mocker": "4.1.9", "@vitest/pretty-format": "4.1.9", "@vitest/runner": "4.1.9", "@vitest/snapshot": "4.1.9", "@vitest/spy": "4.1.9", "@vitest/utils": "4.1.9", "es-module-lexer": "^2.0.0", "expect-type": "^1.3.0", "magic-string": "^0.30.21", "obug": "^2.1.1", "pathe": "^2.0.3", "picomatch": "^4.0.3", "std-env": "^4.0.0-rc.1", "tinybench": "^2.9.0", "tinyexec": "^1.0.2", "tinyglobby": "^0.2.15", "tinyrainbow": "^3.1.0", "vite": "^6.0.0 || ^7.0.0 || ^8.0.0", "why-is-node-running": "^2.3.0" }, "peerDependencies": { "@edge-runtime/vm": "*", "@opentelemetry/api": "^1.9.0", "@types/node": "^20.0.0 || ^22.0.0 || >=24.0.0", "@vitest/browser-playwright": "4.1.9", "@vitest/browser-preview": "4.1.9", "@vitest/browser-webdriverio": "4.1.9", "@vitest/coverage-istanbul": "4.1.9", "@vitest/coverage-v8": "4.1.9", "@vitest/ui": "4.1.9", "happy-dom": "*", 
"jsdom": "*" }, "optionalPeers": ["@edge-runtime/vm", "@opentelemetry/api", "@types/node", "@vitest/browser-playwright", "@vitest/browser-preview", "@vitest/browser-webdriverio", "@vitest/coverage-istanbul", "@vitest/coverage-v8", "@vitest/ui", "happy-dom", "jsdom"], "bin": { "vitest": "./vitest.mjs" } }, "sha512-nE3/LEyc0z87uHYLZebqCUOaJr2hdtuPp7BQ4BosVFnfltxgAvMG08NyrSGlPpOUWvR27c5flSmYFTNr78L9GQ=="], + + "why-is-node-running": ["why-is-node-running@2.3.0", "", { "dependencies": { "siginfo": "^2.0.0", "stackback": "0.0.2" }, "bin": { "why-is-node-running": "cli.js" } }, "sha512-hUrmaWBdVDcxvYqnyh09zunKzROWjbZTiNy8dBEjkS7ehEDQibXJ7XvlmtbwuTclUiIyN+CyXQD4Vmko8fNm8w=="], + } +} diff --git a/examples/features/vitest-workspace-grader/evals/dataset.eval.yaml b/examples/features/vitest-workspace-grader/evals/dataset.eval.yaml new file mode 100644 index 000000000..f1076b018 --- /dev/null +++ b/examples/features/vitest-workspace-grader/evals/dataset.eval.yaml @@ -0,0 +1,29 @@ +name: vitest-workspace-grader +description: Deterministic workspace grading with a Vitest verifier file. + +workspace: + template: ../workspace-template + +execution: + target: mock_agent + +tests: + - id: welcome-banner + criteria: >- + Add a welcome banner to app/page.tsx with ready status text and a + dashboard link. + input: >- + Update app/page.tsx so the page shows "Status: All systems ready", + includes the text "Open dashboard", links it to /dashboard, and leaves + no TODO marker behind. 
+ assertions: + - name: vitest-welcome-banner + type: code-grader + command: + [ + "bun", + "../../../../apps/cli/src/cli.ts", + "eval", + "vitest", + "../graders/welcome-banner.test.ts", + ] diff --git a/examples/features/vitest-workspace-grader/graders/welcome-banner.test.ts b/examples/features/vitest-workspace-grader/graders/welcome-banner.test.ts new file mode 100644 index 000000000..12fb09f7a --- /dev/null +++ b/examples/features/vitest-workspace-grader/graders/welcome-banner.test.ts @@ -0,0 +1,30 @@ +import { readFileSync } from 'node:fs'; +import { join } from 'node:path'; +import { describe, expect, it } from 'vitest'; + +function readWorkspaceFile(relativePath: string) { + return readFileSync( + join(process.env.AGENTV_WORKSPACE_PATH ?? process.cwd(), relativePath), + 'utf8', + ); +} + +describe('welcome banner', () => { + const page = () => readWorkspaceFile('app/page.tsx'); + + it('shows ready status text', () => { + expect(page()).toContain('Status: All systems ready'); + }); + + it('shows the dashboard call to action', () => { + expect(page()).toContain('Open dashboard'); + }); + + it('links the call to action to /dashboard', () => { + expect(page()).toMatch(/href=["']\/dashboard["']/); + }); + + it('does not leave TODO markers behind', () => { + expect(page()).not.toMatch(/TODO/i); + }); +}); diff --git a/examples/features/vitest-workspace-grader/package.json b/examples/features/vitest-workspace-grader/package.json new file mode 100644 index 000000000..5863d0f06 --- /dev/null +++ b/examples/features/vitest-workspace-grader/package.json @@ -0,0 +1,8 @@ +{ + "name": "agentv-example-vitest-workspace-grader", + "private": true, + "type": "module", + "devDependencies": { + "vitest": "^4.0.0" + } +} diff --git a/examples/features/vitest-workspace-grader/workspace-template/app/page.tsx b/examples/features/vitest-workspace-grader/workspace-template/app/page.tsx new file mode 100644 index 000000000..48dade169 --- /dev/null +++ 
b/examples/features/vitest-workspace-grader/workspace-template/app/page.tsx @@ -0,0 +1,8 @@ +export default function Page() { + return ( +
+

Welcome

+

TODO: add system status and dashboard link.

+
+ ); +} diff --git a/packages/sdk/README.md b/packages/sdk/README.md index cbe4a77a4..ea6c5cec8 100644 --- a/packages/sdk/README.md +++ b/packages/sdk/README.md @@ -56,6 +56,62 @@ export default defineCodeGrader(({ output, traceSummary }) => ({ Both functions handle stdin/stdout parsing, snake_case conversion, Zod validation, and error handling automatically. +### Vitest workspace verifiers (preferred deterministic workspace checks) + +Use normal Vitest tests when deterministic workspace checks can be expressed with `expect(...)`: + +```typescript +// graders/welcome-banner.test.ts +import { readFileSync } from 'node:fs'; +import { expect, it } from 'vitest'; + +it('links to the dashboard', () => { + const page = readFileSync('app/page.tsx', 'utf8'); + expect(page).toMatch(/href=["']\/dashboard["']/); +}); +``` + +Then reference the verifier directly from eval YAML through AgentV's built-in code-grader adapter: + +```yaml +assertions: + - name: vitest-welcome-banner + type: code-grader + command: [agentv, eval, vitest, graders/welcome-banner.test.ts] +``` + +The command reads the normal code-grader stdin payload, runs Vitest in `workspace_path`, maps each Vitest test to an AgentV assertion, and computes score as `passed / total`. 
+ +Use `defineVitestWorkspaceGrader` when embedding this adapter in a custom script or when you need custom command options: + +```typescript +#!/usr/bin/env bun +import { defineVitestWorkspaceGrader } from '@agentv/sdk'; + +export default defineVitestWorkspaceGrader({ + testFile: 'graders/welcome-banner.test.ts', + copyTestFilesToWorkspace: true, +}); +``` + +### defineWorkspaceGrader (small file checks) + +Use `defineWorkspaceGrader` when a deterministic grader needs to inspect files in the evaluated workspace: + +```typescript +#!/usr/bin/env bun +import { defineWorkspaceGrader } from '@agentv/sdk'; + +export default defineWorkspaceGrader(async ({ workspace }) => [ + await workspace.file('app/page.tsx').contains('Status: All systems ready'), + await workspace.file('app/page.tsx').contains('Open dashboard'), + await workspace.file('app/page.tsx').matches(/href=["']\/dashboard["']/), + await workspace.file('app/page.tsx').notMatches(/TODO/i), +]); +``` + +The helper resolves `workspace_path` or `AGENTV_WORKSPACE_PATH`, reads files relative to the workspace, returns AgentV assertion objects, and computes `score` as passed checks divided by total checks. Prefer Vitest verifiers for checks that naturally fit a test file; use this lower-level helper for tiny one-off graders or custom score shaping. 
+ ### defineEval (YAML-aligned `.eval.ts` authoring) ```typescript @@ -143,13 +199,15 @@ Python workflows should emit canonical YAML/JSONL or implement code graders over - `defineAssertion(handler)` - Define a custom assertion (pass/fail + optional score) - `defineCodeGrader(handler)` - Define a code grader (full score control) +- `defineVitestWorkspaceGrader(options)` - Embed the Vitest workspace verifier adapter in a custom script +- `defineWorkspaceGrader(handler)` - Define a workspace-aware code grader with file assertion helpers - `definePromptTemplate(handler)` - Define a dynamic prompt template - `defineEval(definition)` / `evalSuite(definition)` - Define a YAML-aligned `.eval.ts` suite - `graders` - Catalog of built-in AgentV grader config helpers - `containsGrader`, `equalsGrader`, `exactGrader`, `regexGrader`, `isJsonGrader`, `jsonGrader`, `rubricsGrader`, `llmGrader`, `codeGrader` - Named grader helper functions - `toEvalYamlObject(definition)` / `serializeEvalYaml(definition)` - Lower or serialize canonical eval YAML - `AssertionContext`, `AssertionScore` - Assertion types -- `CodeGraderInput`, `CodeGraderResult` - Code grader types +- `CodeGraderInput`, `CodeGraderResult`, `Workspace`, `WorkspaceAssertion` - Grader types - `TraceSummary`, `Message`, `ToolCall` - Trace data types - `createTargetClient()` - LLM target proxy for graders - `z` - Re-exported Zod for custom config schemas diff --git a/packages/sdk/src/index.ts b/packages/sdk/src/index.ts index fce87cd0f..b9b4c15dc 100644 --- a/packages/sdk/src/index.ts +++ b/packages/sdk/src/index.ts @@ -33,6 +33,30 @@ * })); * ``` * + * @example Vitest workspace verifier adapter (custom wrapper form) + * ```typescript + * #!/usr/bin/env bun + * import { defineVitestWorkspaceGrader } from '@agentv/sdk'; + * + * export default defineVitestWorkspaceGrader({ + * testFile: 'graders/welcome-banner.test.ts', + * copyTestFilesToWorkspace: true, + * }); + * ``` + * + * @example Workspace grader (small file checks) + 
* ```typescript + * #!/usr/bin/env bun + * import { defineWorkspaceGrader } from '@agentv/sdk'; + * + * export default defineWorkspaceGrader(async ({ workspace }) => [ + * await workspace.file('app/page.tsx').contains('Status: All systems ready'), + * await workspace.file('app/page.tsx').contains('Open dashboard'), + * await workspace.file('app/page.tsx').matches(/href=["']\/dashboard["']/), + * await workspace.file('app/page.tsx').notMatches(/TODO/i), + * ]); + * ``` + * * @packageDocumentation */ @@ -164,6 +188,29 @@ export { type TargetInvokeResponse, } from './target-client.js'; +// Re-export workspace grader helpers +export { + createWorkspace, + defineWorkspaceGrader, + normalizeWorkspaceGraderResult, + runWorkspaceGrader, + type Workspace, + type WorkspaceAssertion, + type WorkspaceFile, + type WorkspaceFileAssertionOptions, + type WorkspaceGraderContext, + type WorkspaceGraderHandler, + type WorkspaceGraderReturn, +} from './workspace.js'; + +// Re-export Vitest workspace verifier adapter +export { + defineVitestWorkspaceGrader, + runVitestWorkspaceGrader, + vitestReportToCodeGraderResult, + type VitestWorkspaceGraderOptions, +} from './vitest.js'; + // Re-export Zod for typed config support export { z } from 'zod'; @@ -179,6 +226,7 @@ import { type AssertionHandler, runAssertion } from './assertion.js'; import { type PromptTemplateHandler, runPromptTemplate } from './prompt-template.js'; import { type CodeGraderHandler, runCodeGrader } from './runtime.js'; +export { runCodeGrader }; export type { CodeGraderHandler }; export type { PromptTemplateHandler }; diff --git a/packages/sdk/src/vitest.ts b/packages/sdk/src/vitest.ts new file mode 100644 index 000000000..c1df6ed96 --- /dev/null +++ b/packages/sdk/src/vitest.ts @@ -0,0 +1,408 @@ +/** + * Vitest workspace verifier adapter. + * + * This module keeps deterministic workspace verification in familiar Vitest + * tests while translating the JSON reporter output into AgentV's code-grader + * result contract. 
+ */ +import { spawn } from 'node:child_process'; +import { copyFile, mkdtemp, readFile, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import nodePath from 'node:path'; + +import { runCodeGrader } from './runtime.js'; +import { type CodeGraderInput, type CodeGraderResult, CodeGraderResultSchema } from './schemas.js'; + +export interface VitestWorkspaceGraderOptions { + /** + * Vitest verifier file(s). By default these are relative to the prepared + * workspace. When `copyTestFilesToWorkspace` is true, relative paths resolve + * from `testFileRoot` instead. + * + * When provided without `command`, the adapter runs + * `bunx vitest run --reporter=json --outputFile `. + */ + readonly testFile?: string | readonly string[]; + /** + * Copy `testFile` entries into a temporary directory inside the workspace + * before running Vitest. Use this for hidden verifier files that live beside + * the eval instead of inside the prepared workspace. + */ + readonly copyTestFilesToWorkspace?: boolean; + /** + * Base directory for copied `testFile` entries. Defaults to `process.cwd()`. + */ + readonly testFileRoot?: string; + /** + * Full command to run. Use this for package scripts such as + * `["bun", "run", "verify:workspace"]`. + */ + readonly command?: readonly string[]; + /** + * Base Vitest command used with `testFile`. Defaults to + * `["bunx", "vitest", "run"]`. + */ + readonly vitestCommand?: readonly string[]; + /** Workspace-relative directory to run the command in. Defaults to workspace root. */ + readonly cwd?: string; + /** Append `--reporter=json --outputFile ` to the command. Defaults to true for testFile mode. */ + readonly appendReporterArgs?: boolean; + /** Read Vitest JSON from this path instead of stdout. Relative paths resolve under `cwd`. 
*/ + readonly outputFile?: string; + readonly timeoutMs?: number; + readonly env?: Readonly>; + readonly passWithNoTests?: boolean; +} + +interface CommandResult { + readonly exitCode: number | null; + readonly stdout: string; + readonly stderr: string; +} + +interface VitestAssertionResult { + readonly ancestorTitles?: readonly string[]; + readonly fullName?: string; + readonly title?: string; + readonly status?: string; + readonly failureMessages?: readonly string[]; + readonly duration?: number; +} + +interface VitestFileResult { + readonly name?: string; + readonly assertionResults?: readonly VitestAssertionResult[]; +} + +interface VitestJsonReport { + readonly success?: boolean; + readonly numTotalTests?: number; + readonly numPassedTests?: number; + readonly numFailedTests?: number; + readonly numPendingTests?: number; + readonly numTodoTests?: number; + readonly testResults?: readonly VitestFileResult[]; +} + +function workspacePathFrom(input: CodeGraderInput): string | undefined { + const workspacePath = input.workspacePath ?? process.env.AGENTV_WORKSPACE_PATH; + return workspacePath?.trim() ? 
/**
 * Resolves a caller-supplied relative path against a base directory and
 * guarantees the result does not escape that directory.
 *
 * @param workspacePath - Base directory (workspace root or resolved `cwd`).
 * @param relativePath - Path to resolve; must be non-blank and relative.
 * @param label - Option name used in error messages (for example `cwd`).
 * @param options - Set `allowRoot: true` to accept paths that resolve to the
 *   base directory itself (such as `.`).
 * @returns The absolute, normalized path inside the base directory.
 * @throws Error when the path is blank, absolute, resolves to the base
 *   directory without `allowRoot`, or points outside the base directory.
 */
function resolveInsideWorkspace(
  workspacePath: string,
  relativePath: string,
  label: string,
  options: { readonly allowRoot?: boolean } = {},
): string {
  if (!relativePath.trim()) {
    throw new Error(`${label} must not be empty.`);
  }

  if (nodePath.isAbsolute(relativePath)) {
    throw new Error(`${label} must be relative to the workspace: ${relativePath}`);
  }

  const root = nodePath.resolve(workspacePath);
  const resolvedPath = nodePath.resolve(root, relativePath);
  const relativeToRoot = nodePath.relative(root, resolvedPath);
  if (relativeToRoot === '' && options.allowRoot === true) {
    return resolvedPath;
  }

  // Compare whole path segments: a bare prefix check on ".." would also reject
  // legitimate in-workspace names such as "..cache/report.json".
  const escapesRoot =
    relativeToRoot === '..' || relativeToRoot.startsWith(`..${nodePath.sep}`);

  // nodePath.relative() can return an absolute path when the two paths have no
  // common root (different drives on Windows), hence the isAbsolute check.
  if (relativeToRoot === '' || escapesRoot || nodePath.isAbsolute(relativeToRoot)) {
    throw new Error(`${label} must stay inside the workspace: ${relativePath}`);
  }

  return resolvedPath;
}
/**
 * Parses JSON out of command output that may be surrounded by other text.
 *
 * The whole trimmed text is tried first. If that fails, the span from the
 * first `{` to the last `}` is parsed instead, so log lines printed before or
 * after the report do not break parsing.
 *
 * @param text - Raw text (stdout or the contents of the report file).
 * @returns The parsed value, or `undefined` when `text` is blank.
 * @throws Error when no parseable JSON object can be found in `text`.
 */
function parseJsonObjectFromText(text: string): unknown {
  const trimmed = text.trim();
  if (!trimmed) {
    return undefined;
  }

  try {
    return JSON.parse(trimmed);
  } catch {
    // Not pure JSON; fall through and look for an embedded object.
  }

  const start = trimmed.indexOf('{');
  const end = trimmed.lastIndexOf('}');
  if (start < 0 || end <= start) {
    throw new Error('Vitest output did not contain a JSON object.');
  }

  try {
    return JSON.parse(trimmed.slice(start, end + 1));
  } catch (error) {
    // Surface a descriptive message instead of a bare SyntaxError, because the
    // message ends up verbatim in the grader's evidence.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Vitest output did not contain a valid JSON object: ${reason}`);
  }
}
0, + num_todo_tests: report.numTodoTests ?? 0, + }, + }); + } + + const passedCount = assertions.filter((item) => item.passed).length; + return CodeGraderResultSchema.parse({ + score: passedCount / assertions.length, + assertions, + details: { + vitest_success: report.success ?? passedCount === assertions.length, + num_total_tests: report.numTotalTests ?? assertions.length, + num_passed_tests: report.numPassedTests ?? passedCount, + num_failed_tests: report.numFailedTests ?? assertions.length - passedCount, + num_pending_tests: report.numPendingTests ?? 0, + num_todo_tests: report.numTodoTests ?? 0, + }, + }); +} + +function runCommand( + command: readonly string[], + options: { + readonly cwd: string; + readonly timeoutMs?: number; + readonly env?: Readonly>; + }, +): Promise { + if (command.length === 0) { + return Promise.reject(new Error('Vitest command must not be empty.')); + } + + return new Promise((resolve, reject) => { + const child = spawn(command[0], command.slice(1), { + cwd: options.cwd, + env: { ...process.env, ...options.env }, + stdio: ['ignore', 'pipe', 'pipe'], + }); + + let stdout = ''; + let stderr = ''; + let timedOut = false; + const timeout = + options.timeoutMs !== undefined + ? 
/**
 * Copies verifier files into a fresh temporary directory under `cwd` so Vitest
 * can run them from inside the workspace.
 *
 * Each copy is named `<index>-<basename>` so files with the same basename from
 * different directories do not overwrite each other.
 *
 * @param testFiles - Source test files; relative entries resolve from
 *   `options.testFileRoot` (default `process.cwd()`).
 * @param options - Supplies the optional `testFileRoot`.
 * @param cwd - Directory in which the temporary directory is created and to
 *   which the returned paths are relative.
 * @returns The copied files as paths relative to `cwd`, plus the temporary
 *   directory the caller must remove. When `testFiles` is empty nothing is
 *   created and the input list is returned unchanged.
 * @throws Rethrows the first copy/resolve failure after removing the temporary
 *   directory, so a failed copy never leaves files behind in the workspace.
 */
async function copyTestFilesIntoWorkspace(
  testFiles: readonly string[],
  options: Pick<VitestWorkspaceGraderOptions, 'testFileRoot'>,
  cwd: string,
): Promise<{ readonly testFiles: readonly string[]; readonly tempDir?: string }> {
  if (testFiles.length === 0) {
    return { testFiles };
  }

  const tempDir = await mkdtemp(nodePath.join(cwd, '.agentv-vitest-'));
  const testFileRoot = nodePath.resolve(options.testFileRoot ?? process.cwd());

  // Wait for every copy to settle before cleaning up, so no in-flight copy can
  // race with the removal of the directory.
  const outcomes = await Promise.allSettled(
    testFiles.map(async (testFile, index) => {
      const sourcePath = resolveSourceTestFile(testFileRoot, testFile);
      const destinationPath = nodePath.join(tempDir, `${index}-${nodePath.basename(testFile)}`);
      await copyFile(sourcePath, destinationPath);
      return nodePath.relative(cwd, destinationPath);
    }),
  );

  const copiedFiles: string[] = [];
  let failure: PromiseRejectedResult | undefined;
  for (const outcome of outcomes) {
    if (outcome.status === 'fulfilled') {
      copiedFiles.push(outcome.value);
    } else if (failure === undefined) {
      failure = outcome;
    }
  }

  if (failure !== undefined) {
    // The caller only learns about tempDir from the return value, so it cannot
    // clean up after a throw; remove the directory here (best effort).
    await rm(tempDir, { recursive: true, force: true }).catch(() => {});
    throw failure.reason;
  }

  return { testFiles: copiedFiles, tempDir };
}
/**
 * Runs Vitest (or a custom command) inside the evaluated workspace and
 * converts its JSON report into a code-grader result.
 *
 * Steps: resolve the workspace path and working directory, optionally copy the
 * verifier files into the workspace, build the command, optionally append
 * `--reporter=json --outputFile=<temp file>`, run it, then read and convert the
 * report. Temporary directories created along the way are always removed.
 *
 * This function does not throw for operational failures: a missing workspace
 * path, an invalid option, a failed command, or an unreadable report all
 * produce a result with `score: 0` and a single failed assertion whose
 * `evidence` carries the error message.
 *
 * @param options - Adapter configuration (test files, command, cwd, timeout, ...).
 * @param input - Parsed code-grader input; `workspacePath` falls back to the
 *   `AGENTV_WORKSPACE_PATH` environment variable.
 * @returns The code-grader result derived from the Vitest report.
 */
export async function runVitestWorkspaceGrader(
  options: VitestWorkspaceGraderOptions,
  input: CodeGraderInput,
): Promise<CodeGraderResult> {
  const workspacePath = workspacePathFrom(input);
  if (!workspacePath) {
    return {
      score: 0,
      assertions: [
        {
          text: 'Vitest workspace verifier requires workspace_path',
          passed: false,
          evidence: 'Configure workspace in the eval YAML so AgentV can pass workspace_path.',
        },
      ],
    };
  }

  const tempDirs: string[] = [];
  try {
    const cwd = options.cwd
      ? resolveInsideWorkspace(workspacePath, options.cwd, 'cwd', { allowRoot: true })
      : workspacePath;
    const testFiles = normalizeTestFiles(options.testFile);
    const preparedTestFiles =
      options.copyTestFilesToWorkspace === true
        ? await copyTestFilesIntoWorkspace(testFiles, options, cwd)
        : { testFiles };
    if (preparedTestFiles.tempDir) {
      tempDirs.push(preparedTestFiles.tempDir);
    }

    const command = buildCommand(options, preparedTestFiles.testFiles);
    // Reporter arguments are appended by default only when the adapter builds
    // the Vitest command itself (no custom `command`).
    const appendReporterArgs = options.appendReporterArgs ?? options.command === undefined;
    let outputFile = options.outputFile
      ? resolveInsideWorkspace(cwd, options.outputFile, 'outputFile')
      : undefined;

    if (appendReporterArgs) {
      // The report goes to a temp file outside the workspace; this takes
      // precedence over a configured `outputFile`.
      const tempDir = await mkdtemp(nodePath.join(tmpdir(), 'agentv-vitest-'));
      tempDirs.push(tempDir);
      outputFile = nodePath.join(tempDir, 'results.json');
      command.push('--reporter=json', `--outputFile=${outputFile}`);
    }

    const result = await runCommand(command, {
      cwd,
      timeoutMs: options.timeoutMs,
      env: {
        AGENTV_WORKSPACE_PATH: workspacePath,
        ...(outputFile !== undefined ? { AGENTV_VITEST_JSON_PATH: outputFile } : {}),
        ...options.env,
      },
    });

    let report: VitestJsonReport;
    try {
      report = await readVitestReport(result, outputFile);
    } catch (error) {
      // When the command fails before writing a report, the read error alone
      // (typically ENOENT on the report file) hides the real cause. Attach the
      // exit code and captured stderr so the evidence is actionable.
      const reason = error instanceof Error ? error.message : String(error);
      const stderr = result.stderr.trim();
      throw new Error(
        [
          `Could not read Vitest JSON report (exit code ${result.exitCode ?? 'unknown'}): ${reason}`,
          ...(stderr ? [`stderr:\n${truncate(stderr)}`] : []),
        ].join('\n'),
      );
    }

    return vitestReportToCodeGraderResult(report, options);
  } catch (error) {
    return {
      score: 0,
      assertions: [
        {
          text: 'Vitest workspace verifier failed to run',
          passed: false,
          evidence: error instanceof Error ? error.message : String(error),
        },
      ],
    };
  } finally {
    // Best-effort cleanup, most recently created directory first.
    for (const tempDir of tempDirs.reverse()) {
      await rm(tempDir, { recursive: true, force: true }).catch(() => {});
    }
  }
}
+ */ +import { readFile, stat } from 'node:fs/promises'; +import nodePath from 'node:path'; + +import { runCodeGrader } from './runtime.js'; +import { type CodeGraderInput, type CodeGraderResult, CodeGraderResultSchema } from './schemas.js'; + +export interface WorkspaceAssertion { + readonly text: string; + readonly passed: boolean; + readonly evidence?: string; +} + +type Awaitable = T | Promise; + +export type WorkspaceGraderReturn = + | CodeGraderResult + | WorkspaceAssertion + | readonly Awaitable[]; + +export interface WorkspaceFileAssertionOptions { + readonly text?: string; +} + +export interface WorkspaceFile { + readonly path: string; + readonly absolutePath?: string; + readText(): Promise; + exists(options?: WorkspaceFileAssertionOptions): Promise; + contains(expected: string, options?: WorkspaceFileAssertionOptions): Promise; + notContains( + expected: string, + options?: WorkspaceFileAssertionOptions, + ): Promise; + matches(pattern: RegExp, options?: WorkspaceFileAssertionOptions): Promise; + notMatches(pattern: RegExp, options?: WorkspaceFileAssertionOptions): Promise; +} + +export interface Workspace { + readonly path?: string; + file(relativePath: string): WorkspaceFile; + readText(relativePath: string): Promise; +} + +export type WorkspaceGraderContext = CodeGraderInput & { + readonly workspace: Workspace; +}; + +export type WorkspaceGraderHandler = ( + context: WorkspaceGraderContext, +) => WorkspaceGraderReturn | Promise; + +interface ResolvedWorkspacePath { + readonly displayPath: string; + readonly absolutePath?: string; + readonly error?: string; +} + +function workspacePathFrom(input: CodeGraderInput): string | undefined { + const workspacePath = input.workspacePath ?? 
process.env.AGENTV_WORKSPACE_PATH; + if (!workspacePath?.trim()) { + return undefined; + } + return workspacePath; +} + +function normalizeDisplayPath(relativePath: string): string { + return relativePath.split(nodePath.sep).join('/'); +} + +function resolveWorkspacePath(workspacePath: string | undefined, relativePath: string) { + const displayPath = normalizeDisplayPath(relativePath); + + if (!workspacePath) { + return { + displayPath, + error: 'Workspace path is not available. Configure workspace in the eval YAML.', + }; + } + + if (!relativePath.trim()) { + return { displayPath, error: 'Workspace file path must not be empty.' }; + } + + if (nodePath.isAbsolute(relativePath)) { + return { + displayPath, + error: `Workspace file path must be relative: ${displayPath}`, + }; + } + + const root = nodePath.resolve(workspacePath); + const absolutePath = nodePath.resolve(root, relativePath); + const relativeToRoot = nodePath.relative(root, absolutePath); + + if ( + relativeToRoot === '' || + relativeToRoot.startsWith('..') || + nodePath.isAbsolute(relativeToRoot) + ) { + return { + displayPath, + error: `Workspace file path must stay inside the workspace: ${displayPath}`, + }; + } + + return { + displayPath: normalizeDisplayPath(relativeToRoot), + absolutePath, + }; +} + +function assertion(text: string, passed: boolean, evidence?: string): WorkspaceAssertion { + return { + text, + passed, + ...(evidence !== undefined ? { evidence } : {}), + }; +} + +function quote(value: string): string { + return JSON.stringify(value); +} + +function regexLabel(pattern: RegExp): string { + return `/${pattern.source}/${pattern.flags}`; +} + +async function readFileForAssertion( + file: WorkspaceFile, +): Promise<{ readonly content: string } | { readonly error: string }> { + try { + return { content: await file.readText() }; + } catch (error) { + return { error: error instanceof Error ? 
error.message : String(error) }; + } +} + +function isCodeGraderResult(value: WorkspaceGraderReturn): value is CodeGraderResult { + return !Array.isArray(value) && typeof value === 'object' && value !== null && 'score' in value; +} + +export function createWorkspace(input: CodeGraderInput): Workspace { + const workspacePath = workspacePathFrom(input); + const textCache = new Map>(); + + async function readText(relativePath: string) { + const resolved = resolveWorkspacePath(workspacePath, relativePath); + if (resolved.error) { + throw new Error(resolved.error); + } + + const absolutePath = resolved.absolutePath; + if (!absolutePath) { + throw new Error(`Unable to resolve workspace file: ${resolved.displayPath}`); + } + + let pendingRead = textCache.get(absolutePath); + if (!pendingRead) { + pendingRead = readFile(absolutePath, 'utf8'); + textCache.set(absolutePath, pendingRead); + } + return pendingRead; + } + + function file(relativePath: string): WorkspaceFile { + const resolved = resolveWorkspacePath(workspacePath, relativePath); + const label = resolved.displayPath; + + return { + path: label, + ...(resolved.absolutePath !== undefined ? { absolutePath: resolved.absolutePath } : {}), + + readText() { + return readText(relativePath); + }, + + async exists(options: WorkspaceFileAssertionOptions = {}) { + const text = options.text ?? `${label} exists`; + if (resolved.error) { + return assertion(text, false, resolved.error); + } + + try { + const fileStat = await stat(resolved.absolutePath as string); + if (fileStat.isFile()) { + return assertion(text, true); + } + return assertion(text, false, `${label} exists but is not a file.`); + } catch { + return assertion(text, false, `${label} does not exist.`); + } + }, + + async contains(expected: string, options: WorkspaceFileAssertionOptions = {}) { + const text = options.text ?? 
`${label} contains ${quote(expected)}`; + const content = await readFileForAssertion(this); + if ('error' in content) { + return assertion(text, false, content.error); + } + + const passed = content.content.includes(expected); + return assertion( + text, + passed, + passed ? undefined : `${label} is missing ${quote(expected)}.`, + ); + }, + + async notContains(expected: string, options: WorkspaceFileAssertionOptions = {}) { + const text = options.text ?? `${label} does not contain ${quote(expected)}`; + const content = await readFileForAssertion(this); + if ('error' in content) { + return assertion(text, false, content.error); + } + + const passed = !content.content.includes(expected); + return assertion( + text, + passed, + passed ? undefined : `${label} contains unexpected text ${quote(expected)}.`, + ); + }, + + async matches(pattern: RegExp, options: WorkspaceFileAssertionOptions = {}) { + const text = options.text ?? `${label} matches ${regexLabel(pattern)}`; + const content = await readFileForAssertion(this); + if ('error' in content) { + return assertion(text, false, content.error); + } + + pattern.lastIndex = 0; + const passed = pattern.test(content.content); + return assertion( + text, + passed, + passed ? undefined : `${label} does not match ${regexLabel(pattern)}.`, + ); + }, + + async notMatches(pattern: RegExp, options: WorkspaceFileAssertionOptions = {}) { + const text = options.text ?? `${label} does not match ${regexLabel(pattern)}`; + const content = await readFileForAssertion(this); + if ('error' in content) { + return assertion(text, false, content.error); + } + + pattern.lastIndex = 0; + const passed = !pattern.test(content.content); + return assertion( + text, + passed, + passed ? undefined : `${label} matches unexpected pattern ${regexLabel(pattern)}.`, + ); + }, + }; + } + + return { + ...(workspacePath !== undefined ? 
{ path: workspacePath } : {}), + file, + readText, + }; +} + +export async function normalizeWorkspaceGraderResult( + result: WorkspaceGraderReturn, +): Promise { + if (isCodeGraderResult(result)) { + return CodeGraderResultSchema.parse(result); + } + + const assertions = Array.isArray(result) ? await Promise.all(result) : [result]; + const passed = assertions.filter((item) => item.passed).length; + + return CodeGraderResultSchema.parse({ + score: assertions.length > 0 ? passed / assertions.length : 0, + assertions, + }); +} + +export async function runWorkspaceGrader( + handler: WorkspaceGraderHandler, + input: CodeGraderInput, +): Promise { + return normalizeWorkspaceGraderResult( + await handler({ + ...input, + workspace: createWorkspace(input), + }), + ); +} + +export function defineWorkspaceGrader(handler: WorkspaceGraderHandler): void { + runCodeGrader((input) => runWorkspaceGrader(handler, input)); +} diff --git a/packages/sdk/test/vitest-workspace-grader.test.ts b/packages/sdk/test/vitest-workspace-grader.test.ts new file mode 100644 index 000000000..f4f78736e --- /dev/null +++ b/packages/sdk/test/vitest-workspace-grader.test.ts @@ -0,0 +1,211 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { mkdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + +import { CodeGraderInputSchema } from '../src/schemas.js'; +import { runVitestWorkspaceGrader, vitestReportToCodeGraderResult } from '../src/vitest.js'; + +const mixedVitestReport = { + success: false, + numTotalTests: 2, + numPassedTests: 1, + numFailedTests: 1, + numPendingTests: 0, + numTodoTests: 0, + testResults: [ + { + name: '/workspace/verifiers/welcome-banner.test.ts', + assertionResults: [ + { + ancestorTitles: ['welcome banner'], + fullName: 'welcome banner contains status', + status: 'passed', + title: 'contains status', + duration: 3, + failureMessages: [], + }, + { + ancestorTitles: 
['welcome banner'], + fullName: 'welcome banner links to dashboard', + status: 'failed', + title: 'links to dashboard', + duration: 4, + failureMessages: ['AssertionError: expected href to equal /dashboard'], + }, + ], + }, + ], +}; + +function buildInput(overrides?: Record) { + return CodeGraderInputSchema.parse({ + criteria: 'Verify the workspace with Vitest', + expectedOutput: [], + inputFiles: [], + input: [{ role: 'user', content: 'Update the workspace' }], + ...overrides, + }); +} + +describe('Vitest workspace grader adapter', () => { + let tmpDir: string; + let previousWorkspaceEnv: string | undefined; + + beforeEach(() => { + tmpDir = join(tmpdir(), `agentv-vitest-grader-${crypto.randomUUID()}`); + mkdirSync(tmpDir, { recursive: true }); + previousWorkspaceEnv = process.env.AGENTV_WORKSPACE_PATH; + }); + + afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); + if (previousWorkspaceEnv === undefined) { + process.env.AGENTV_WORKSPACE_PATH = undefined; + } else { + process.env.AGENTV_WORKSPACE_PATH = previousWorkspaceEnv; + } + }); + + it('maps individual Vitest test outcomes to AgentV assertions', () => { + const result = vitestReportToCodeGraderResult(mixedVitestReport); + + expect(result.score).toBe(0.5); + expect(result.assertions).toEqual([ + { text: 'welcome banner contains status', passed: true }, + { + text: 'welcome banner links to dashboard', + passed: false, + evidence: 'AssertionError: expected href to equal /dashboard', + }, + ]); + expect(result.details).toEqual({ + vitest_success: false, + num_total_tests: 2, + num_passed_tests: 1, + num_failed_tests: 1, + num_pending_tests: 0, + num_todo_tests: 0, + }); + }); + + it('runs a verifier-file command with JSON reporter args and reads the output file', async () => { + const fakeVitest = join(tmpDir, 'fake-vitest.ts'); + writeFileSync( + fakeVitest, + `import { writeFileSync } from 'node:fs'; + +const args = process.argv.slice(2); +writeFileSync('vitest-argv.json', 
JSON.stringify(args)); +const outputArg = args.find((arg) => arg.startsWith('--outputFile=')); +if (!outputArg) throw new Error('missing outputFile arg'); +writeFileSync(outputArg.slice('--outputFile='.length), JSON.stringify(${JSON.stringify(mixedVitestReport)})); +process.exit(1); +`, + ); + + const result = await runVitestWorkspaceGrader( + { + vitestCommand: ['bun', fakeVitest], + testFile: 'verifiers/welcome-banner.test.ts', + }, + buildInput({ workspacePath: tmpDir }), + ); + + expect(result.score).toBe(0.5); + expect(result.assertions.map((item) => item.text)).toEqual([ + 'welcome banner contains status', + 'welcome banner links to dashboard', + ]); + + const argv = JSON.parse(readFileSync(join(tmpDir, 'vitest-argv.json'), 'utf8')) as string[]; + expect(argv).toContain('verifiers/welcome-banner.test.ts'); + expect(argv).toContain('--reporter=json'); + expect(argv.some((arg) => arg.startsWith('--outputFile='))).toBe(true); + }); + + it('copies hidden verifier files into the workspace before running Vitest', async () => { + const testFileRoot = join(tmpDir, 'graders'); + mkdirSync(testFileRoot, { recursive: true }); + writeFileSync( + join(testFileRoot, 'welcome-banner.test.ts'), + 'import { expect, it } from "vitest";\n', + ); + + const fakeVitest = join(tmpDir, 'fake-vitest-copy.ts'); + writeFileSync( + fakeVitest, + `import { existsSync, writeFileSync } from 'node:fs'; +import { isAbsolute, join } from 'node:path'; + +const args = process.argv.slice(2); +const testFile = args[0]; +writeFileSync('vitest-test-file.json', JSON.stringify({ + testFile, + exists: existsSync(join(process.cwd(), testFile)), + isAbsolute: isAbsolute(testFile), +})); +const outputArg = args.find((arg) => arg.startsWith('--outputFile=')); +if (!outputArg) throw new Error('missing outputFile arg'); +writeFileSync(outputArg.slice('--outputFile='.length), JSON.stringify(${JSON.stringify(mixedVitestReport)})); +process.exit(1); +`, + ); + + const result = await runVitestWorkspaceGrader( + 
{ + vitestCommand: ['bun', fakeVitest], + testFile: 'welcome-banner.test.ts', + testFileRoot, + copyTestFilesToWorkspace: true, + }, + buildInput({ workspacePath: tmpDir }), + ); + + expect(result.score).toBe(0.5); + const copied = JSON.parse(readFileSync(join(tmpDir, 'vitest-test-file.json'), 'utf8')) as { + testFile: string; + exists: boolean; + isAbsolute: boolean; + }; + expect(copied.testFile).toMatch(/^\.agentv-vitest-.+\/0-welcome-banner\.test\.ts$/); + expect(copied.exists).toBe(true); + expect(copied.isAbsolute).toBe(false); + }); + + it('runs a full Vitest command and parses JSON from stdout', async () => { + const fakeVitest = join(tmpDir, 'fake-vitest-stdout.ts'); + writeFileSync( + fakeVitest, + `console.log(JSON.stringify(${JSON.stringify(mixedVitestReport)})); +process.exit(1); +`, + ); + + const result = await runVitestWorkspaceGrader( + { + command: ['bun', fakeVitest], + appendReporterArgs: false, + }, + buildInput({ workspacePath: tmpDir }), + ); + + expect(result.score).toBe(0.5); + expect(result.assertions[1].passed).toBe(false); + }); + + it('returns a failed AgentV result when workspace_path is unavailable', async () => { + process.env.AGENTV_WORKSPACE_PATH = undefined; + + const result = await runVitestWorkspaceGrader( + { testFile: 'verifiers/welcome-banner.test.ts' }, + buildInput(), + ); + + expect(result.score).toBe(0); + expect(result.assertions[0]).toMatchObject({ + text: 'Vitest workspace verifier requires workspace_path', + passed: false, + }); + }); +}); diff --git a/packages/sdk/test/workspace-grader.test.ts b/packages/sdk/test/workspace-grader.test.ts new file mode 100644 index 000000000..c6feb2374 --- /dev/null +++ b/packages/sdk/test/workspace-grader.test.ts @@ -0,0 +1,141 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { mkdirSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + +import { CodeGraderInputSchema } from 
'../src/schemas.js'; +import { + createWorkspace, + normalizeWorkspaceGraderResult, + runWorkspaceGrader, +} from '../src/workspace.js'; + +function buildInput(overrides?: Record) { + return CodeGraderInputSchema.parse({ + criteria: 'Verify the workspace', + expectedOutput: [], + inputFiles: [], + input: [{ role: 'user', content: 'Update the workspace' }], + ...overrides, + }); +} + +describe('workspace grader helpers', () => { + let tmpDir: string; + let previousWorkspaceEnv: string | undefined; + + beforeEach(() => { + tmpDir = join(tmpdir(), `agentv-workspace-grader-${crypto.randomUUID()}`); + mkdirSync(join(tmpDir, 'app'), { recursive: true }); + previousWorkspaceEnv = process.env.AGENTV_WORKSPACE_PATH; + }); + + afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); + if (previousWorkspaceEnv === undefined) { + process.env.AGENTV_WORKSPACE_PATH = undefined; + } else { + process.env.AGENTV_WORKSPACE_PATH = previousWorkspaceEnv; + } + }); + + it('runs compact workspace file assertions and aggregates passing checks', async () => { + writeFileSync( + join(tmpDir, 'app/page.tsx'), + '
Status: All systems ready Open dashboard
', + ); + + const result = await runWorkspaceGrader( + async ({ workspace }) => [ + await workspace.file('app/page.tsx').contains('Status: All systems ready'), + await workspace.file('app/page.tsx').contains('Open dashboard'), + await workspace.file('app/page.tsx').matches(/href=["']\/dashboard["']/), + await workspace.file('app/page.tsx').notMatches(/TODO/i), + ], + buildInput({ workspacePath: tmpDir }), + ); + + expect(result.score).toBe(1); + expect(result.assertions).toHaveLength(4); + expect(result.assertions.every((item) => item.passed)).toBe(true); + }); + + it('scores failed file assertions by passed assertion count', async () => { + writeFileSync(join(tmpDir, 'app/page.tsx'), '
Hello TODO
'); + + const result = await runWorkspaceGrader( + async ({ workspace }) => [ + workspace.file('app/page.tsx').contains('Hello'), + workspace.file('app/page.tsx').contains('Open dashboard'), + workspace.file('app/page.tsx').notMatches(/TODO/i), + workspace.file('app/missing.tsx').contains('anything'), + ], + buildInput({ workspacePath: tmpDir }), + ); + + expect(result.score).toBe(0.25); + expect(result.assertions.map((item) => item.passed)).toEqual([true, false, false, false]); + expect(result.assertions[1].evidence).toContain('Open dashboard'); + expect(result.assertions[3].evidence).toContain('no such file'); + }); + + it('uses AGENTV_WORKSPACE_PATH when the stdin payload omits workspacePath', async () => { + process.env.AGENTV_WORKSPACE_PATH = tmpDir; + writeFileSync(join(tmpDir, 'app/page.tsx'), 'Ready'); + + const result = await runWorkspaceGrader( + async ({ workspace }) => [ + { text: 'Workspace env fallback is exposed', passed: workspace.path === tmpDir }, + await workspace.file('app/page.tsx').contains('Ready'), + ], + buildInput(), + ); + + expect(result.score).toBe(1); + }); + + it('returns failed assertions instead of requiring manual workspace path checks', async () => { + process.env.AGENTV_WORKSPACE_PATH = undefined; + + const result = await runWorkspaceGrader( + async ({ workspace }) => [await workspace.file('app/page.tsx').contains('Ready')], + buildInput(), + ); + + expect(result.score).toBe(0); + expect(result.assertions[0]).toMatchObject({ + text: 'app/page.tsx contains "Ready"', + passed: false, + }); + expect(result.assertions[0].evidence).toContain('Workspace path is not available'); + }); + + it('rejects file paths outside the workspace', async () => { + const workspace = createWorkspace(buildInput({ workspacePath: tmpDir })); + + const result = await normalizeWorkspaceGraderResult([ + await workspace.file('../outside.txt').exists(), + ]); + + expect(result.score).toBe(0); + expect(result.assertions[0].passed).toBe(false); + 
expect(result.assertions[0].evidence).toContain('inside the workspace'); + }); + + it('passes through explicit CodeGraderResult objects', async () => { + const result = await runWorkspaceGrader( + () => ({ + score: 0.75, + assertions: [{ text: 'custom weighted result', passed: true }], + details: { matched: 3, total: 4 }, + }), + buildInput({ workspacePath: tmpDir }), + ); + + expect(result).toEqual({ + score: 0.75, + assertions: [{ text: 'custom weighted result', passed: true }], + details: { matched: 3, total: 4 }, + }); + }); +});