Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions apps/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@
"files": ["dist", "README.md"],
"scripts": {
"dev": "bun src/cli.ts",
"build": "tsup && bun run copy-readme",
"build": "(cd ../../packages/sdk && bun run build) && tsup && bun run copy-readme",
"copy-readme": "bun -e \"import { cpSync } from 'fs'; cpSync('../../README.md', 'README.md')\"",
"prepublishOnly": "node -e \"if(process.env.ALLOW_PUBLISH!=='1'){console.error('ERROR: Use bun run publish:next, then bun run promote:latest');process.exit(1)}\"",
"typecheck": "tsc --noEmit",
"typecheck": "(cd ../../packages/sdk && bun run build) && tsc --noEmit",
"lint": "biome check .",
"format": "biome format --write .",
"fix": "biome check --write .",
"test": "bun test",
"test": "(cd ../../packages/sdk && bun run build) && bun test",
"test:watch": "bun test --watch"
},
"dependencies": {
Expand Down Expand Up @@ -53,6 +53,7 @@
},
"devDependencies": {
"@agentv/core": "workspace:*",
"@agentv/sdk": "workspace:*",
"@types/semver": "^7.7.1",
"execa": "^9.3.0"
}
Expand Down
64 changes: 64 additions & 0 deletions apps/cli/src/commands/eval/commands/vitest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import { command, flag, number, option, optional, restPositionals, string } from 'cmd-ts';

import { runCodeGrader, runVitestWorkspaceGrader } from '@agentv/sdk';

/**
 * Split a user-supplied command line into argv-style tokens.
 *
 * Tokens are separated by runs of whitespace. A single- or double-quoted span
 * is kept together with its quotes removed, so an argument that contains
 * spaces (for example a script path under a directory with a space in its
 * name) survives as one token. Backslashes are passed through untouched so
 * Windows-style paths are not altered. An unterminated quote extends to the
 * end of the input rather than raising.
 *
 * @param value Raw command string, e.g. `bunx vitest run`.
 * @returns The tokens in order, or `undefined` when the input is missing or blank.
 */
function parseCommand(value: string | undefined): readonly string[] | undefined {
  const trimmed = value?.trim();
  if (!trimmed) {
    return undefined;
  }

  const whitespace = /\s/;
  const tokens: string[] = [];
  let current = '';
  // Tracks whether a token has started, so an empty quoted argument ("")
  // still yields an empty-string token instead of being dropped.
  let inToken = false;
  let quote: string | undefined;

  for (const ch of trimmed) {
    if (quote !== undefined) {
      // Inside quotes everything is literal until the matching quote closes.
      if (ch === quote) {
        quote = undefined;
      } else {
        current += ch;
      }
    } else if (ch === '"' || ch === "'") {
      quote = ch;
      inToken = true;
    } else if (whitespace.test(ch)) {
      if (inToken) {
        tokens.push(current);
        current = '';
        inToken = false;
      }
    } else {
      current += ch;
      inToken = true;
    }
  }

  if (inToken) {
    tokens.push(current);
  }

  return tokens;
}

/**
 * `vitest` subcommand definition.
 *
 * Collects the verifier file paths and Vitest options from the command line
 * and hands them to `runVitestWorkspaceGrader`, driven by `runCodeGrader`.
 */
export const evalVitestCommand = command({
  name: 'vitest',
  description: 'Run Vitest workspace verifier files as an AgentV code-grader protocol adapter',
  args: {
    testFiles: restPositionals({
      type: string,
      displayName: 'test-files',
      description: 'Vitest verifier file(s) to run',
    }),
    cwd: option({
      type: optional(string),
      long: 'cwd',
      description: 'Workspace-relative directory where Vitest should run',
    }),
    vitestCommand: option({
      type: optional(string),
      long: 'vitest-command',
      description: 'Vitest command to execute, defaults to "bunx vitest run"',
    }),
    timeoutMs: option({
      type: optional(number),
      long: 'timeout-ms',
      description: 'Timeout for the Vitest command in milliseconds',
    }),
    inWorkspace: flag({
      long: 'in-workspace',
      description:
        'Treat test files as already present in the prepared workspace instead of copying them from the current directory',
    }),
    passWithNoTests: flag({
      long: 'pass-with-no-tests',
      description: 'Return score 1 when Vitest reports zero tests',
    }),
  },
  handler: async (args) => {
    await runCodeGrader((input) => {
      // The check lives inside the grader callback, so it is raised while
      // runCodeGrader is in control (presumably so the failure is reported
      // through the grader protocol; confirm against the SDK).
      const hasTestFiles = args.testFiles.length !== 0;
      if (!hasTestFiles) {
        throw new Error('Provide at least one Vitest verifier file.');
      }

      // Optional command override, tokenized; undefined leaves the choice to the SDK.
      const commandTokens = parseCommand(args.vitestCommand);
      // Files are copied from the current directory unless --in-workspace is given.
      const copyFromCurrentDirectory = !args.inWorkspace;

      return runVitestWorkspaceGrader(
        {
          testFile: args.testFiles,
          cwd: args.cwd,
          vitestCommand: commandTokens,
          timeoutMs: args.timeoutMs,
          passWithNoTests: args.passWithNoTests,
          copyTestFilesToWorkspace: copyFromCurrentDirectory,
          testFileRoot: process.cwd(),
        },
        input,
      );
    });
  },
});
2 changes: 2 additions & 0 deletions apps/cli/src/commands/eval/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { evalAggregateCommand } from './commands/aggregate.js';
import { evalAssertCommand } from './commands/assert.js';
import { evalBundleCommand } from './commands/bundle.js';
import { evalRunCommand } from './commands/run.js';
import { evalVitestCommand } from './commands/vitest.js';

export const evalCommand = subcommands({
name: 'eval',
Expand All @@ -14,5 +15,6 @@ export const evalCommand = subcommands({
assert: evalAssertCommand,
aggregate: evalAggregateCommand,
bundle: evalBundleCommand,
vitest: evalVitestCommand,
},
});
24 changes: 15 additions & 9 deletions apps/cli/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ export const app = subcommands({
* Known eval subcommand names — used to decide whether to inject the
* implicit `run` subcommand for backward-compatible `agentv eval <paths>`.
*/
const EVAL_SUBCOMMANDS = new Set(['run', 'assert', 'aggregate', 'bundle']);
const EVAL_SUBCOMMANDS = new Set(['run', 'assert', 'aggregate', 'bundle', 'vitest']);

/**
* Top-level CLI command names (excluding `eval` itself).
Expand Down Expand Up @@ -96,6 +96,10 @@ export function usesDeprecatedStudioAlias(argv: string[]): boolean {
return argv[2] === 'studio';
}

/**
 * Decide whether the `before_session` hook should run for this invocation.
 *
 * @param argv Full argument vector; positions 2 and 3 are inspected as the
 *   command and subcommand names.
 * @returns `false` only when the invocation is `eval vitest`; `true` for
 *   every other argv, including ones too short to name a subcommand.
 */
export function shouldRunBeforeSessionHook(argv: string[]): boolean {
  const [, , commandName, subcommandName] = argv;
  const isVitestAdapter = commandName === 'eval' && subcommandName === 'vitest';
  return !isVitestAdapter;
}

/**
* Preprocess argv for convenience aliases:
* - `--eval-id` → `--test-id`
Expand Down Expand Up @@ -162,14 +166,16 @@ export async function runCli(argv: string[] = process.argv): Promise<void> {
);
}

// Run before_session hook once at startup, before any command executes.
// Uses cwd as the search root for .agentv/config.yaml.
const cwd = process.cwd();
const repoRoot = await findRepoRoot(cwd);
const sessionConfig = await loadConfig(path.join(cwd, '_'), repoRoot);
const beforeSessionCommand = sessionConfig?.hooks?.before_session;
if (beforeSessionCommand) {
runBeforeSessionHook(beforeSessionCommand);
if (shouldRunBeforeSessionHook(processedArgv)) {
// Run before_session hook once at startup, before any command executes.
// Uses cwd as the search root for .agentv/config.yaml.
const cwd = process.cwd();
const repoRoot = await findRepoRoot(cwd);
const sessionConfig = await loadConfig(path.join(cwd, '_'), repoRoot);
const beforeSessionCommand = sessionConfig?.hooks?.before_session;
if (beforeSessionCommand) {
runBeforeSessionHook(beforeSessionCommand);
}
}

await run(binary(app), processedArgv);
Expand Down
130 changes: 130 additions & 0 deletions apps/cli/test/commands/eval/vitest.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
import { mkdir, mkdtemp, readFile, readdir, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { execa } from 'execa';
import { assertCoreBuild } from '../../setup-core-build.js';

assertCoreBuild();

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const projectRoot = path.resolve(__dirname, '../../../../..');
const CLI_ENTRY = path.join(projectRoot, 'apps/cli/src/cli.ts');

// Canned JSON report that the fake Vitest script writes to its --outputFile.
// It describes one test file with two assertions: one passed, one failed with
// a failure message. The test below expects this to grade as score 0.5.
const report = {
success: false,
numTotalTests: 2,
numPassedTests: 1,
numFailedTests: 1,
numPendingTests: 0,
numTodoTests: 0,
testResults: [
{
name: '/workspace/.agentv-vitest/example.test.ts',
assertionResults: [
{
fullName: 'welcome banner includes ready status',
status: 'passed',
failureMessages: [],
},
{
fullName: 'welcome banner links to dashboard',
status: 'failed',
failureMessages: ['AssertionError: expected link to point at /dashboard'],
},
],
},
],
};

/**
 * Spawn the CLI source entry point with Bun and feed it `input` on stdin.
 *
 * @param args CLI arguments appended after the entry script.
 * @param cwd Working directory for the child process; also the parent of the
 *   `.agentv-home` directory passed as AGENTV_HOME.
 * @param input Text written to the child's stdin.
 * @returns The execa result for the finished process.
 */
async function runCli(args: readonly string[], cwd: string, input: string) {
  const bunArgs = ['--no-env-file', CLI_ENTRY, ...args];
  const agentvHome = path.join(cwd, '.agentv-home');

  return execa('bun', bunArgs, {
    cwd,
    input,
    env: {
      AGENTV_HOME: agentvHome,
      AGENTV_NO_UPDATE_CHECK: '1',
    },
  });
}

// End-to-end test for `agentv eval vitest`: spawns the CLI with a fake Vitest
// executable and checks the graded JSON printed on stdout.
describe('agentv eval vitest', () => {
let tempDir: string;

// Each test gets a fresh temporary directory, removed again afterwards.
beforeEach(async () => {
tempDir = await mkdtemp(path.join(tmpdir(), 'agentv-eval-vitest-test-'));
});

afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});

it('runs external verifier files through the code-grader protocol', async () => {
// Layout: <tempDir>/workspace is the graded workspace, <tempDir>/graders holds
// the verifier file, and <tempDir>/fake-vitest.ts stands in for Vitest.
const workspacePath = path.join(tempDir, 'workspace');
const gradersPath = path.join(tempDir, 'graders');
const fakeVitest = path.join(tempDir, 'fake-vitest.ts');
await mkdir(workspacePath, { recursive: true });
await mkdir(gradersPath, { recursive: true });
await writeFile(
path.join(gradersPath, 'welcome-banner.test.ts'),
'import { expect, it } from "vitest";\n',
'utf8',
);
// The fake Vitest records the arguments it received in vitest-args.json
// (relative to its working directory), writes the canned report to the path
// given by --outputFile=, and exits with code 1.
await writeFile(
fakeVitest,
`import { writeFileSync } from 'node:fs';

const args = process.argv.slice(2);
writeFileSync('vitest-args.json', JSON.stringify(args));
const outputArg = args.find((arg) => arg.startsWith('--outputFile='));
if (!outputArg) throw new Error('missing outputFile arg');
writeFileSync(outputArg.slice('--outputFile='.length), JSON.stringify(${JSON.stringify(report)}));
process.exit(1);
`,
'utf8',
);

// Grader input sent to the CLI on stdin; workspace_path points at the workspace.
const payload = JSON.stringify({
criteria: 'Verify the workspace',
expected_output: [],
input_files: [],
input: [{ role: 'user', content: 'Update the welcome banner' }],
workspace_path: workspacePath,
});

// The CLI runs from tempDir, so the verifier path is relative to it.
const result = await runCli(
['eval', 'vitest', '--vitest-command', `bun ${fakeVitest}`, 'graders/welcome-banner.test.ts'],
tempDir,
payload,
);

// One of two assertions passed, so the expected score is 0.5; the failed
// assertion carries its failure message as evidence.
const output = JSON.parse(result.stdout);
expect(output.score).toBe(0.5);
expect(output.assertions).toEqual([
{ text: 'welcome banner includes ready status', passed: true },
{
text: 'welcome banner links to dashboard',
passed: false,
evidence: 'AssertionError: expected link to point at /dashboard',
},
]);
expect(output.details).toMatchObject({
vitest_success: false,
num_total_tests: 2,
num_passed_tests: 1,
num_failed_tests: 1,
});

// vitest-args.json is read from the workspace, which shows the fake Vitest
// ran with the workspace as its working directory. The first argument is the
// verifier file under a `.agentv-vitest-*` directory (presumably a staged
// copy made by the SDK; confirm against runVitestWorkspaceGrader).
const vitestArgs = JSON.parse(
await readFile(path.join(workspacePath, 'vitest-args.json'), 'utf8'),
) as string[];
expect(vitestArgs[0]).toMatch(/^\.agentv-vitest-.+\/0-welcome-banner\.test\.ts$/);
expect(vitestArgs).toContain('--reporter=json');
expect(vitestArgs.some((arg) => arg.startsWith('--outputFile='))).toBe(true);

// No `.agentv-vitest-*` entry may remain in the workspace after the run.
const workspaceEntries = await readdir(workspacePath);
expect(workspaceEntries.some((entry) => entry.startsWith('.agentv-vitest-'))).toBe(false);
});
});
25 changes: 16 additions & 9 deletions apps/cli/test/setup-core-build.ts
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
/**
* Pre-flight check for CLI integration tests.
*
* CLI integration tests depend on @agentv/core being built (they import
* from the dist output). Rather than building core inside the test — which
* is slow and hides staleness issues — we simply verify dist exists and
* CLI integration tests depend on @agentv/core and @agentv/sdk being built
* (they import from the dist output). Rather than building packages inside
* the test — which is slow and hides staleness issues — we simply verify dist exists and
* fail fast with a clear message if it doesn't.
*
* CI runs `bun run build` before `bun run test`, so dist is available in
* the normal merge gate. For ad-hoc local runs, build first:
*
* bun run --filter @agentv/core build && bun --filter agentv test
* bun --filter @agentv/core build && bun --filter @agentv/sdk build && bun --filter agentv test
*/

import { constants, accessSync } from 'node:fs';
Expand All @@ -19,12 +19,19 @@ import { fileURLToPath } from 'node:url';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const projectRoot = path.resolve(__dirname, '../../..');
const coreDistEntry = path.join(projectRoot, 'packages/core/dist/index.js');
const distEntries = [
['@agentv/core', path.join(projectRoot, 'packages/core/dist/index.js')],
['@agentv/sdk', path.join(projectRoot, 'packages/sdk/dist/index.js')],
] as const;

/**
 * Fail fast when a package listed in `distEntries` has no readable dist entry.
 *
 * Each entry is probed for read access in order; the first one that cannot be
 * read aborts the check.
 *
 * @throws Error naming the unbuilt package and the build commands to run.
 */
export function assertCoreBuild(): void {
  for (const [packageName, distEntry] of distEntries) {
    try {
      accessSync(distEntry, constants.R_OK);
    } catch {
      // Any access failure is reported as "not built" with the fix-up command.
      throw new Error(
        `${packageName} is not built. Run \`bun --filter @agentv/core build && bun --filter @agentv/sdk build\` first.`,
      );
    }
  }
}
31 changes: 30 additions & 1 deletion apps/cli/test/unit/preprocess-argv.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import { describe, expect, it } from 'bun:test';

import { preprocessArgv, usesDeprecatedStudioAlias } from '../../src/index.js';
import {
preprocessArgv,
shouldRunBeforeSessionHook,
usesDeprecatedStudioAlias,
} from '../../src/index.js';

describe('preprocessArgv', () => {
describe('--eval-id convenience alias', () => {
Expand Down Expand Up @@ -31,6 +35,11 @@ describe('preprocessArgv', () => {
expect(preprocessArgv(argv)).toEqual(argv);
});

// `vitest` is an explicit eval subcommand, so preprocessArgv must return the
// argv unchanged rather than injecting the implicit `run`.
it('does not insert `run` for eval vitest', () => {
const argv = ['node', 'agentv', 'eval', 'vitest', 'graders/welcome-banner.test.ts'];
expect(preprocessArgv(argv)).toEqual(argv);
});

it('does not insert `run` when eval is followed by --help', () => {
const argv = ['node', 'agentv', 'eval', '--help'];
expect(preprocessArgv(argv)).toEqual(argv);
Expand Down Expand Up @@ -85,4 +94,24 @@ describe('preprocessArgv', () => {
expect(preprocessArgv(argv)).toEqual(argv);
});
});

// Covers shouldRunBeforeSessionHook: only `eval vitest` opts out of the hook.
describe('before_session hook guard', () => {
  const vitestAdapterArgv = [
    'node',
    'agentv',
    'eval',
    'vitest',
    'graders/welcome-banner.test.ts',
  ];
  const evalRunArgv = ['node', 'agentv', 'eval', 'run', 'evals/demo.yaml'];

  it('skips before_session hooks for the Vitest protocol adapter', () => {
    const shouldRun = shouldRunBeforeSessionHook(vitestAdapterArgv);
    expect(shouldRun).toBe(false);
  });

  it('keeps before_session hooks for normal eval runs', () => {
    const shouldRun = shouldRunBeforeSessionHook(evalRunArgv);
    expect(shouldRun).toBe(true);
  });
});
});
Loading
Loading