Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
129cac2
feat(cli): bundle agentv-dev skills and add `agentv skills` subcommand
christso May 7, 2026
da04aad
fix(cli): resolve biome lint errors in skills command and tests
christso May 7, 2026
0a8b07f
feat(cli): skills get --ref, agents/ in --full, drop scripts/assets
christso May 7, 2026
19f8601
refactor(cli): move skills-data/ to repo root, align with agent-brows…
christso May 7, 2026
63b97fb
chore: drop agentv-onboarding from bundled skills (redundant with npm…
christso May 7, 2026
23bbe3f
feat(cli): add subagent mode guidance to pipeline input/run output
christso May 10, 2026
072558a
feat(cli): agent grades its own outputs in subagent-as-target mode
christso May 10, 2026
7b0ee6a
fix(cli): restore grader subagent spawning in pipeline guidance
christso May 10, 2026
5ee5ae6
fix(cli): clarify grader subagents need grader.md embedded as system …
christso May 10, 2026
5d32242
fix(cli): include rubrics criteria in llm_graders output
christso May 10, 2026
84d3ff7
feat(cli): make pipeline discoverable for agent targets
christso May 10, 2026
ce9812e
fix: clarify pipeline description so agents discover it from top-leve…
christso May 10, 2026
1431cc5
docs: add pipeline guidance to CLAUDE.md for agent discoverability
christso May 10, 2026
ceab053
Revert "docs: add pipeline guidance to CLAUDE.md for agent discoverab…
christso May 10, 2026
3b1b8d9
fix: add disk-read guidance after executor completion to prevent read…
christso May 11, 2026
45e072f
chore: remove duplicate content from plugins/agentv-dev/skills (keep …
christso May 11, 2026
8d841ca
chore: consolidate plugins skills into single agentv-dev wrapper
christso May 11, 2026
f4df645
fix: remove dead agentv-onboarding reference from wrapper
christso May 11, 2026
f9908a0
feat: dynamic skill directory discovery + bundle scripts
christso May 11, 2026
da93048
fix: regenerate eval-schema.json from Zod definition
christso May 11, 2026
abd08cb
fix: make agentv-dev plugin message client-agnostic
christso May 11, 2026
db4a5c2
refactor: move eval-schema.json from plugins/ to skills-data/
christso May 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,16 @@ async function runSingleEvalFile(params: {
console.log(`${targetMessage}`);
}

// Hint about pipeline for CLI agent targets
const targetKind = resolvedTargetSelection.resolvedTarget.kind;
if ((targetKind === 'claude-cli' || targetKind === 'copilot-cli') && !options.dryRun) {
console.log('');
console.log(' TIP: For subagent-mode evals, use `agentv pipeline` instead of `eval run`.');
console.log(' The agent orchestrates executor + grader subagents directly.');
console.log(' Run: agentv pipeline --help');
console.log('');
}

const agentTimeoutMs =
options.agentTimeoutSeconds != null
? Math.max(0, options.agentTimeoutSeconds) * 1000
Expand Down
4 changes: 3 additions & 1 deletion apps/cli/src/commands/init/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ export interface InitCommandOptions {

/**
 * Prints the "AI-skills-first" onboarding hints to stdout.
 *
 * Two alternatives are shown: fetching the bundled `agentv-bench` skill via
 * the CLI, or installing the `agentv-dev` plugin through `allagents`. Each
 * alternative ends with the prompt the user should give their agent.
 */
function printSkillFirstInstructions(): void {
  // One entry per console.log call; a leading "\n" yields a blank separator line.
  const hintLines: readonly string[] = [
    '\nAI-skills-first setup (recommended):',
    ' agentv skills get agentv-bench',
    ' Then ask your agent: "Set up AgentV in this repo."',
    '\nOr install the agentv-dev plugin for automatic skill discovery:',
    ' npx allagents plugin marketplace add EntityProcess/agentv',
    ' npx allagents plugin install agentv-dev@agentv',
    ' Then ask your agent: "Set up AgentV in this repo."',
  ];

  for (const hint of hintLines) {
    console.log(hint);
  }
}

async function promptYesNo(message: string): Promise<boolean> {
Expand Down
3 changes: 2 additions & 1 deletion apps/cli/src/commands/pipeline/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ import { evalRunCommand } from './run.js';

export const pipelineCommand = subcommands({
name: 'pipeline',
description: 'Agent-mode eval pipeline commands (input → grade → bench)',
description:
'Subagent-mode eval pipeline (input → executor subagents → grade → bench) — use this when the eval target is an AI agent (Claude, Codex, etc.)',
cmds: {
input: evalInputCommand,
grade: evalGradeCommand,
Expand Down
49 changes: 48 additions & 1 deletion apps/cli/src/commands/pipeline/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -210,14 +210,47 @@ export const evalInputCommand = command({
});

console.log(`Extracted ${testIds.length} test(s) to ${outDir}`);

// --- Subagent mode guidance ---
if (targetKind === 'agent') {
console.log(`
Target: ${targetName} (subagent-as-target mode)`);
console.log(` Tests: ${testIds.join(', ')}`);
console.log('');
console.log(' Next steps for the orchestrating agent:');
console.log(' 1. Dispatch executor subagents — one per test case (all in parallel):');
console.log(' - Each reads <run-dir>/<test-id>/input.json');
console.log(' - Executes the task, writes <run-dir>/<test-id>/response.md');
console.log(' 2. Run code graders: agentv pipeline grade <run-dir>');
console.log(
' 3. Dispatch grader subagents — one per (test × LLM grader) pair (all in parallel):',
);
console.log(
' - Read agents/grader.md and embed its content as system instructions in each subagent prompt',
);
console.log(' - Each subagent reads llm_graders/<name>.json + response.md for its test');
console.log(' - Each writes llm_grader_results/<name>.json');
console.log(' 4. Merge scores: agentv pipeline bench <run-dir>');
console.log('');
console.log(' For the full procedure:');
console.log(' agentv skills get agentv-bench --ref subagent-pipeline');
console.log('');
}
},
});

/**
 * Per-category tally returned by `writeGraderConfigs`.
 *
 * `writeGraderConfigs` creates one of these with every field set to 0 and
 * returns it to the caller.
 * NOTE(review): the code that increments the fields is not visible in this
 * diff hunk — confirm each counter is actually updated before relying on it.
 */
interface GraderCounts {
// Presumably the number of code-grader configs written under code_graders/ — TODO confirm.
codeGraders: number;
// Presumably the number of LLM-grader configs written under llm_graders/ — TODO confirm.
llmGraders: number;
// Presumably the number of built-in (non code/LLM) assertions encountered — TODO confirm.
builtinAssertions: number;
}

async function writeGraderConfigs(
testDir: string,
assertions: readonly GraderConfig[],
evalDir: string,
): Promise<void> {
): Promise<GraderCounts> {
const counts: GraderCounts = { codeGraders: 0, llmGraders: 0, builtinAssertions: 0 };
const codeGradersDir = join(testDir, 'code_graders');
const llmGradersDir = join(testDir, 'llm_graders');

Expand Down Expand Up @@ -257,9 +290,22 @@ async function writeGraderConfigs(
promptContent = config.prompt;
}

// For rubrics assertions, include the criteria array directly
// so grader subagents can evaluate without needing a prompt file.
const rubrics = (config as LlmGraderConfig).rubrics;
const rubricsData = rubrics?.map((r) => ({
id: r.id,
outcome: r.outcome,
weight: r.weight ?? 1.0,
...(r.score_ranges ? { score_range: r.score_ranges } : {}),
...(r.required !== undefined ? { required: r.required } : {}),
...(r.required_min_score !== undefined ? { required_min_score: r.required_min_score } : {}),
}));

await writeJson(join(llmGradersDir, `${config.name}.json`), {
name: config.name,
prompt_content: promptContent,
...(rubricsData && rubricsData.length > 0 ? { rubrics: rubricsData } : {}),
weight: config.weight ?? 1.0,
threshold: 0.5,
config: {},
Expand All @@ -280,6 +326,7 @@ async function writeGraderConfigs(
});
}
}
return counts;
}

async function writeJson(filePath: string, data: unknown): Promise<void> {
Expand Down
65 changes: 59 additions & 6 deletions apps/cli/src/commands/pipeline/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ function loadEnvFile(dir: string): Record<string, string> {

export const evalRunCommand = command({
name: 'run',
description: 'Extract inputs, invoke CLI targets, and run code graders in one step',
description:
'Extract inputs, invoke CLI targets, and run code graders (for agent targets, use pipeline input + subagents)',
args: {
evalPath: positional({
type: string,
Expand Down Expand Up @@ -341,15 +342,47 @@ export const evalRunCommand = command({
await Promise.all(pending);
process.stderr.write('\n');
} else {
console.log('Subagent-as-target mode — skipping CLI invocation.');
console.log('Subagent-as-target mode — the agent IS the target.');
console.log('');
console.log(' What happened: pipeline extracted inputs but did NOT invoke a CLI target.');
console.log(
' The orchestrating agent must dispatch executor subagents to process each test.',
);
console.log('');
console.log(' Next steps:');
console.log(' 1. Dispatch executor subagents — one per test case (all in parallel):');
console.log(' - Each reads <run-dir>/<test-id>/input.json');
console.log(' - Executes the task, writes <run-dir>/<test-id>/response.md');
console.log(' 2. Run code graders: agentv pipeline grade <run-dir>');
console.log(
' 3. Dispatch grader subagents — one per (test x LLM grader) pair (all in parallel):',
);
console.log(
' - Read agents/grader.md and embed its content as system instructions in each subagent prompt',
);
console.log(' - Each subagent reads llm_graders/<name>.json + response.md');
console.log(' - Each writes llm_grader_results/<name>.json');
console.log(' 4. Merge scores: agentv pipeline bench <run-dir>');
console.log('');
console.log(' For the full procedure:');
console.log(' agentv skills get agentv-bench --ref subagent-pipeline');
console.log('');
}

// ── Step 3: Run code graders (only when explicitly requested) ─────
if (graderType !== 'code') {
console.log(`\nDone. Results in ${outDir}`);
console.log(
'To run code graders: agentv pipeline grade <run-dir> (or re-run with --grader-type code)',
);
console.log('');
if (targetKind === 'agent') {
console.log(' The agent must now:');
console.log(' 1. Dispatch executor subagents to generate response.md files');
console.log(' 2. Run code graders: agentv pipeline grade <run-dir>');
console.log(' 3. Dispatch grader subagents for llm_graders/ configs');
console.log(' 4. Merge scores: agentv pipeline bench <run-dir>');
} else {
console.log(' To run code graders: agentv pipeline grade <run-dir>');
console.log(' Or re-run with --grader-type code to grade inline.');
}
return;
}

Expand Down Expand Up @@ -382,7 +415,15 @@ export const evalRunCommand = command({
const graderConcurrency = workers ?? 10;
const { totalGraders, totalPassed } = await runCodeGraders(graderTasks, graderConcurrency);
console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
console.log(`\nDone. Agent can now perform LLM grading on responses in ${outDir}`);
console.log('');
console.log(`Results in ${outDir}`);
console.log('');
console.log(' Remaining steps:');
console.log(' 1. If llm_graders/ configs exist, dispatch grader subagents');
console.log(
' - Read agents/grader.md, embed as system instructions in each subagent prompt',
);
console.log(' 2. Merge all scores: agentv pipeline bench <run-dir>');
},
});

Expand Down Expand Up @@ -433,9 +474,21 @@ async function writeGraderConfigs(
} else if (typeof config.prompt === 'string') {
promptContent = config.prompt;
}
// For rubrics assertions, include the criteria array directly
const rubrics = (config as LlmGraderConfig).rubrics;
const rubricsData = rubrics?.map((r) => ({
id: r.id,
outcome: r.outcome,
weight: r.weight ?? 1.0,
...(r.score_ranges ? { score_range: r.score_ranges } : {}),
...(r.required !== undefined ? { required: r.required } : {}),
...(r.required_min_score !== undefined ? { required_min_score: r.required_min_score } : {}),
}));

await writeJson(join(llmGradersDir, `${config.name}.json`), {
name: config.name,
prompt_content: promptContent,
...(rubricsData && rubricsData.length > 0 ? { rubrics: rubricsData } : {}),
weight: config.weight ?? 1.0,
threshold: 0.5,
config: {},
Expand Down
Loading
Loading