diff --git a/apps/web/src/content/docs/docs/graders/composite.mdx b/apps/web/src/content/docs/docs/graders/composite.mdx index 62e8bb4e3..a755c7367 100644 --- a/apps/web/src/content/docs/docs/graders/composite.mdx +++ b/apps/web/src/content/docs/docs/graders/composite.mdx @@ -100,7 +100,10 @@ Because this is an average, the final score is the fraction of passing children ### OR Logic (Strict) -For a strict OR, add a custom code-grader aggregator and return `1.0` when any child score passes: +For a strict OR, add a custom code-grader aggregator and return `1.0` when any child score passes. + +Composite aggregator execution accepts either a direct script path or a shell command. +The `bun run` form is the recommended pattern: ```yaml assertions: @@ -108,7 +111,7 @@ assertions: type: composite aggregator: type: code-grader - path: ./scripts/or-aggregator.js + path: bun run ../scripts/or-aggregator.js assertions: - name: mentions-paris type: contains @@ -132,8 +135,7 @@ console.log( verdict: anyPassed ? 'pass' : 'fail', assertions: [{ text: `Any-or gate: ${anyPassed ? 
'passed' : 'failed'}`, passed: anyPassed }], }), -); -``` +); +``` ### Code Grader Aggregator @@ -143,7 +145,7 @@ Run a custom command to decide the final score based on all grader results: ```yaml aggregator: type: code-grader - path: node ./scripts/safety-gate.js + path: bun run ./scripts/safety-gate.js cwd: ./graders # optional working directory ``` diff --git a/examples/features/composite/README.md b/examples/features/composite/README.md index 4ac966a80..9148fca3f 100644 --- a/examples/features/composite/README.md +++ b/examples/features/composite/README.md @@ -13,10 +13,13 @@ Demonstrates composite grader patterns for combining multiple evaluation criteri ```bash # From repository root -bun agentv eval examples/features/composite/evals/dataset.eval.yaml +bun agentv eval run examples/features/composite/evals/dataset.eval.yaml +# Run only the strict-or script path example in dry-run (no live LLM targets required) +bun agentv eval run examples/features/composite/evals/dataset.eval.yaml --test-id strict-or-local --dry-run ``` ## Key Files - `evals/dataset.eval.yaml` - Test cases with composite grader patterns +- `scripts/or-aggregator.js` - Strict OR aggregator script used by `composite` examples - `apps/web/src/content/docs/docs/graders/composite.mdx` - Detailed AND/OR and strict-OR composition guidance diff --git a/examples/features/composite/evals/dataset.eval.yaml b/examples/features/composite/evals/dataset.eval.yaml index a8bfef655..affbeb2dc 100644 --- a/examples/features/composite/evals/dataset.eval.yaml +++ b/examples/features/composite/evals/dataset.eval.yaml @@ -56,9 +56,34 @@ tests: prompt: ../prompts/technical-accuracy.md aggregator: type: code-grader - path: node ../scripts/safety-gate-aggregator.js + path: bun run ../scripts/safety-gate-aggregator.js - # Example 3: LLM Grader Aggregator + # Example 3: Strict OR with a local code-grader aggregator + - id: strict-or-local + input: + - role: user + content: "Where is Paris?"
/**
 * Strict-OR aggregator for a composite grader.
 *
 * Reads a JSON payload from stdin, inspects the child grader results found
 * under its `results` property, and prints one JSON object to stdout:
 * score 1 / verdict "pass" when at least one child passed, otherwise
 * score 0 / verdict "fail". Any failure to read or parse the payload is
 * reported as a failing result rather than a crash.
 */

/** Lowest numeric score counted as a pass when a child has no decisive verdict. */
const PASS_THRESHOLD = 0.5;

/**
 * Collapse a single child result to 1 (passed) or 0 (did not pass).
 *
 * @param {unknown} result - One entry from the payload's `results` collection.
 * @returns {0 | 1} 1 when the child passed, 0 otherwise.
 */
function getScore(result) {
  if (result === null || typeof result !== 'object') {
    return 0;
  }

  if (result.verdict === 'pass') {
    return 1;
  }

  // An explicit non-passing verdict is authoritative: a child that reports
  // "fail" (or "skip") must not be promoted to a pass by its numeric score.
  if (result.verdict === 'fail' || result.verdict === 'skip') {
    return 0;
  }

  // No decisive verdict: fall back to the numeric score, if there is one.
  if (typeof result.score === 'number') {
    return result.score >= PASS_THRESHOLD ? 1 : 0;
  }

  return 0;
}

/**
 * Build the aggregate strict-OR result for a parsed payload.
 *
 * @param {{ results?: object | null }} input - Parsed stdin payload; a missing
 *   or nullish `results` is treated as "no children".
 * @returns {{ score: number, verdict: string, assertions: Array<{ text: string, passed: boolean }> }}
 * @throws {TypeError} When the payload is not a JSON object.
 */
function aggregateStrictOr(input) {
  if (input === null || typeof input !== 'object') {
    throw new TypeError('Expected a JSON object on stdin');
  }

  // Compare against 1 explicitly instead of leaning on truthiness, and avoid
  // handing Array#some's extra (index, array) arguments to getScore.
  const anyPassed = Object.values(input.results ?? {}).some(
    (result) => getScore(result) === 1,
  );

  return {
    score: anyPassed ? 1 : 0,
    verdict: anyPassed ? 'pass' : 'fail',
    assertions: [
      {
        text: `Strict OR passed if any child passed: ${anyPassed}`,
        passed: anyPassed,
      },
    ],
  };
}

/**
 * Script entry point: stdin JSON in, aggregate JSON out.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let output;

  try {
    // Dynamic import keeps the script loadable as either CommonJS or an ES
    // module, so it does not depend on the surrounding package's module type.
    const { readFileSync } = await import('node:fs');
    // File descriptor 0 is stdin.
    output = aggregateStrictOr(JSON.parse(readFileSync(0, 'utf8')));
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    output = {
      score: 0,
      verdict: 'fail',
      assertions: [
        {
          text: `Failed to evaluate strict OR: ${reason}`,
          passed: false,
        },
      ],
    };
  }

  console.log(JSON.stringify(output));
}

// main() converts every read/parse/evaluation error into a failing result itself.
main();