diff --git a/.agents/verification.md b/.agents/verification.md index 0f02e1237..9a4a18d3d 100644 --- a/.agents/verification.md +++ b/.agents/verification.md @@ -127,7 +127,7 @@ Use live dogfood before marking PRs ready when they affect eval execution, exper - Live means both sides are real: a live agent/provider target and a live grader target. Do not count `mock`, `--dry-run`, or deterministic-only assertions as dogfood for these changes. - Prefer the smallest realistic eval: one or two cases, bounded timeouts, and `workers: 1` for heavyweight agent providers. - For native experiment changes, run through `agentv eval run ... --experiment ` so resolution, setup, scripts, target selection, run knobs, and artifact metadata are exercised together. -- For repeat-run changes, use an experiment-level repeat config with `count >= 2`, `early_exit: false` when validating all attempts are persisted. Inspect root `index.jsonl`, root `benchmark.json`, and the repeated case folder. The repeated case folder should carry aggregate `summary.json` with flattened snake_case timing fields plus AgentV aggregate `grading.json`; attempt-specific outputs and transcripts live under `run-N/`. Each `run-N/` folder should contain `result.json`, `grading.json`, `transcript.json`, `transcript-raw.jsonl`, and `outputs/answer.md`. Do not write per-run `metrics.json`; timing and o11y fields belong in `result.json`, and `result.json` points at `./grading.json` through `grading_path`. +- For repeat-run changes, use an experiment-level repeat config with `count >= 2`, `early_exit: false` when validating all attempts are persisted. Inspect root `index.jsonl`, root `benchmark.json`, and the repeated case folder. The repeated case folder should carry aggregate `summary.json` with flattened snake_case timing fields plus AgentV aggregate `grading.json`; attempt-specific outputs, transcripts, and metrics live under `run-N/`. Each `run-N/` folder should contain `result.json`, `grading.json`, `metrics.json`, `transcript.jsonl`, `transcript-raw.jsonl`, and `outputs/answer.md` when answer output is available. `result.json` should point at `./grading.json`, `./metrics.json`, `./transcript.jsonl`, and `./transcript-raw.jsonl` through the corresponding path fields. - For local OpenAI-compatible grading through the OAuth proxy, use `endpoint: http://127.0.0.1:10531/v1`, but still route `api_key` and `model` through environment references such as `${{ LOCAL_OPENAI_PROXY_API_KEY }}` and `${{ LOCAL_OPENAI_PROXY_MODEL }}`. Literal secrets and literal model values are intentionally rejected by target validation unless a resolver explicitly allows them. - Preserve review evidence in `agentv-private` on an `evidence/` branch. Include the run bundle, source eval/experiment/targets files, a short README, an artifact tree, and screenshots when folder structure or UI behavior is under review. - If comparing against an external convention such as Vercel `agent-eval`, verify both semantic provenance and the physical `run-N` artifact layout for repeat runs. diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index e1591675b..26dc9471a 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -99,6 +99,7 @@ export function buildIndexArtifactEntry( answerPath?: string; tracePath?: string; transcriptPath?: string; + transcriptRawPath?: string; metricsPath?: string; rawProviderLogPath?: string; responsePath?: string; diff --git a/apps/cli/src/commands/results/combine-run.ts b/apps/cli/src/commands/results/combine-run.ts index 113fa2461..2e95aec07 100644 --- a/apps/cli/src/commands/results/combine-run.ts +++ b/apps/cli/src/commands/results/combine-run.ts @@ -369,6 +369,7 @@ const MANIFEST_PATH_FIELDS = [ 'response_path', 'trace_path', 'transcript_path', + 'transcript_raw_path', 'metrics_path', 'raw_provider_log_path', 'task_dir', diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 0372b79e8..4944648a7 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -6,10 +6,11 @@ import { type ExternalTraceMetadataWire, type ResultArtifactPointersWire, type TraceSummary, - type TranscriptJsonLine, buildTraceFromMessages, + fromTraceEnvelopeWire, toCamelCaseDeep, - traceFromTranscriptJsonLines, + traceEnvelopeToTraceSummary, + traceEnvelopeToTranscriptMessages, } from '@agentv/core'; import type { GradingArtifact, TimingArtifact } from '../eval/artifact-writer.js'; @@ -55,6 +56,7 @@ export interface ResultManifestRecord { readonly answer_path?: string; readonly trace_path?: string; readonly transcript_path?: string; + readonly transcript_raw_path?: string; readonly metrics_path?: string; readonly raw_provider_log_path?: string; readonly artifact_pointers?: ResultArtifactPointersWire; @@ -86,6 +88,7 @@ export type ArtifactPointer = export interface ArtifactPointerMap { readonly transcript_path?: string; + readonly transcript_raw_path?: string; readonly answer_path?: string; readonly transcript?: ArtifactPointer; readonly answer?: ArtifactPointer; @@ -101,14 +104,6 @@ export interface ManifestHydrationOptions { readonly hydrateTranscriptTrace?: boolean; } -function parseJsonlLines(content: string): T[] { - return content - .split(/\r?\n/) - .map((line) => line.trim()) - .filter((line) => line.length > 0) - .map((line) => JSON.parse(line) as T); -} - function parseResultRows(content: string, sourceLabel?: string): ResultManifestRecord[] { return content .split(/\r?\n/) @@ -162,33 +157,6 @@ function readOptionalJson(baseDir: string, relativePath: string | undefined): } } -function nonEmptyString(value: unknown): string | undefined { - return typeof value === 'string' && value.trim().length > 0 ? value : undefined; -} - -function artifactPointerPath(pointer: ArtifactPointer | undefined): string | undefined { - if (typeof pointer === 'string') { - return nonEmptyString(pointer); - } - if (!pointer) { - return undefined; - } - return ( - nonEmptyString(pointer.path) ?? - nonEmptyString(pointer.artifact_path) ?? - nonEmptyString(pointer.relative_path) - ); -} - -function resolveTranscriptPath(record: ResultManifestRecord): string | undefined { - return ( - record.transcript_path ?? - record.artifact_pointers?.transcript?.path ?? - record.artifacts?.transcript_path ?? - artifactPointerPath(record.transcript ?? record.artifacts?.transcript) - ); -} - function hydrateInput( baseDir: string, record: ResultManifestRecord, @@ -217,19 +185,46 @@ function hydrateOutput( return responseText.trimEnd(); } +function hydrateTraceEnvelope( + baseDir: string, + record: ResultManifestRecord, +): EvaluationResult['trace'] | undefined { + const traceWire = readOptionalJson(baseDir, record.trace_path); + if (!traceWire) { + return undefined; + } + + try { + const envelope = fromTraceEnvelopeWire(traceWire); + const summary = traceEnvelopeToTraceSummary(envelope); + return buildTraceFromMessages({ + output: traceEnvelopeToTranscriptMessages(envelope), + summary: summary.trace, + finalOutput: hydrateOutput(baseDir, record), + tokenUsage: summary.tokenUsage, + costUsd: summary.costUsd, + durationMs: summary.durationMs, + startTime: summary.startTime, + endTime: summary.endTime, + provider: envelope.source.provider, + target: record.target ?? envelope.eval.target, + testId: record.test_id ?? envelope.eval.testId, + conversationId: envelope.eval.runId, + }); + } catch { + return undefined; + } +} + function hydrateTrace( baseDir: string, record: ResultManifestRecord, options: ManifestHydrationOptions, ): EvaluationResult['trace'] { if (options.hydrateTranscriptTrace !== false) { - const transcriptText = readOptionalText(baseDir, resolveTranscriptPath(record)); - if (transcriptText) { - try { - return traceFromTranscriptJsonLines(parseJsonlLines(transcriptText)); - } catch { - // Fall through to a minimal trace below. - } + const trace = hydrateTraceEnvelope(baseDir, record); + if (trace) { + return trace; } } diff --git a/apps/cli/src/commands/results/projection-bundle.ts b/apps/cli/src/commands/results/projection-bundle.ts index 042259775..a25d45d67 100644 --- a/apps/cli/src/commands/results/projection-bundle.ts +++ b/apps/cli/src/commands/results/projection-bundle.ts @@ -94,6 +94,7 @@ export type ProjectionBundleArtifactRefs = Partial< | 'output_path' | 'answer_path' | 'transcript_path' + | 'transcript_raw_path' | 'metrics_path' | 'task_dir' | 'eval_path' @@ -178,6 +179,7 @@ function artifactRefs( output_path: indexEntry.output_path, answer_path: indexEntry.answer_path, transcript_path: indexEntry.transcript_path, + transcript_raw_path: indexEntry.transcript_raw_path, metrics_path: indexEntry.metrics_path, trace_path: tracePathFor(indexEntry), task_dir: indexEntry.task_dir, diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index d6f649fce..f32eacf69 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -858,6 +858,7 @@ function buildResultArtifactCatalog( addDirectArtifactCatalogEntry(entries, seen, record.response_path, 'artifact'); addDirectArtifactCatalogEntry(entries, seen, record.answer_path, 'answer'); addDirectArtifactCatalogEntry(entries, seen, record.transcript_path, 'transcript'); + addDirectArtifactCatalogEntry(entries, seen, record.transcript_raw_path, 'artifact'); addDirectArtifactCatalogEntry(entries, seen, recordWithTrace.trace_path, 'trace'); addDirectArtifactCatalogEntry(entries, seen, record.eval_path, 'artifact'); addDirectArtifactCatalogEntry(entries, seen, record.targets_path, 'artifact'); @@ -1153,7 +1154,8 @@ function buildRepeatTrialReadModels( const metricsPath = caseTrialArtifactPath(artifactDir, runPath, 'metrics.json'); const timingPath = caseTrialArtifactPath(artifactDir, runPath, 'timing.json'); const gradingPath = caseTrialArtifactPath(artifactDir, runPath, 'grading.json'); - const transcriptPath = caseTrialArtifactPath(artifactDir, runPath, 'transcript-raw.jsonl'); + const transcriptPath = caseTrialArtifactPath(artifactDir, runPath, 'transcript.jsonl'); + const transcriptRawPath = caseTrialArtifactPath(artifactDir, runPath, 'transcript-raw.jsonl'); const answerPath = caseTrialArtifactPath(artifactDir, runPath, 'outputs/answer.md'); const metrics = readArtifactJsonObject(baseDir, metricsPath); const timing = readArtifactJsonObject(baseDir, timingPath); @@ -1180,6 +1182,7 @@ function buildRepeatTrialReadModels( ...(timingPath && { timing_path: timingPath }), ...(gradingPath && { grading_path: gradingPath }), ...(transcriptPath && { transcript_path: transcriptPath }), + ...(transcriptRawPath && { transcript_raw_path: transcriptRawPath }), ...(answerPath && { answer_path: answerPath }), }; }); @@ -1203,6 +1206,7 @@ function attachRunDetailReadModelFields>( ...(record.timing_path && { timing_path: record.timing_path }), ...(record.metrics_path && { metrics_path: record.metrics_path }), ...(record.transcript_path && { transcript_path: record.transcript_path }), + ...(record.transcript_raw_path && { transcript_raw_path: record.transcript_raw_path }), ...(record.output_path && { output_path: record.output_path }), ...(record.answer_path && { answer_path: record.answer_path }), ...(trials && { trials }), diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index bf30802d4..6c9b67c1c 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -888,7 +888,7 @@ describe('writeArtifactsFromResults', () => { 'result.json', 'timing.json', 'transcript-raw.jsonl', - 'transcript.json', + 'transcript.jsonl', ]); const alphaGrading: GradingArtifact = JSON.parse( @@ -1044,7 +1044,7 @@ describe('writeArtifactsFromResults', () => { 'result.json', 'timing.json', 'transcript-raw.jsonl', - 'transcript.json', + 'transcript.jsonl', ]); } @@ -1060,7 +1060,8 @@ describe('writeArtifactsFromResults', () => { duration_seconds: 2, model: 'test-target', grading_path: './grading.json', - transcript_path: './transcript.json', + metrics_path: './metrics.json', + transcript_path: './transcript.jsonl', transcript_raw_path: './transcript-raw.jsonl', output_paths: { answer: './outputs/answer.md' }, timing: { @@ -1082,7 +1083,8 @@ describe('writeArtifactsFromResults', () => { ) as Record; expect(runTwoResult).toMatchObject({ grading_path: './grading.json', - transcript_path: './transcript.json', + metrics_path: './metrics.json', + transcript_path: './transcript.jsonl', transcript_raw_path: './transcript-raw.jsonl', timing: { duration_ms: 4000, @@ -1136,7 +1138,7 @@ describe('writeArtifactsFromResults', () => { expect(timingOne.duration_ms).toBe(0); }); - it('writes transcript.jsonl as provider-neutral v1 rows projected from the execution trace', async () => { + it('writes normalized transcript.jsonl rows plus raw transcript evidence', async () => { const input = [{ role: 'user' as const, content: 'Inspect artifact output' }]; const output = [ { @@ -1145,8 +1147,18 @@ describe('writeArtifactsFromResults', () => { toolCalls: [ { tool: 'Read', + id: 'read-1', input: { file_path: 'apps/cli/src/commands/eval/artifact-writer.ts' }, output: 'file contents', + status: 'ok' as const, + durationMs: 25, + }, + { + tool: 'Bash', + id: 'bash-1', + input: { command: 'bun test missing.test.ts' }, + status: 'error' as const, + durationMs: 10, }, ], }, @@ -1177,80 +1189,64 @@ describe('writeArtifactsFromResults', () => { await writeArtifactsFromResults(results, testDir); - const transcriptPath = path.join(testDir, 'transcript-case', 'run-1', 'transcript-raw.jsonl'); + const transcriptPath = path.join(testDir, 'transcript-case', 'run-1', 'transcript.jsonl'); const transcriptLines = (await readFile(transcriptPath, 'utf8')) .trim() .split('\n') .map((line) => JSON.parse(line)); - const transcriptMessages = JSON.parse( - await readFile(path.join(testDir, 'transcript-case', 'run-1', 'transcript.json'), 'utf8'), - ); + const rawTranscriptLines = ( + await readFile(path.join(testDir, 'transcript-case', 'run-1', 'transcript-raw.jsonl'), 'utf8') + ) + .trim() + .split('\n') + .map((line) => JSON.parse(line)); - expect(Array.isArray(transcriptMessages)).toBe(true); - expect(transcriptMessages).toHaveLength(2); expect(transcriptLines).toHaveLength(2); expect(transcriptLines[0]).toMatchObject({ - schema_version: 'agentv.transcript.v1', - test_id: 'transcript-case', - target: 'codex', - message_index: 0, - role: 'user', - content: 'Inspect artifact output', - transcript_token_usage: { input: 100, output: 40, cached: 10, reasoning: 5 }, - transcript_duration_ms: 4200, - transcript_cost_usd: 0.25, - capture: { content: 'full', redaction_level: 'none', redacted_fields: [] }, - trace: { - schema_version: 'agentv.trace.v1', - artifact_id: expect.any(String), - trace_id: expect.any(String), - span_id: expect.any(String), - }, - source: { - kind: 'agentv_run', - provider: 'codex', - session_id: 'session-123', - path: 'index.jsonl', - format: 'agentv_result', - version: '1', - }, - }); - expect(transcriptLines[0].source.metadata).toMatchObject({ - target: 'codex', - provider_session_id: 'session-123', - eval_case_id: 'transcript-case', + v: 1, + agent: 'codex', + type: 'user', + content: [{ type: 'text', text: 'Inspect artifact output' }], }); expect(transcriptLines[1]).toMatchObject({ - schema_version: 'agentv.transcript.v1', - test_id: 'transcript-case', - target: 'codex', - message_index: 1, - role: 'assistant', - content: 'Reading artifact-writer.ts', - tool_calls: [ + v: 1, + agent: 'codex', + type: 'assistant', + content: [ + { type: 'text', text: 'Reading artifact-writer.ts' }, { - tool: 'Read', + type: 'tool_use', + id: 'read-1', + name: 'Read', input: { file_path: 'apps/cli/src/commands/eval/artifact-writer.ts' }, - output: 'file contents', - status: 'ok', - trace: { - schema_version: 'agentv.trace.v1', - artifact_id: expect.any(String), - trace_id: expect.any(String), + result: { + status: 'success', + output: 'file contents', + duration_ms: 25, + }, + }, + { + type: 'tool_use', + id: 'bash-1', + name: 'Bash', + input: { command: 'bun test missing.test.ts' }, + result: { + status: 'error', + duration_ms: 10, }, }, ], - capture: { content: 'full', redaction_level: 'none', redacted_fields: [] }, - source: { - kind: 'agentv_run', - provider: 'codex', - session_id: 'session-123', - }, }); - expect(transcriptLines[1].tool_calls[0].trace.span_id).toBeTruthy(); - expect(transcriptLines[1]).not.toHaveProperty('provider_session_id'); - expect(transcriptLines[1]).not.toHaveProperty('providerSessionId'); + expect(transcriptLines[1]).not.toHaveProperty('schema_version'); + expect(transcriptLines[1]).not.toHaveProperty('o11y'); + expect(rawTranscriptLines[0]).toMatchObject({ + schema_version: 'agentv.transcript.v1', + test_id: 'transcript-case', + target: 'codex', + message_index: 0, + role: 'user', + }); await expect( readFile(path.join(testDir, 'transcript-case', 'transcript.json'), 'utf8'), ).rejects.toThrow(); @@ -1259,7 +1255,8 @@ describe('writeArtifactsFromResults', () => { (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), ); expect(indexLine).not.toHaveProperty('trace_path'); - expect(indexLine.transcript_path).toBe('transcript-case/run-1/transcript-raw.jsonl'); + expect(indexLine.transcript_path).toBe('transcript-case/run-1/transcript.jsonl'); + expect(indexLine.transcript_raw_path).toBe('transcript-case/run-1/transcript-raw.jsonl'); expect(indexLine.metrics_path).toBe('transcript-case/run-1/metrics.json'); expect(indexLine.metrics_path.endsWith(CANONICAL_METRICS_ARTIFACT_PATH)).toBe(true); @@ -1370,11 +1367,14 @@ describe('writeArtifactsFromResults', () => { expect(summary.schema_version).toBe(METRICS_SCHEMA_VERSION); expect(summary.source_artifacts).toMatchObject({ - trace_path: 'transcript.json', - transcript_path: 'transcript-raw.jsonl', + transcript_path: 'transcript.jsonl', grading_path: 'grading.json', timing_path: 'timing.json', }); + expect(summary.source_artifacts).not.toHaveProperty('trace_path'); + await expect( + readFile(path.join(testDir, 'summary-case', 'run-1', 'trace.json'), 'utf8'), + ).rejects.toThrow(); expect(summary.metrics.total_turns).toBe(2); expect(summary.metrics.total_tool_calls).toBe(4); expect(summary.metrics.total_steps).toBe(2); @@ -1553,7 +1553,7 @@ describe('writeArtifactsFromResults', () => { }); }); - it('copies optional raw provider logs as non-canonical evidence', async () => { + it('copies optional raw provider logs as raw transcript evidence', async () => { const rawLogPath = path.join(testDir, 'provider-source.log'); const rawLog = [ '# provider-native stream log', @@ -1578,23 +1578,30 @@ describe('writeArtifactsFromResults', () => { expect(await readFile(copiedRawLogPath, 'utf8')).toBe(rawLog); const transcriptPath = path.join(testDir, 'raw-log-case', 'run-1', 'transcript-raw.jsonl'); - await expect(readFile(transcriptPath, 'utf8')).resolves.toContain( - '"schema_version":"agentv.transcript.v1"', - ); + await expect(readFile(transcriptPath, 'utf8')).resolves.toBe(rawLog); await expect( readFile(path.join(testDir, 'raw-log-case', 'transcript.json'), 'utf8'), ).rejects.toThrow(); - const transcriptMessages = JSON.parse( - await readFile(path.join(testDir, 'raw-log-case', 'run-1', 'transcript.json'), 'utf8'), - ); - expect(Array.isArray(transcriptMessages)).toBe(true); + const transcriptLines = ( + await readFile(path.join(testDir, 'raw-log-case', 'run-1', 'transcript.jsonl'), 'utf8') + ) + .trim() + .split('\n') + .map((line) => JSON.parse(line)); + expect(transcriptLines[0]).toMatchObject({ + v: 1, + agent: 'codex', + type: 'assistant', + content: [{ type: 'text', text: 'Raw log copied' }], + }); const indexLine = JSON.parse( (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), ); expect(indexLine.raw_provider_log_path).toBeUndefined(); - expect(indexLine.transcript_path).toBe('raw-log-case/run-1/transcript-raw.jsonl'); + expect(indexLine.transcript_path).toBe('raw-log-case/run-1/transcript.jsonl'); + expect(indexLine.transcript_raw_path).toBe('raw-log-case/run-1/transcript-raw.jsonl'); expect(indexLine).not.toHaveProperty('transcript_json_path'); }); @@ -1641,7 +1648,7 @@ describe('writeArtifactsFromResults', () => { expect(JSON.stringify(indexLine)).not.toContain('api_key'); const transcriptJson = await readFile( - path.join(testDir, 'external-trace-case', 'run-1', 'transcript.json'), + path.join(testDir, 'external-trace-case', 'run-1', 'transcript.jsonl'), 'utf8', ); expect(transcriptJson).not.toContain('secret'); diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index 8e7f58c31..b39734d28 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -360,7 +360,8 @@ describe('results export', () => { metrics_path: 'privacy/test-private/run-1/metrics.json', output_path: 'privacy/test-private/run-1/outputs/answer.md', answer_path: 'privacy/test-private/run-1/outputs/answer.md', - transcript_path: 'privacy/test-private/run-1/transcript-raw.jsonl', + transcript_path: 'privacy/test-private/run-1/transcript.jsonl', + transcript_raw_path: 'privacy/test-private/run-1/transcript-raw.jsonl', trace_path: 'privacy/test-private/trace.json', }); expect(bundle.entries[0].artifact_refs).not.toHaveProperty('input_path'); @@ -429,7 +430,8 @@ describe('results export', () => { metrics_path: 'demo/test-greeting/run-1/metrics.json', output_path: 'demo/test-greeting/run-1/outputs/answer.md', answer_path: 'demo/test-greeting/run-1/outputs/answer.md', - transcript_path: 'demo/test-greeting/run-1/transcript-raw.jsonl', + transcript_path: 'demo/test-greeting/run-1/transcript.jsonl', + transcript_raw_path: 'demo/test-greeting/run-1/transcript-raw.jsonl', }); expect(entries[0]).not.toHaveProperty('input_path'); expect(entries[0].projection_identity).toMatchObject({ diff --git a/apps/cli/test/commands/results/shared.test.ts b/apps/cli/test/commands/results/shared.test.ts index 9eaab0a6e..1760071e7 100644 --- a/apps/cli/test/commands/results/shared.test.ts +++ b/apps/cli/test/commands/results/shared.test.ts @@ -4,6 +4,12 @@ import { tmpdir } from 'node:os'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; +import { + buildTraceEnvelopeFromEvaluationResult, + buildTraceFromMessages, + toTraceEnvelopeWire, +} from '@agentv/core'; + import { resolveRunManifestPath } from '../../../src/commands/eval/result-layout.js'; import { loadManifestResults } from '../../../src/commands/results/manifest.js'; import { resolveSourceFile } from '../../../src/commands/results/shared.js'; @@ -79,7 +85,7 @@ describe('results shared source resolution', () => { expect(results[0].trace.toolCalls).toEqual({ rg: 1 }); }); - it('hydrates transcripts from artifact pointers when transcript_path is absent', () => { + it('ignores legacy transcript artifact pointers when hydrating traces', () => { const runDir = path.join(tempDir, '.agentv', 'results', 'default', '2026-03-25T10-00-00-000Z'); const transcriptRelativePath = 'pointer-case/transcript.jsonl'; mkdirSync(path.join(runDir, 'pointer-case'), { recursive: true }); @@ -95,6 +101,7 @@ describe('results shared source resolution', () => { source: { provider: 'codex', session_id: 'session-pointer' }, })}\n`, ); + writeFileSync(path.join(runDir, 'pointer-case/answer.md'), 'Loaded from output\n'); const indexPath = path.join(runDir, 'index.jsonl'); writeFileSync( indexPath, @@ -105,6 +112,7 @@ describe('results shared source resolution', () => { score: 1, grading_path: 'pointer-case/grading.json', timing_path: 'pointer-case/timing.json', + answer_path: 'pointer-case/answer.md', artifact_pointers: { transcript: { ref: 'agentv/artifacts/v1', @@ -124,8 +132,168 @@ describe('results shared source resolution', () => { const results = loadManifestResults(indexPath); expect(results).toHaveLength(1); - expect(results[0].trace.messages[0]?.content).toBe('Loaded from pointer'); + expect(results[0].trace.messages[0]?.content).toBe('Loaded from output'); expect(results[0].trace.messages[0]?.role).toBe('assistant'); + expect(results[0].trace.toolCalls).toEqual({}); + }); + + it('hydrates trace evidence from trace_path when transcript_path is normalized', () => { + const runDir = path.join(tempDir, '.agentv', 'results', 'default', '2026-03-25T10-00-00-000Z'); + mkdirSync(path.join(runDir, 'trace-case'), { recursive: true }); + const trace = buildTraceFromMessages({ + output: [ + { + role: 'assistant', + content: 'Loaded from trace artifact', + toolCalls: [ + { + tool: 'shell', + id: 'tool-1', + input: { cmd: 'pwd' }, + output: '/repo', + status: 'ok', + }, + ], + }, + ], + finalOutput: 'Loaded from trace artifact', + target: 'codex', + testId: 'trace-case', + }); + const envelope = buildTraceEnvelopeFromEvaluationResult( + { + timestamp: '2026-03-25T10:00:00.000Z', + testId: 'trace-case', + target: 'codex', + score: 1, + assertions: [], + output: 'Loaded from trace artifact', + trace, + }, + { + capture: { content: 'full', redactionLevel: 'none' }, + now: () => new Date('2026-03-25T10:00:00.000Z'), + }, + ); + writeFileSync( + path.join(runDir, 'trace-case/trace.json'), + `${JSON.stringify(toTraceEnvelopeWire(envelope))}\n`, + ); + writeFileSync( + path.join(runDir, 'trace-case/transcript.jsonl'), + `${JSON.stringify({ + v: 1, + agent: 'codex', + type: 'assistant', + content: [{ type: 'text', text: 'Loaded from normalized transcript' }], + })}\n`, + ); + const indexPath = path.join(runDir, 'index.jsonl'); + writeFileSync( + indexPath, + `${JSON.stringify({ + timestamp: '2026-03-25T10:00:00.000Z', + test_id: 'trace-case', + target: 'codex', + score: 1, + trace_path: 'trace-case/trace.json', + transcript_path: 'trace-case/transcript.jsonl', + })}\n`, + ); + + const results = loadManifestResults(indexPath); + + expect(results).toHaveLength(1); + expect(results[0].trace.messages[0]?.content).toBe('Loaded from trace artifact'); + expect(results[0].trace.toolCalls).toEqual({ shell: 1 }); + }); + + it('does not hydrate trace evidence from transcript_raw_path', () => { + const runDir = path.join(tempDir, '.agentv', 'results', 'default', '2026-03-25T10-00-00-000Z'); + mkdirSync(path.join(runDir, 'raw-case'), { recursive: true }); + writeFileSync( + path.join(runDir, 'raw-case/transcript.jsonl'), + `${JSON.stringify({ + v: 1, + agent: 'codex', + type: 'assistant', + content: [{ type: 'text', text: 'Loaded from normalized transcript' }], + })}\n`, + ); + writeFileSync( + path.join(runDir, 'raw-case/transcript-raw.jsonl'), + `${JSON.stringify({ + schema_version: 'agentv.transcript.v1', + test_id: 'raw-case', + target: 'codex', + message_index: 0, + role: 'assistant', + content: 'Loaded from raw transcript', + tool_calls: [ + { + tool: 'shell', + id: 'tool-1', + input: { cmd: 'pwd' }, + output: '/repo', + status: 'ok', + }, + ], + source: { provider: 'codex', session_id: 'session-raw' }, + })}\n`, + ); + writeFileSync(path.join(runDir, 'raw-case/answer.md'), 'Loaded from output fallback\n'); + const indexPath = path.join(runDir, 'index.jsonl'); + writeFileSync( + indexPath, + `${JSON.stringify({ + timestamp: '2026-03-25T10:00:00.000Z', + test_id: 'raw-case', + target: 'codex', + score: 1, + transcript_path: 'raw-case/transcript.jsonl', + transcript_raw_path: 'raw-case/transcript-raw.jsonl', + answer_path: 'raw-case/answer.md', + })}\n`, + ); + + const results = loadManifestResults(indexPath); + + expect(results).toHaveLength(1); + expect(results[0].trace.messages[0]?.content).toBe('Loaded from output fallback'); + expect(results[0].trace.toolCalls).toEqual({}); + }); + + it('falls back to a minimal trace when only normalized transcript_path is present', () => { + const runDir = path.join(tempDir, '.agentv', 'results', 'default', '2026-03-25T10-00-00-000Z'); + mkdirSync(path.join(runDir, 'normalized-only'), { recursive: true }); + writeFileSync( + path.join(runDir, 'normalized-only/transcript.jsonl'), + `${JSON.stringify({ + v: 1, + agent: 'codex', + type: 'assistant', + content: [{ type: 'text', text: 'Normalized transcript text' }], + })}\n`, + ); + writeFileSync(path.join(runDir, 'normalized-only/answer.md'), 'Fallback answer\n'); + const indexPath = path.join(runDir, 'index.jsonl'); + writeFileSync( + indexPath, + `${JSON.stringify({ + timestamp: '2026-03-25T10:00:00.000Z', + test_id: 'normalized-only', + target: 'codex', + score: 1, + transcript_path: 'normalized-only/transcript.jsonl', + answer_path: 'normalized-only/answer.md', + })}\n`, + ); + + const results = loadManifestResults(indexPath); + + expect(results).toHaveLength(1); + expect(results[0].trace.messages[0]?.content).toBe('Fallback answer'); + expect(results[0].trace.toolCalls).toEqual({}); }); it('rejects eval-case-only rows with migration guidance', () => { diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index ce53172ff..bf19b2b0e 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -407,8 +407,10 @@ describe('agentv eval CLI', () => { expect(canonicalResults).toHaveLength(2); await expectFileExists(path.join(outputDir, 'summary.json')); for (const row of canonicalResults) { - expect(row.transcript_path).toMatch(/run-1\/transcript-raw\.jsonl$/); + expect(row.transcript_path).toMatch(/run-1\/transcript\.jsonl$/); + expect(row.transcript_raw_path).toMatch(/run-1\/transcript-raw\.jsonl$/); await expectFileExists(path.join(outputDir, row.transcript_path as string)); + await expectFileExists(path.join(outputDir, row.transcript_raw_path as string)); } } finally { await rm(fixture.baseDir, { recursive: true, force: true }); diff --git a/apps/dashboard/src/components/TranscriptTimeline.tsx b/apps/dashboard/src/components/TranscriptTimeline.tsx index eab1f3d5e..79928bff6 100644 --- a/apps/dashboard/src/components/TranscriptTimeline.tsx +++ b/apps/dashboard/src/components/TranscriptTimeline.tsx @@ -33,6 +33,8 @@ export interface TranscriptJsonLine { target: string; message_index: number; role: string; + agent?: string; + model?: string; name?: string; content?: unknown; tool_calls?: readonly Record[]; @@ -115,6 +117,70 @@ function isTranscriptJsonLine(value: unknown): value is TranscriptJsonLine { ); } +function isNormalizedTranscriptLine(value: unknown): value is Record { + return ( + isRecord(value) && + value.v === 1 && + typeof value.agent === 'string' && + (value.type === 'system' || value.type === 'user' || value.type === 'assistant') && + Array.isArray(value.content) + ); +} + +function normalizeToolUseBlock(block: Record): Record { + const result = isRecord(block.result) ? block.result : undefined; + return { + id: typeof block.id === 'string' ? block.id : undefined, + tool: typeof block.name === 'string' ? block.name : 'tool', + input: block.input, + output: result?.output, + status: typeof result?.status === 'string' ? result.status : undefined, + duration_ms: typeof result?.duration_ms === 'number' ? result.duration_ms : undefined, + metadata: isRecord(block.metadata) ? block.metadata : undefined, + }; +} + +function normalizedTranscriptLineToTimelineEntry( + value: Record, + messageIndex: number, +): TranscriptJsonLine { + const content = value.content as readonly unknown[]; + const toolCalls = content + .filter( + (block): block is Record => isRecord(block) && block.type === 'tool_use', + ) + .map(normalizeToolUseBlock); + const inputTokens = typeof value.input_tokens === 'number' ? value.input_tokens : undefined; + const outputTokens = typeof value.output_tokens === 'number' ? value.output_tokens : undefined; + const tokenUsage = + inputTokens !== undefined || outputTokens !== undefined + ? { + input: inputTokens ?? 0, + output: outputTokens ?? 0, + } + : undefined; + + return { + test_id: '', + target: value.agent as string, + message_index: messageIndex, + role: value.type as string, + agent: value.agent as string, + model: typeof value.model === 'string' ? value.model : undefined, + content, + tool_calls: toolCalls.length > 0 ? toolCalls : undefined, + start_time: typeof value.ts === 'string' ? value.ts : undefined, + token_usage: tokenUsage, + metadata: typeof value.id === 'string' ? { id: value.id } : undefined, + source: { + provider: value.agent as string, + session_id: '', + model: typeof value.model === 'string' ? value.model : undefined, + timestamp: typeof value.ts === 'string' ? value.ts : undefined, + }, + }; +} + export function parseTranscriptJsonl(rawJsonl: string): TranscriptParseResult { const entries: TranscriptJsonLine[] = []; const lines = rawJsonl.split(/\r?\n/); @@ -125,6 +191,10 @@ export function parseTranscriptJsonl(rawJsonl: string): TranscriptParseResult { try { const parsed = JSON.parse(line) as unknown; + if (isNormalizedTranscriptLine(parsed)) { + entries.push(normalizedTranscriptLineToTimelineEntry(parsed, entries.length)); + continue; + } if (!isTranscriptJsonLine(parsed)) { return { entries, @@ -220,10 +290,16 @@ function formatContent(value: unknown): string { if (isRecord(block) && block.type === 'text' && typeof block.text === 'string') { return block.text; } + if (isRecord(block) && block.type === 'thinking' && typeof block.text === 'string') { + return `Thinking:\n${block.text}`; + } + if (isRecord(block) && block.type === 'image' && typeof block.source === 'string') { + return `Image: ${block.source}`; + } return undefined; }) .filter((text): text is string => text !== undefined); - if (textBlocks.length === value.length && textBlocks.length > 0) { + if (textBlocks.length > 0) { return textBlocks.join('\n'); } } @@ -473,8 +549,8 @@ function TranscriptSummary({ transcriptPath, }: { entries: readonly TranscriptJsonLine[]; transcriptPath?: string }) { const first = entries[0]; - const provider = first?.source?.provider; - const model = first?.source?.model; + const provider = first?.source?.provider ?? first?.agent; + const model = first?.source?.model ?? first?.model; const sessionId = first?.source?.session_id; const duration = formatDurationMs(first?.transcript_duration_ms); const tokenUsage = formatTokenUsage(first?.transcript_token_usage); diff --git a/apps/dashboard/src/components/__fixtures__/structured-transcript.ts b/apps/dashboard/src/components/__fixtures__/structured-transcript.ts index 0a51654de..fa6e1e36a 100644 --- a/apps/dashboard/src/components/__fixtures__/structured-transcript.ts +++ b/apps/dashboard/src/components/__fixtures__/structured-transcript.ts @@ -2,78 +2,44 @@ import type { FileNode } from '~/lib/types'; export const structuredTranscriptJsonl = [ { - schema_version: 'agentv.transcript.v1', - test_id: 'final-json-answer', - target: 'codex', - message_index: 0, - role: 'user', - content: 'Inspect the workspace and return the final JSON only.', - capture: { content: 'full', redaction_level: 'none', redacted_fields: [] }, - source: { - kind: 'agentv_run', - provider: 'codex', - session_id: 'session-123', - model: 'gpt-5-codex', - }, - transcript_token_usage: { input: 120, output: 80 }, - transcript_duration_ms: 2450, - transcript_cost_usd: 0.0123, + v: 1, + agent: 'codex', + model: 'gpt-5-codex', + type: 'user', + ts: '2026-06-26T12:00:00.000Z', + content: [{ type: 'text', text: 'Inspect the workspace and return the final JSON only.' }], }, { - schema_version: 'agentv.transcript.v1', - test_id: 'final-json-answer', - target: 'codex', - message_index: 1, - role: 'assistant', - content: 'I will inspect the file before answering.', - tool_calls: [ + v: 1, + agent: 'codex', + model: 'gpt-5-codex', + type: 'assistant', + ts: '2026-06-26T12:00:01.000Z', + input_tokens: 120, + output_tokens: 80, + content: [ + { type: 'text', text: 'I will inspect the file before answering.' }, { + type: 'tool_use', id: 'call-read-1', - tool: 'read_file', + name: 'read_file', input: { path: 'src/app.ts' }, - duration_ms: 32, metadata: { cwd: '/tmp/agentv-fixture' }, + result: { + status: 'success', + output: { ok: true, text: 'export const answer = 42;' }, + duration_ms: 32, + }, }, ], - capture: { content: 'full', redaction_level: 'none', redacted_fields: [] }, - source: { - kind: 'agentv_run', - provider: 'codex', - session_id: 'session-123', - model: 'gpt-5-codex', - }, }, { - schema_version: 'agentv.transcript.v1', - test_id: 'final-json-answer', - target: 'codex', - message_index: 2, - role: 'tool', - name: 'read_file', - content: { ok: true, text: 'export const answer = 42;' }, - duration_ms: 32, - capture: { content: 'full', redaction_level: 'none', redacted_fields: [] }, - source: { - kind: 'agentv_run', - provider: 'codex', - session_id: 'session-123', - model: 'gpt-5-codex', - }, - }, - { - schema_version: 'agentv.transcript.v1', - test_id: 'final-json-answer', - target: 'codex', - message_index: 3, - role: 'assistant', - content: '{"answer":42,"source":"src/app.ts"}', - capture: { content: 'full', redaction_level: 'none', redacted_fields: [] }, - source: { - kind: 'agentv_run', - provider: 'codex', - session_id: 'session-123', - model: 'gpt-5-codex', - }, + v: 1, + agent: 'codex', + model: 'gpt-5-codex', + type: 'assistant', + ts: '2026-06-26T12:00:02.000Z', + content: [{ type: 'text', text: '{"answer":42,"source":"src/app.ts"}' }], }, ] .map((line) => JSON.stringify(line)) diff --git a/apps/dashboard/src/components/transcript-timeline.test.tsx b/apps/dashboard/src/components/transcript-timeline.test.tsx index 98bee480b..bba7b936c 100644 --- a/apps/dashboard/src/components/transcript-timeline.test.tsx +++ b/apps/dashboard/src/components/transcript-timeline.test.tsx @@ -17,13 +17,9 @@ describe('TranscriptTimeline', () => { const parsed = parseTranscriptJsonl(structuredTranscriptJsonl); expect(parsed.error).toBeUndefined(); - expect(parsed.entries.map((entry) => entry.role)).toEqual([ - 'user', - 'assistant', - 'tool', - 'assistant', - ]); + expect(parsed.entries.map((entry) => entry.role)).toEqual(['user', 'assistant', 'assistant']); expect(parsed.entries[1].tool_calls?.[0]?.tool).toBe('read_file'); + expect(parsed.entries[1].tool_calls?.[0]?.status).toBe('success'); }); it('rejects malformed optional tool_calls fields before rendering', () => { @@ -68,10 +64,10 @@ describe('TranscriptTimeline', () => { expect(html).toContain('Transcript timeline'); expect(html).toContain('User'); expect(html).toContain('Assistant'); - expect(html).toContain('Tool result'); expect(html).toContain('read_file'); expect(html).toContain('Arguments'); expect(html).toContain('Result'); + expect(html).toContain('success'); expect(html).toContain('Open raw JSONL'); expect(html).toContain('Download JSONL'); expect(html).toContain('{"answer":42,"source":"src/app.ts"}'); diff --git a/apps/dashboard/src/lib/types.ts b/apps/dashboard/src/lib/types.ts index e790c8558..74f72e6ae 100644 --- a/apps/dashboard/src/lib/types.ts +++ b/apps/dashboard/src/lib/types.ts @@ -120,6 +120,7 @@ export interface EvalCaseTrial { timing_path?: string; grading_path?: string; transcript_path?: string; + transcript_raw_path?: string; answer_path?: string; } @@ -255,6 +256,7 @@ export interface EvalResult { timing_path?: string; metrics_path?: string; transcript_path?: string; + transcript_raw_path?: string; output_path?: string; answer_path?: string; } @@ -297,6 +299,7 @@ export type TranscriptArtifactStatus = 'ok' | 'missing' | 'dangling' | 'unsuppor export interface TranscriptArtifactResponse { status: TranscriptArtifactStatus; transcript_path?: string; + transcript_raw_path?: string; answer_path?: string; answer_content?: string; content?: string; diff --git a/apps/web/src/content/docs/docs/tools/results.mdx b/apps/web/src/content/docs/docs/tools/results.mdx index 600a57791..ccc27f89c 100644 --- a/apps/web/src/content/docs/docs/tools/results.mdx +++ b/apps/web/src/content/docs/docs/tools/results.mdx @@ -121,15 +121,17 @@ token/cost usage. Every case uses aggregate `summary.json`, then stores attempt details under `run-N/`. Each `run-N/` contains a compact per-attempt manifest `result.json`, -`grading.json`, `metrics.json`, `timing.json`, `transcript.json`, +`grading.json`, `metrics.json`, `timing.json`, `transcript.jsonl`, `transcript-raw.jsonl`, and `outputs/answer.md`. The `result.json` file carries -`grading_path`, transcript/output paths, and embedded timing/o11y metrics. - -`transcript-raw.jsonl` remains the ordered conversational/log compatibility -projection. Full trace detail stays in `trace.json` (`agentv.trace.v1`) when -emitted. `summary.json` remains the run-level aggregate summary, and -`index.jsonl` carries lightweight explicit paths such as `metrics_path` plus -the trace/transcript artifact pointers used for detached payload publishing. +`grading_path`, `metrics_path`, transcript, and output paths. + +`transcript-raw.jsonl` preserves native provider or harness transcript bytes +when they are available, while `transcript.jsonl` is the normalized +conversation transcript with joined `tool_use.result` blocks. Full trace detail +stays in `trace.json` (`agentv.trace.v1`) when emitted. `summary.json` remains +the run-level aggregate summary, and `index.jsonl` carries lightweight explicit +paths such as `transcript_path`, `transcript_raw_path`, and `metrics_path` plus +artifact pointers only when detached payload publishing needs them. Duration, token, and cost usage remains in `timing.json`, including source labels such as `provider_reported`, `token_estimated`, `aggregate`, or `unavailable`. @@ -162,11 +164,11 @@ Agent Skills eval artifacts map into AgentV like this: |----------------------|--------------|-------------------| | Authored `evals/evals.json` cases | AgentV eval cases and task bundle paths | Eval source plus optional `task_dir`, `eval_path`, `targets_path`, `files_path`, and `graders_path` in `index.jsonl` | | Per-case answer | Generated target output artifact | `run-N/outputs/answer.md` | -| Per-attempt sidecars | Trace, transcript, metrics, and raw provider evidence | `run-N/transcript.json`, `run-N/transcript-raw.jsonl`, `run-N/metrics.json`, `provider.log` when present | +| Per-attempt sidecars | Trace, normalized transcript, metrics, and raw provider evidence | `run-N/transcript.jsonl`, `run-N/transcript-raw.jsonl`, `run-N/metrics.json`, `provider.log` when present | | Per-attempt `timing.json` | Duration, token totals, cost, and usage source labels | `run-N/timing.json` | | Per-attempt `grading.json` | Assertions, graders, execution metrics, workspace changes | `run-N/grading.json`; summary fields can reference the same trace/result facts | | Iteration-level `summary.json` | Pass rate, time, tokens, tool calls, cost aggregates | Run-level `summary.json` | -| Transcript/log outlier analysis | Ordered transcript and canonical trace | `transcript.jsonl` for log compatibility; `trace.json` for full detail | +| Transcript/log outlier analysis | Normalized transcript, raw evidence, and canonical trace | `transcript.jsonl` for portable review; `transcript-raw.jsonl` for native evidence; `trace.json` for full detail | | Aggregate pass rate/time/tokens/delta | Run summaries and comparison tooling | `summary.json`, result comparisons, and projection bundles | ### Vendor-neutral projection bundle diff --git a/packages/core/src/evaluation/metrics.ts b/packages/core/src/evaluation/metrics.ts index b0243a9d2..4c367d175 100644 --- a/packages/core/src/evaluation/metrics.ts +++ b/packages/core/src/evaluation/metrics.ts @@ -186,7 +186,7 @@ export const MetricsArtifactWireSchema = z .strict(), source_artifacts: z .object({ - trace_path: z.string(), + trace_path: z.string().optional(), transcript_path: z.string().optional(), grading_path: z.string().optional(), timing_path: z.string().optional(), @@ -864,7 +864,7 @@ export function buildMetricsArtifact( generatedAt?: string; } = {}, ): MetricsArtifactWire { - const tracePath = options.tracePath ?? CANONICAL_TRACE_ARTIFACT_PATH; + const tracePath = options.tracePath; return MetricsArtifactWireSchema.parse( dropUndefined({ schema_version: METRICS_SCHEMA_VERSION, @@ -879,7 +879,7 @@ export function buildMetricsArtifact( artifact_id: envelope.artifactId, trace_id: envelope.trace.traceId, root_span_id: envelope.trace.rootSpanId, - path: tracePath, + path: tracePath ?? CANONICAL_TRACE_ARTIFACT_PATH, }, source_artifacts: dropUndefined({ trace_path: tracePath, diff --git a/packages/core/src/evaluation/providers/pi-cli.ts b/packages/core/src/evaluation/providers/pi-cli.ts index d06135ea0..f1dcf6dba 100644 --- a/packages/core/src/evaluation/providers/pi-cli.ts +++ b/packages/core/src/evaluation/providers/pi-cli.ts @@ -680,9 +680,49 @@ function extractMessages(events: unknown[]): readonly Message[] { * Scan JSONL events for tool_execution_start / tool_execution_end pairs and * reconstruct ToolCall objects from them. */ +function eventTimestampIso(record: Record): string | undefined { + const timestamp = record.timestamp ?? record.time; + if (typeof timestamp === 'string') { + return timestamp; + } + if (typeof timestamp === 'number' && Number.isFinite(timestamp)) { + return new Date(timestamp).toISOString(); + } + return undefined; +} + +function deriveDurationMs( + startTime: string | undefined, + endTime: string | undefined, +): number | undefined { + if (!startTime || !endTime) { + return undefined; + } + const start = Date.parse(startTime); + const end = Date.parse(endTime); + if (!Number.isFinite(start) || !Number.isFinite(end) || end < start) { + return undefined; + } + return end - start; +} + +function toolStatusFromEvent(record: Record): ToolCall['status'] | undefined { + if (record.error) { + return 'error'; + } + const status = record.status; + if (status === 'ok' || status === 'error' || status === 'timeout' || status === 'cancelled') { + return status; + } + return undefined; +} + function extractToolCallsFromEvents(events: unknown[]): ToolCall[] { - const starts = new Map(); - const results = new Map(); + const starts = new Map(); + const results = new Map< + string, + { output: unknown; status?: ToolCall['status']; endTime?: string; durationMs?: number } + >(); for (const event of events) { if (!event || typeof event !== 'object') continue; @@ -690,50 +730,89 @@ function extractToolCallsFromEvents(events: unknown[]): ToolCall[] { const type = r.type; if (type === 'tool_execution_start' && typeof r.toolName === 'string') { const id = typeof r.toolCallId === 'string' ? r.toolCallId : undefined; - starts.set(id ?? `anon-${starts.size}`, { tool: r.toolName, input: r.args }); + starts.set(id ?? `anon-${starts.size}`, { + tool: r.toolName, + input: r.args, + startTime: eventTimestampIso(r), + }); } else if (type === 'tool_execution_end') { const id = typeof r.toolCallId === 'string' ? r.toolCallId : undefined; - if (id) results.set(id, r.result); + if (id) { + results.set(id, { + output: r.result, + status: toolStatusFromEvent(r) ?? (Object.hasOwn(r, 'result') ? 'ok' : undefined), + endTime: eventTimestampIso(r), + durationMs: toFiniteNumber(r.durationMs ?? r.duration_ms), + }); + } } } const toolCalls: ToolCall[] = []; - for (const [id, { tool, input }] of starts) { + for (const [id, { tool, input, startTime }] of starts) { + const result = results.get(id); toolCalls.push( normalizeToolCall('pi-cli', { tool, input: input as Record | undefined, id: id.startsWith('anon-') ? undefined : id, - output: results.get(id), + output: result?.output, + status: result?.status, + startTime, + endTime: result?.endTime, + durationMs: result?.durationMs ?? deriveDurationMs(startTime, result?.endTime), }), ); } return toolCalls; } +function toolCallDedupKey(toolCall: ToolCall): string { + return `${toolCall.tool}:${JSON.stringify(toolCall.input)}`; +} + +function mergeToolCallEvidence(existing: ToolCall, eventToolCall: ToolCall): ToolCall { + return { + ...existing, + output: existing.output ?? eventToolCall.output, + status: existing.status ?? eventToolCall.status, + startTime: existing.startTime ?? eventToolCall.startTime, + endTime: existing.endTime ?? eventToolCall.endTime, + durationMs: existing.durationMs ?? eventToolCall.durationMs, + }; +} + /** - * Merge event-sourced tool calls into messages. For each tool call, if it - * already exists (by id) in some message, skip it. Otherwise, append it to - * the last assistant message (creating one if needed). + * Merge event-sourced tool calls into messages. Existing calls are enriched + * with stream result/timing evidence; missing calls are appended to the last + * assistant message, creating one if needed. */ function injectEventToolCalls(messages: Message[], eventToolCalls: ToolCall[]): void { - const existingIds = new Set(); - const existingTools = new Set(); - for (const msg of messages) { - if (!msg.toolCalls) continue; - for (const tc of msg.toolCalls) { - if (tc.id) existingIds.add(tc.id); - // Track tool+input combos to avoid duplicates when there's no id - existingTools.add(`${tc.tool}:${JSON.stringify(tc.input)}`); + const missing: ToolCall[] = []; + for (const eventToolCall of eventToolCalls) { + let merged = false; + for (const [messageIndex, msg] of messages.entries()) { + if (!msg.toolCalls) continue; + const toolIndex = msg.toolCalls.findIndex((toolCall) => { + if (eventToolCall.id && toolCall.id === eventToolCall.id) { + return true; + } + return toolCallDedupKey(toolCall) === toolCallDedupKey(eventToolCall); + }); + if (toolIndex < 0) { + continue; + } + const toolCalls = [...msg.toolCalls]; + toolCalls[toolIndex] = mergeToolCallEvidence(toolCalls[toolIndex], eventToolCall); + messages[messageIndex] = { ...msg, toolCalls }; + merged = true; + break; + } + if (!merged) { + missing.push(eventToolCall); } } - const missing = eventToolCalls.filter((tc) => { - if (tc.id && existingIds.has(tc.id)) return false; - if (existingTools.has(`${tc.tool}:${JSON.stringify(tc.input)}`)) return false; - return true; - }); - if (missing.length === 0) return; // Find the last assistant message and replace it with an enriched copy diff --git a/packages/core/src/evaluation/result-row-schema.ts b/packages/core/src/evaluation/result-row-schema.ts index 709684547..8dfd9fae3 100644 --- a/packages/core/src/evaluation/result-row-schema.ts +++ b/packages/core/src/evaluation/result-row-schema.ts @@ -46,6 +46,7 @@ const RESULT_ROW_ALIASES = { tokenUsage: 'token_usage', tracePath: 'trace_path', transcriptPath: 'transcript_path', + transcriptRawPath: 'transcript_raw_path', workspacePath: 'workspace_path', } as const; diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index 9d23fb2b5..e4040aad1 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -11,7 +11,10 @@ import { createHash } from 'node:crypto'; import { copyFile, mkdir, readFile, writeFile } from 'node:fs/promises'; import path from 'node:path'; -import { traceEnvelopeToTranscriptJsonLines } from '../import/types.js'; +import { + traceEnvelopeToNormalizedTranscriptJsonLines, + traceEnvelopeToTranscriptJsonLines, +} from '../import/types.js'; import type { ExperimentArtifactMetadata } from './experiment.js'; import { type ExternalTraceMetadataWire, @@ -302,6 +305,7 @@ export interface IndexArtifactEntry { readonly answer_path?: string; readonly trace_path?: string; readonly transcript_path?: string; + readonly transcript_raw_path?: string; readonly metrics_path?: string; readonly artifact_pointers?: ResultArtifactPointersWire; readonly raw_provider_log_path?: string; @@ -349,6 +353,7 @@ export interface VercelRunResultArtifact { readonly duration_seconds: number; readonly model: string; readonly grading_path: string; + readonly metrics_path: string; readonly transcript_path?: string; readonly transcript_raw_path?: string; readonly o11y: { @@ -743,7 +748,8 @@ function buildVercelRunResultArtifact(params: { duration_seconds: resultDurationSeconds(params.result), model: params.result.target ?? 'unknown', grading_path: './grading.json', - transcript_path: params.hasTranscript ? './transcript.json' : undefined, + metrics_path: `./${CANONICAL_METRICS_ARTIFACT_PATH}`, + transcript_path: params.hasTranscript ? `./${CANONICAL_TRANSCRIPT_ARTIFACT_PATH}` : undefined, transcript_raw_path: params.hasTranscript ? './transcript-raw.jsonl' : undefined, o11y: { total_turns: metrics.total_turns, @@ -822,7 +828,9 @@ async function writeTrialRunArtifacts(params: { duplicatePolicy: params.duplicatePolicy, }); const hasTranscript = hasTranscriptProjection(result, envelope); - const transcriptPath = hasTranscript ? path.join(runDir, 'transcript.json') : undefined; + const transcriptPath = hasTranscript + ? path.join(runDir, CANONICAL_TRANSCRIPT_ARTIFACT_PATH) + : undefined; const transcriptRawPath = hasTranscript ? path.join(runDir, 'transcript-raw.jsonl') : undefined; await mkdir(runDir, { recursive: true }); @@ -838,19 +846,14 @@ async function writeTrialRunArtifacts(params: { await copyRawProviderLogArtifact(rawProviderLogSource, runDir); } if (transcriptPath && transcriptRawPath) { - await writeFile( - transcriptPath, - `${JSON.stringify(traceEnvelopeToTranscriptMessages(envelope), null, 2)}\n`, - 'utf8', - ); - await writeTranscriptJsonl(transcriptRawPath, result, envelope); + await writeNormalizedTranscriptJsonl(transcriptPath, envelope); + await writeRawTranscriptJsonl(transcriptRawPath, result, envelope); } const metricsArtifact = await writeMetricsArtifact({ filePath: metricsPath, result, envelope, - traceArtifactPath: 'transcript.json', - transcriptArtifactPath: transcriptRawPath ? 'transcript-raw.jsonl' : undefined, + transcriptArtifactPath: transcriptPath ? CANONICAL_TRANSCRIPT_ARTIFACT_PATH : undefined, gradingArtifactPath: 'grading.json', timingArtifactPath: 'timing.json', timing, @@ -1446,6 +1449,7 @@ export function buildIndexArtifactEntry( answerPath?: string; tracePath?: string; transcriptPath?: string; + transcriptRawPath?: string; metricsPath?: string; artifactPointers?: ResultArtifactPointersWire; rawProviderLogPath?: string; @@ -1499,6 +1503,9 @@ export function buildIndexArtifactEntry( transcript_path: options.transcriptPath ? toRelativeArtifactPath(options.outputDir, options.transcriptPath) : undefined, + transcript_raw_path: options.transcriptRawPath + ? toRelativeArtifactPath(options.outputDir, options.transcriptRawPath) + : undefined, metrics_path: options.metricsPath ? toRelativeArtifactPath(options.outputDir, options.metricsPath) : undefined, @@ -1564,6 +1571,10 @@ export function buildResultIndexArtifact( answer_path: isSingleRun && hasAnswer ? path.posix.join(singleRunDir, 'outputs', 'answer.md') : undefined, transcript_path: + isSingleRun && hasTranscript + ? path.posix.join(singleRunDir, CANONICAL_TRANSCRIPT_ARTIFACT_PATH) + : undefined, + transcript_raw_path: isSingleRun && hasTranscript ? path.posix.join(singleRunDir, 'transcript-raw.jsonl') : undefined, @@ -1588,7 +1599,17 @@ function hasTranscriptProjection(result: EvaluationResult, envelope: TraceEnvelo return result.output.length > 0 || traceEnvelopeToTranscriptMessages(envelope).length > 0; } -async function writeTranscriptJsonl( +async function writeNormalizedTranscriptJsonl( + filePath: string, + envelope: TraceEnvelope, +): Promise { + const lines = traceEnvelopeToNormalizedTranscriptJsonLines(envelope); + const content = + lines.length > 0 ? `${lines.map((line) => JSON.stringify(line)).join('\n')}\n` : ''; + await writeFile(filePath, content, 'utf8'); +} + +async function writeGeneratedRawTranscriptJsonl( filePath: string, result: EvaluationResult, envelope: TraceEnvelope, @@ -1602,6 +1623,19 @@ async function writeTranscriptJsonl( await writeFile(filePath, content, 'utf8'); } +async function writeRawTranscriptJsonl( + filePath: string, + result: EvaluationResult, + envelope: TraceEnvelope, +): Promise { + const rawSource = rawProviderLogSourcePath(result); + if (rawSource) { + await copyFile(rawSource, filePath); + return; + } + await writeGeneratedRawTranscriptJsonl(filePath, result, envelope); +} + function buildMetricsArtifactPayload(params: { readonly result: EvaluationResult; readonly envelope: TraceEnvelope; @@ -1613,7 +1647,7 @@ function buildMetricsArtifactPayload(params: { readonly timing?: TimingArtifact; }): ReturnType & { readonly timing?: TimingArtifact } { const artifact = buildMetricsArtifact(params.result, params.envelope, { - tracePath: params.traceArtifactPath ?? CANONICAL_TRACE_ARTIFACT_PATH, + tracePath: params.traceArtifactPath, transcriptPath: params.transcriptArtifactPath ?? (params.transcriptPath ? CANONICAL_TRANSCRIPT_ARTIFACT_PATH : undefined), @@ -2005,6 +2039,10 @@ export async function writePerTestArtifacts( ? path.join(singleRunDir, 'outputs', 'answer.md') : undefined; const singleTranscriptPath = + isSingleRun && hasTranscriptProjection(result, envelope) + ? path.join(singleRunDir, CANONICAL_TRANSCRIPT_ARTIFACT_PATH) + : undefined; + const singleTranscriptRawPath = isSingleRun && hasTranscriptProjection(result, envelope) ? path.join(singleRunDir, 'transcript-raw.jsonl') : undefined; @@ -2033,6 +2071,7 @@ export async function writePerTestArtifacts( outputPath: singleAnswerPath, answerPath: singleAnswerPath, transcriptPath: singleTranscriptPath, + transcriptRawPath: singleTranscriptRawPath, extraIndexFields, projectionIdentity, duplicatePolicy, @@ -2098,6 +2137,10 @@ export async function writeArtifactsFromResults( ? path.join(singleRunDir, 'outputs', 'answer.md') : undefined; const singleTranscriptPath = + isSingleRun && hasTranscriptProjection(result, envelope) + ? path.join(singleRunDir, CANONICAL_TRANSCRIPT_ARTIFACT_PATH) + : undefined; + const singleTranscriptRawPath = isSingleRun && hasTranscriptProjection(result, envelope) ? path.join(singleRunDir, 'transcript-raw.jsonl') : undefined; @@ -2114,6 +2157,7 @@ export async function writeArtifactsFromResults( isSingleRun, singleAnswerPath, singleTranscriptPath, + singleTranscriptRawPath, singleGradingPath, singleTimingPath, singleMetricsPath, @@ -2190,6 +2234,7 @@ export async function writeArtifactsFromResults( outputPath: plan.singleAnswerPath, answerPath: plan.singleAnswerPath, transcriptPath: plan.singleTranscriptPath, + transcriptRawPath: plan.singleTranscriptRawPath, extraIndexFields, projectionIdentity: plan.projectionIdentity, duplicatePolicy, diff --git a/packages/core/src/import/types.ts b/packages/core/src/import/types.ts index 1c85d229c..ccc174f7b 100644 --- a/packages/core/src/import/types.ts +++ b/packages/core/src/import/types.ts @@ -133,6 +133,60 @@ export interface TranscriptJsonLine { }; } +export type NormalizedTranscriptTurnType = 'system' | 'user' | 'assistant'; +export type NormalizedToolResultStatus = 'success' | 'error' | 'cancelled' | 'unknown'; + +export interface NormalizedTranscriptRawRef { + readonly line?: number; + readonly start_line?: number; + readonly end_line?: number; + readonly id?: string; +} + +export type NormalizedTranscriptContentBlock = + | { + readonly type: 'text'; + readonly text: string; + readonly raw_refs?: readonly NormalizedTranscriptRawRef[]; + } + | { + readonly type: 'tool_use'; + readonly id: string; + readonly name: string; + readonly input: unknown; + readonly result?: { + readonly status: NormalizedToolResultStatus; + readonly output?: unknown; + readonly duration_ms?: number; + }; + readonly raw_refs?: readonly NormalizedTranscriptRawRef[]; + readonly metadata?: Readonly>; + } + | { + readonly type: 'image'; + readonly source: string; + readonly mime_type?: string; + readonly metadata?: Readonly>; + } + | { + readonly type: 'thinking'; + readonly text: string; + readonly raw_refs?: readonly NormalizedTranscriptRawRef[]; + }; + +export interface NormalizedTranscriptJsonLine { + readonly v: 1; + readonly agent: string; + readonly type: NormalizedTranscriptTurnType; + readonly content: readonly NormalizedTranscriptContentBlock[]; + readonly ts?: string; + readonly id?: string; + readonly model?: string; + readonly input_tokens?: number; + readonly output_tokens?: number; + readonly raw_refs?: readonly NormalizedTranscriptRawRef[]; +} + /** * Grouped replayable transcript reconstructed from per-message rows. */ @@ -416,6 +470,244 @@ function projectedToolCalls( }); } +function normalizedTurnType(role: string): NormalizedTranscriptTurnType | undefined { + if (role === 'system' || role === 'user' || role === 'assistant') { + return role; + } + return undefined; +} + +function normalizeToolResultStatus( + status: ToolCall['status'] | undefined, +): NormalizedToolResultStatus { + if (status === 'ok') return 'success'; + if (status === 'cancelled') return 'cancelled'; + if (status === 'error' || status === 'timeout') return 'error'; + return 'unknown'; +} + +function normalizedToolResult( + toolCall: ToolCall, +): Extract['result'] | undefined { + const hasResult = + toolCall.status !== undefined || + toolCall.durationMs !== undefined || + toolCall.output !== undefined; + if (!hasResult) { + return undefined; + } + return dropUndefined({ + status: normalizeToolResultStatus(toolCall.status), + output: toolCall.output, + duration_ms: toolCall.durationMs, + }) as Extract['result']; +} + +function normalizedToolMetadata(toolCall: ToolCall): Record | undefined { + const metadata = dropUndefined({ + start_time: toolCall.startTime, + end_time: toolCall.endTime, + }); + return Object.keys(metadata).length > 0 ? metadata : undefined; +} + +function normalizedToolBlock( + toolCall: ToolCall, + messageIndex: number, + toolIndex: number, +): NormalizedTranscriptContentBlock { + return dropUndefined({ + type: 'tool_use', + id: toolCall.id ?? `tool_${messageIndex + 1}_${toolIndex + 1}`, + name: toolCall.tool, + input: toolCall.input ?? {}, + result: normalizedToolResult(toolCall), + metadata: normalizedToolMetadata(toolCall), + }) as NormalizedTranscriptContentBlock; +} + +function normalizedImageMetadata( + block: Record, +): Record | undefined { + const metadata = Object.fromEntries( + Object.entries(block).filter( + ([key]) => key !== 'type' && key !== 'source' && key !== 'media_type' && key !== 'mime_type', + ), + ); + return Object.keys(metadata).length > 0 ? metadata : undefined; +} + +function normalizedContentBlocks( + message: Message, + messageIndex: number, +): NormalizedTranscriptContentBlock[] { + const blocks: NormalizedTranscriptContentBlock[] = []; + const content = message.content; + + if (typeof content === 'string') { + if (content.length > 0) { + blocks.push({ type: 'text', text: content }); + } + } else if (Array.isArray(content)) { + for (const contentBlock of content) { + const block: unknown = contentBlock; + if (!isRecord(block) || typeof block.type !== 'string') { + continue; + } + if (block.type === 'text' && typeof block.text === 'string') { + blocks.push({ type: 'text', text: block.text }); + } else if ( + (block.type === 'thinking' || block.type === 'reasoning') && + typeof block.text === 'string' + ) { + blocks.push({ type: 'thinking', text: block.text }); + } else if (block.type === 'image' && typeof block.source === 'string') { + blocks.push( + dropUndefined({ + type: 'image', + source: block.source, + mime_type: + typeof block.mime_type === 'string' + ? block.mime_type + : typeof block.media_type === 'string' + ? block.media_type + : undefined, + metadata: normalizedImageMetadata(block), + }) as NormalizedTranscriptContentBlock, + ); + } + } + } + + for (const [toolIndex, toolCall] of (message.toolCalls ?? []).entries()) { + blocks.push(normalizedToolBlock(toolCall, messageIndex, toolIndex)); + } + + return blocks; +} + +function modelFromSource(source: TranscriptJsonLine['source']): string | undefined { + if (source.model) { + return source.model; + } + const model = source.metadata?.model; + return typeof model === 'string' && model.length > 0 ? model : undefined; +} + +function normalizedTurnId(message: Message): string | undefined { + const metadata = message.metadata; + if (!metadata) { + return undefined; + } + for (const key of ['message_id', 'id', 'span_id']) { + const value = metadata[key]; + if (typeof value === 'string' && value.length > 0) { + return value; + } + } + return undefined; +} + +function applyToolResultToPriorTurn( + turns: NormalizedTranscriptJsonLine[], + message: Message, + messageIndex: number, +): boolean { + const name = message.name; + for (let turnIndex = turns.length - 1; turnIndex >= 0; turnIndex -= 1) { + const turn = turns[turnIndex]; + const content = [...turn.content]; + const blockIndex = content.findIndex((block) => { + if (block.type !== 'tool_use' || block.result !== undefined) { + return false; + } + return name ? block.name === name || block.id === name : true; + }); + if (blockIndex < 0) { + continue; + } + const block = content[blockIndex]; + if (block.type !== 'tool_use') { + return false; + } + content[blockIndex] = { + ...block, + result: { + status: 'success', + output: message.content, + duration_ms: message.durationMs, + }, + }; + turns[turnIndex] = { ...turn, content }; + return true; + } + + turns.push({ + v: 1, + agent: 'agentv', + type: 'assistant', + content: [ + { + type: 'tool_use', + id: normalizedTurnId(message) ?? `tool_${messageIndex + 1}`, + name: name ?? 'tool', + input: {}, + result: { + status: 'success', + output: message.content, + duration_ms: message.durationMs, + }, + }, + ], + ts: message.startTime ?? message.endTime, + }); + return true; +} + +export function traceEnvelopeToNormalizedTranscriptJsonLines( + envelope: TraceEnvelope, +): NormalizedTranscriptJsonLine[] { + const messages = traceEnvelopeToTranscriptMessages(envelope); + const summary = traceEnvelopeToTraceSummary(envelope); + const source = sourceFromEnvelope(envelope, summary); + const agent = source.provider ?? envelope.eval.target ?? 'agentv'; + const model = modelFromSource(source); + const turns: NormalizedTranscriptJsonLine[] = []; + + messages.forEach((message, index) => { + if (message.role === 'tool' || message.role === 'function') { + applyToolResultToPriorTurn(turns, message, index); + return; + } + + const type = normalizedTurnType(message.role); + if (!type) { + return; + } + + const content = normalizedContentBlocks(message, index); + if (content.length === 0) { + return; + } + + turns.push( + dropUndefined({ + v: 1, + agent, + type, + ts: message.startTime, + id: normalizedTurnId(message), + model, + input_tokens: type === 'assistant' ? message.tokenUsage?.input : undefined, + output_tokens: type === 'assistant' ? message.tokenUsage?.output : undefined, + content, + }) as unknown as NormalizedTranscriptJsonLine, + ); + }); + + return turns; +} + export function traceEnvelopeToTranscriptJsonLines( envelope: TraceEnvelope, options?: { testId?: string; target?: string }, diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 461b3486f..64b2cb6b7 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -756,7 +756,7 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, '{"event":"provider-native"}\n', ); expect(readdirSync(runDir)).toContain('transcript-raw.jsonl'); - expect(readdirSync(runDir)).toContain('transcript.json'); + expect(readdirSync(runDir)).toContain('transcript.jsonl'); expect(readdirSync(outputsDir)).not.toContain('transcript.jsonl'); expect(readdirSync(outputsDir)).not.toContain('transcript.json'); @@ -766,7 +766,10 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, .map((line) => JSON.parse(line) as Record); expect(indexRows[0]?.raw_provider_log_path).toBeUndefined(); expect(indexRows[0]?.trace_path).toBeUndefined(); - expect(indexRows[0]?.transcript_path).toBe('test-dataset/case-1/run-1/transcript-raw.jsonl'); + expect(indexRows[0]?.transcript_path).toBe('test-dataset/case-1/run-1/transcript.jsonl'); + expect(indexRows[0]?.transcript_raw_path).toBe( + 'test-dataset/case-1/run-1/transcript-raw.jsonl', + ); }); it('reports failed progress status for batch item errors', async () => { diff --git a/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts b/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts index 55bf24352..3691722c5 100644 --- a/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts +++ b/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts @@ -1,5 +1,9 @@ import { describe, expect, it } from 'vitest'; import { _internal } from '../../../src/evaluation/providers/pi-cli.js'; +import { buildTraceEnvelopeFromEvaluationResult } from '../../../src/evaluation/trace-envelope.js'; +import { buildTraceFromMessages } from '../../../src/evaluation/trace.js'; +import type { EvaluationResult } from '../../../src/evaluation/types.js'; +import { traceEnvelopeToNormalizedTranscriptJsonLines } from '../../../src/import/types.js'; const { extractMessages, extractToolCallsFromEvents } = _internal; @@ -14,12 +18,14 @@ describe('pi-cli tool call extraction from events', () => { toolName: 'read', toolCallId: 'tc-1', args: { path: '.agents/skills/csv-analyzer/SKILL.md' }, + timestamp: '2026-06-26T09:00:00.000Z', }, { type: 'tool_execution_end', toolName: 'read', toolCallId: 'tc-1', result: 'skill content here', + timestamp: '2026-06-26T09:00:00.025Z', }, { type: 'message_end' }, { @@ -42,6 +48,8 @@ describe('pi-cli tool call extraction from events', () => { file_path: '.agents/skills/csv-analyzer/SKILL.md', }); expect(toolCalls[0].output).toBe('skill content here'); + expect(toolCalls[0].status).toBe('ok'); + expect(toolCalls[0].durationMs).toBe(25); }); it('should inject event tool calls into messages when content has no tool calls', () => { @@ -76,7 +84,7 @@ describe('pi-cli tool call extraction from events', () => { }); }); - it('should not duplicate tool calls already present in messages', () => { + it('should join event tool results into existing message tool calls without duplicating', () => { const events = [ { type: 'tool_execution_start', @@ -112,6 +120,102 @@ describe('pi-cli tool call extraction from events', () => { expect(messages).toHaveLength(1); expect(messages[0].toolCalls).toHaveLength(1); + expect(messages[0].toolCalls?.[0]).toMatchObject({ + tool: 'Read', + id: 'tc-1', + input: { + path: '.agents/skills/csv-analyzer/SKILL.md', + file_path: '.agents/skills/csv-analyzer/SKILL.md', + }, + output: 'content', + status: 'ok', + }); + }); + + it('emits normalized transcript tool_use.result for Pi event result payloads', () => { + const events = [ + { + type: 'tool_execution_start', + toolName: 'bash', + toolCallId: 'tc-bash', + args: { command: 'cat package.json' }, + timestamp: '2026-06-26T09:00:00.000Z', + }, + { + type: 'tool_execution_end', + toolName: 'bash', + toolCallId: 'tc-bash', + result: { stdout: '{"scripts":{"test":"bun test"}}' }, + timestamp: '2026-06-26T09:00:00.040Z', + }, + { + type: 'agent_end', + messages: [ + { + role: 'assistant', + content: [ + { type: 'text', text: 'Checking scripts.' }, + { + type: 'tool_use', + name: 'bash', + id: 'tc-bash', + input: { command: 'cat package.json' }, + }, + ], + }, + ], + }, + ]; + + const output = extractMessages(events); + const trace = buildTraceFromMessages({ + input: [{ role: 'user', content: 'Check the package script.' }], + output, + finalOutput: 'Checking scripts.', + target: 'pi-cli', + testId: 'pi-tool-result-normalized', + startTime: '2026-06-26T09:00:00.000Z', + endTime: '2026-06-26T09:00:00.040Z', + }); + const result: EvaluationResult = { + timestamp: '2026-06-26T09:00:00.000Z', + testId: 'pi-tool-result-normalized', + suite: 'pi-cli', + score: 1, + assertions: [{ text: 'ok', passed: true }], + target: 'pi-cli', + durationMs: 40, + startTime: '2026-06-26T09:00:00.000Z', + endTime: '2026-06-26T09:00:00.040Z', + input: [{ role: 'user', content: 'Check the package script.' }], + output: 'Checking scripts.', + executionStatus: 'ok', + trace, + }; + const envelope = buildTraceEnvelopeFromEvaluationResult(result, { + source: { + kind: 'pi_session', + provider: 'pi', + format: 'jsonl', + }, + capture: { content: 'full', redactionLevel: 'none', redactedFields: [] }, + }); + + const rows = traceEnvelopeToNormalizedTranscriptJsonLines(envelope); + const assistant = rows.find((row) => row.type === 'assistant'); + const toolUse = assistant?.content.find((block) => block.type === 'tool_use'); + + expect(toolUse).toMatchObject({ + type: 'tool_use', + id: 'tc-bash', + name: 'Bash', + input: { command: 'cat package.json' }, + result: { + status: 'success', + output: { stdout: '{"scripts":{"test":"bun test"}}' }, + duration_ms: 40, + }, + }); }); it('should handle multiple tool execution events', () => {