From 8b15647e7acd19b43239efc3f518a84dce668c7d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 26 Jun 2026 11:00:55 +0200 Subject: [PATCH] refactor(results): remove public trace artifact surface --- .agents/conventions.md | 4 +- AGENTS.md | 2 +- ROADMAP.md | 2 +- apps/cli/src/commands/eval/artifact-writer.ts | 1 - apps/cli/src/commands/results/combine-run.ts | 44 +++++---- .../src/commands/results/projection-bundle.ts | 14 +-- .../commands/results/serve-file-tree.test.ts | 36 ++++--- apps/cli/src/commands/results/serve.ts | 11 ++- apps/cli/src/commands/results/validate.ts | 20 ++++ .../commands/eval/artifact-writer.test.ts | 23 ++--- .../cli/test/commands/results/combine.test.ts | 20 +--- apps/cli/test/commands/results/export.test.ts | 5 +- .../results/remote-auto-export.test.ts | 18 +--- apps/cli/test/commands/results/serve.test.ts | 2 +- .../test/commands/results/validate.test.ts | 53 ++++++++++ .../docs/docs/evaluation/running-evals.mdx | 41 ++++---- .../src/content/docs/docs/tools/import.mdx | 12 +-- .../src/content/docs/docs/tools/prepare.mdx | 2 +- .../src/content/docs/docs/tools/results.mdx | 21 ++-- ...-run-adapter-over-agentv-result-bundles.md | 19 ++-- ...normalized-transcript-artifact-contract.md | 4 +- packages/core/src/evaluation/metrics.ts | 16 +-- .../evaluation/result-artifact-contract.ts | 5 +- packages/core/src/evaluation/results-repo.ts | 38 ++++--- packages/core/src/evaluation/run-artifacts.ts | 98 ------------------- .../core/test/evaluation/results-repo.test.ts | 49 ++++++---- 26 files changed, 260 insertions(+), 300 deletions(-) diff --git a/.agents/conventions.md b/.agents/conventions.md index 6e1d0deae..c0d1e3a11 100644 --- a/.agents/conventions.md +++ b/.agents/conventions.md @@ -126,9 +126,9 @@ If you spot a camelCase key already on disk or in a response, treat it as a bug ## Result Artifact Pointers -`artifact_pointers` are for offloading large detached payload bytes from the results metadata/control plane. 
They describe where payloads such as trace or transcript files live when a run is projected to `agentv/artifacts/v1` or a future object store, including `key`, `object_version`, `sha256`, `size`, `media_type`, and `schema_version`. +`artifact_pointers` are for offloading large detached payload bytes from the results metadata/control plane. They describe where payloads such as transcript files live when a run is projected to `agentv/artifacts/v1` or a future object store, including `key`, `object_version`, `sha256`, `size`, `media_type`, and `schema_version`. -Do not add an `artifact_pointers.*` entry just because a new per-case artifact exists. Normal sidecars that stay in the run tree should be discoverable through explicit path fields on `index.jsonl`, manifests, or trace envelope artifacts, for example `metrics_path` for `outputs/metrics.json`. +Do not add an `artifact_pointers.*` entry just because a new per-case artifact exists. Normal sidecars that stay in the run tree should be discoverable through explicit path fields on `index.jsonl` or manifests, for example `metrics_path` for `outputs/metrics.json`. Before adding a new pointer family, verify that the artifact is large enough or detached enough to benefit from offloading and that published result repos should avoid carrying those payload bytes on the primary results branch. diff --git a/AGENTS.md b/AGENTS.md index 963e17c75..c23d93d75 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -48,7 +48,7 @@ Read the full rationale and examples in [.agents/product-boundary.md](.agents/pr - When dogfood or review reveals a durable workflow lesson, capture it in this guide or the relevant `.agents/*.md` guide before merge; do not leave durable agent instructions only in PR comments, Bead comments, or private evidence. Use `docs/solutions/` for fuller reusable writeups. - Wire formats are `snake_case`; internal TypeScript is `camelCase`. Translate only at the boundary. 
- In AgentV, a `project` holds runs, traces, and experiments; a `benchmark` is a curated eval suite. Do not collapse those terms. -- `artifact_pointers` are an offload indirection for large detached payload bytes, such as trace and transcript artifacts. Do not use them as the discovery path for ordinary per-case sidecars; expose those with explicit index/manifest path fields such as `metrics_path`. +- `artifact_pointers` are an offload indirection for large detached payload bytes, such as transcript artifacts. Do not use them as the discovery path for ordinary per-case sidecars; expose those with explicit index/manifest path fields such as `metrics_path`. ## Repo Map diff --git a/ROADMAP.md b/ROADMAP.md index fb69a1c16..e8d9f1b49 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -15,7 +15,7 @@ This roadmap translates [STRATEGY.md](STRATEGY.md) into the next few product pha ## Phase 1: Finish the artifact and local inspection foundation -- Keep the canonical handoff surface centered on completed run bundles, `index.jsonl`, grading/timing artifacts, and `outputs/trace.json` sidecars. +- Keep the canonical handoff surface centered on completed run bundles, `index.jsonl`, grading/timing/metrics artifacts, normalized transcripts, and optional `external_trace` link metadata. - Finish the vendor-neutral local export seams that let completed runs be re-read, compared, exported, and attached to non-Phoenix adapters without vendor-specific logic in core. - Keep OTLP/OpenInference mapping generic and reusable before building backend-specific upload or import paths. 
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 26dc9471a..0979f36a9 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -97,7 +97,6 @@ export function buildIndexArtifactEntry( summaryPath?: string; outputPath?: string; answerPath?: string; - tracePath?: string; transcriptPath?: string; transcriptRawPath?: string; metricsPath?: string; diff --git a/apps/cli/src/commands/results/combine-run.ts b/apps/cli/src/commands/results/combine-run.ts index 2e95aec07..fde67f12f 100644 --- a/apps/cli/src/commands/results/combine-run.ts +++ b/apps/cli/src/commands/results/combine-run.ts @@ -15,6 +15,7 @@ import { existsSync, mkdirSync, readFileSync, + rmSync, statSync, writeFileSync, } from 'node:fs'; @@ -367,7 +368,6 @@ const MANIFEST_PATH_FIELDS = [ 'input_path', 'output_path', 'response_path', - 'trace_path', 'transcript_path', 'transcript_raw_path', 'metrics_path', @@ -380,7 +380,6 @@ const MANIFEST_PATH_FIELDS = [ ] as const; const POINTER_FAMILIES = { - trace: 'traces', transcript: 'transcripts', } as const; @@ -467,15 +466,29 @@ function rewriteArtifactPointers( return undefined; } - return { - trace: rewriteArtifactPointer('trace', pointers.trace, sourceBaseDir, outputDir, sourceIndex), - transcript: rewriteTranscriptArtifactPointer( - pointers.transcript, - sourceBaseDir, - outputDir, - sourceIndex, - ), - }; + const transcript = rewriteTranscriptArtifactPointer( + pointers.transcript, + sourceBaseDir, + outputDir, + sourceIndex, + ); + return transcript ? 
{ transcript } : undefined; +} + +function removeCopiedDeprecatedTraceArtifact( + row: SelectedRow, + outputDir: string, + sourceBaseDir: string, +): void { + const tracePath = row.record.trace_path; + if (!tracePath || !isSafeRelativeArtifactPath(tracePath)) { + return; + } + if (!existsSync(path.join(sourceBaseDir, tracePath))) { + return; + } + const copiedTracePath = path.join(outputDir, `sources/source-${row.source.index + 1}`, tracePath); + rmSync(copiedTracePath, { force: true }); } function rewriteAndCopyRecord( @@ -501,13 +514,8 @@ function rewriteAndCopyRecord( row.source.index, ); rewritten.artifact_pointers = artifactPointers; - if ( - row.record.trace_path && - rewritten.trace_path === row.record.trace_path && - artifactPointers?.trace?.path - ) { - rewritten.trace_path = artifactPointers.trace.path; - } + removeCopiedDeprecatedTraceArtifact(row, outputDir, sourceBaseDir); + rewritten.trace_path = undefined; if ( row.record.transcript_path && rewritten.transcript_path === row.record.transcript_path && diff --git a/apps/cli/src/commands/results/projection-bundle.ts b/apps/cli/src/commands/results/projection-bundle.ts index a25d45d67..56fe98a5e 100644 --- a/apps/cli/src/commands/results/projection-bundle.ts +++ b/apps/cli/src/commands/results/projection-bundle.ts @@ -62,7 +62,6 @@ export interface ProjectionBundleEntry { readonly trace_id: string; readonly root_span_id: string; readonly span_count: number; - readonly envelope_ref?: string; }; readonly trace_envelope: TraceEnvelopeWire; readonly feedback: { @@ -101,7 +100,7 @@ export type ProjectionBundleArtifactRefs = Partial< | 'targets_path' | 'files_path' | 'graders_path' - > & { readonly trace_path: string } + > > & { readonly status: 'planned_export' | 'emitted'; }; @@ -147,13 +146,6 @@ function shortHash(parts: readonly string[], length = 20): string { return createHash('sha256').update(parts.join('\n')).digest('hex').slice(0, length); } -function tracePathFor(indexEntry: IndexArtifactEntry): 
string | undefined { - return ( - indexEntry.trace_path ?? - (indexEntry.artifact_dir ? path.posix.join(indexEntry.artifact_dir, 'trace.json') : undefined) - ); -} - function artifactRefs( indexEntry: IndexArtifactEntry, options: { @@ -181,7 +173,6 @@ function artifactRefs( transcript_path: indexEntry.transcript_path, transcript_raw_path: indexEntry.transcript_raw_path, metrics_path: indexEntry.metrics_path, - trace_path: tracePathFor(indexEntry), task_dir: indexEntry.task_dir, eval_path: indexEntry.eval_path, targets_path: indexEntry.targets_path, @@ -274,13 +265,11 @@ function buildEntry( ): ProjectionBundleEntry { const includeRawContent = options.includeRawContent ?? false; const sourcePath = toPortablePath(options.sourceFile, options.cwd); - const plannedIndexEntry = buildResultIndexArtifact(result); const envelope = buildTraceEnvelopeFromEvaluationResult(result, { evalPath: sourcePath, runId: options.runId, source: { kind: 'agentv_run', path: sourcePath, format: 'agentv_result' }, artifacts: { - trace_path: tracePathFor(indexRecord ?? plannedIndexEntry), answer_path: result.output.length > 0 ? 
'outputs/answer.md' : undefined, }, duplicatePolicy: options.duplicatePolicy, @@ -334,7 +323,6 @@ function buildEntry( trace_id: envelopeWire.trace.trace_id, root_span_id: envelopeWire.trace.root_span_id, span_count: envelopeWire.trace.spans.length, - envelope_ref: refs.trace_path, }), trace_envelope: envelopeWire, feedback, diff --git a/apps/cli/src/commands/results/serve-file-tree.test.ts b/apps/cli/src/commands/results/serve-file-tree.test.ts index be1dc9c1b..123c23818 100644 --- a/apps/cli/src/commands/results/serve-file-tree.test.ts +++ b/apps/cli/src/commands/results/serve-file-tree.test.ts @@ -26,13 +26,13 @@ function localTreeRootedAtTestDir(prefix: string): FileNode[] { ]; } -function gitTraceEntry(prefix: string): ArtifactCatalogEntry { +function gitTranscriptEntry(prefix: string): ArtifactCatalogEntry { return { - displayPath: `${prefix}/outputs/trace.json`, - kind: 'trace', + displayPath: `${prefix}/transcript.jsonl`, + kind: 'transcript', storage: 'git', ref: 'agentv/artifacts/v1', - key: `runs/default/2026-06-22T01-12-44-924Z/${prefix}/outputs/trace.json`, + key: `runs/default/2026-06-22T01-12-44-924Z/${prefix}/transcript.jsonl`, }; } @@ -45,22 +45,18 @@ describe('overlayCatalogFileNodes', () => { it('overlays git artifacts into the existing folder instead of a duplicate subtree', () => { const files = localTreeRootedAtTestDir(prefix); - overlayCatalogFileNodes(files, [gitTraceEntry(prefix)], prefix); + overlayCatalogFileNodes(files, [gitTranscriptEntry(prefix)], prefix); // No duplicate `wtg-academy-n1-test` root node was created. expect(findByName(files, 'wtg-academy-n1-test')).toBeUndefined(); - // trace.json merged into the existing top-level `outputs` folder... - const outputs = findByName(files, 'outputs'); - expect(outputs?.type).toBe('dir'); - const trace = findByName(outputs?.children ?? 
[], 'trace.json'); - expect(trace).toBeDefined(); - // ...alongside the local answer.md, and with its full manifest-relative path - // preserved for content reads. - expect(findByName(outputs?.children ?? [], 'answer.md')).toBeDefined(); - expect(trace?.path).toBe(`${prefix}/outputs/trace.json`); - expect(trace?.storage).toBe('git'); - expect(trace?.ref).toBe('agentv/artifacts/v1'); + // transcript.jsonl merged into the existing top-level test artifact view + // with its full manifest-relative path preserved for content reads. + const transcript = findByName(files, 'transcript.jsonl'); + expect(transcript).toBeDefined(); + expect(transcript?.path).toBe(`${prefix}/transcript.jsonl`); + expect(transcript?.storage).toBe('git'); + expect(transcript?.ref).toBe('agentv/artifacts/v1'); }); it('does not re-add local files already present in the tree', () => { @@ -80,8 +76,8 @@ describe('overlayCatalogFileNodes', () => { it('falls back to full-path nesting when no root prefix applies', () => { const files: FileNode[] = []; const entry: ArtifactCatalogEntry = { - displayPath: 'outputs/trace.json', - kind: 'trace', + displayPath: 'outputs/transcript.jsonl', + kind: 'transcript', storage: 'git', ref: 'agentv/artifacts/v1', }; @@ -89,6 +85,8 @@ describe('overlayCatalogFileNodes', () => { const outputs = findByName(files, 'outputs'); expect(outputs?.type).toBe('dir'); - expect(findByName(outputs?.children ?? [], 'trace.json')?.path).toBe('outputs/trace.json'); + expect(findByName(outputs?.children ?? 
[], 'transcript.jsonl')?.path).toBe( + 'outputs/transcript.jsonl', + ); }); }); diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index d8502a8ed..0e7447208 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -514,11 +514,16 @@ function resolveRecordArtifactPointer( record: ResultManifestRecord, kind: 'transcript' | 'answer' | 'trace', ): ResolvedArtifactPointer { + const legacyArtifactPointers = record.artifact_pointers as + | (ResultManifestRecord['artifact_pointers'] & { + readonly trace?: NonNullable<ResultManifestRecord['artifact_pointers']>['transcript']; + }) + | undefined; const pointer = kind === 'transcript' ? record.artifact_pointers?.transcript : kind === 'trace' - ? record.artifact_pointers?.trace + ? legacyArtifactPointers?.trace : undefined; const pointerPath = artifactPointerPath(pointer); const description = artifactPointerDescription(pointer); @@ -1059,8 +1064,8 @@ function traceSessionArtifactResponse( function missingTraceMessage(): string { return [ - 'This result does not include canonical trace.json metadata.', - 'Dashboard trace sessions require an agentv.trace.v1 sidecar artifact.', + 'This result does not include legacy trace artifact metadata.', + 'Dashboard transcript inspection uses transcript.jsonl for current run bundles.', ].join(' '); } diff --git a/apps/cli/src/commands/results/validate.ts b/apps/cli/src/commands/results/validate.ts index cbd2f8679..49cb206be 100644 --- a/apps/cli/src/commands/results/validate.ts +++ b/apps/cli/src/commands/results/validate.ts @@ -149,6 +149,26 @@ function checkIndexJsonl(runDir: string): { diagnostics: Diagnostic[]; entries: }); } + if (typeof entry.trace_path === 'string') { + diagnostics.push({ + severity: 'error', + message: `index.jsonl line ${i + 1} (${entry.test_id ?? 
'?'}): trace_path is no longer supported; use transcript_path and metrics_path`, + }); + } + + const artifactPointers = entry.artifact_pointers; + if ( + artifactPointers && + typeof artifactPointers === 'object' && + !Array.isArray(artifactPointers) && + Object.hasOwn(artifactPointers, 'trace') + ) { + diagnostics.push({ + severity: 'error', + message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): artifact_pointers.trace is no longer supported`, + }); + } + if (!entry.scores || !Array.isArray(entry.scores) || entry.scores.length === 0) { diagnostics.push({ severity: 'warning', diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 6c9b67c1c..39997d310 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -1,27 +1,17 @@ import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; -import { createHash } from 'node:crypto'; import { mkdir, readFile, readdir, rm, writeFile } from 'node:fs/promises'; import path from 'node:path'; import { - AGENTV_RESULTS_ARTIFACTS_REF, CANONICAL_METRICS_ARTIFACT_PATH, - CANONICAL_TRACE_ARTIFACT_PATH, CANONICAL_TRANSCRIPT_ARTIFACT_PATH, - EXECUTION_TRACE_SCHEMA_VERSION, type EvalTest, type EvaluationResult, type GraderResult, METRICS_SCHEMA_VERSION, MetricsArtifactWireSchema, - TRACE_JSON_MEDIA_TYPE, - TRANSCRIPT_JSONL_MEDIA_TYPE, - TRANSCRIPT_SCHEMA_VERSION, - TraceEnvelopeWireSchema, buildTraceFromMessages, - fromTraceEnvelopeWire, parseYamlValue, - traceEnvelopeToTranscriptJsonLines, } from '@agentv/core'; import { @@ -85,10 +75,6 @@ function makeEvaluatorResult(overrides: Partial = {}): GraderResul } as GraderResult; } -function sha256Hex(content: Buffer): string { - return createHash('sha256').update(content).digest('hex'); -} - // --------------------------------------------------------------------------- // Grading artifact // 
--------------------------------------------------------------------------- @@ -1250,6 +1236,9 @@ describe('writeArtifactsFromResults', () => { await expect( readFile(path.join(testDir, 'transcript-case', 'transcript.json'), 'utf8'), ).rejects.toThrow(); + await expect( + readFile(path.join(testDir, 'transcript-case', 'run-1', 'trace.json'), 'utf8'), + ).rejects.toThrow(); const indexLine = JSON.parse( (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), @@ -1366,6 +1355,12 @@ describe('writeArtifactsFromResults', () => { ); expect(summary.schema_version).toBe(METRICS_SCHEMA_VERSION); + expect(summary.trace).toMatchObject({ + schema_version: 'agentv.trace.v1', + trace_id: expect.any(String), + root_span_id: expect.any(String), + }); + expect(summary.trace).not.toHaveProperty('path'); expect(summary.source_artifacts).toMatchObject({ transcript_path: 'transcript.jsonl', grading_path: 'grading.json', diff --git a/apps/cli/test/commands/results/combine.test.ts b/apps/cli/test/commands/results/combine.test.ts index bed14923f..bfc2155f7 100644 --- a/apps/cli/test/commands/results/combine.test.ts +++ b/apps/cli/test/commands/results/combine.test.ts @@ -193,17 +193,6 @@ describe('results combine', () => { metrics_path: 'demo/test-a/metrics.json', raw_provider_log_path: 'demo/test-a/provider.log', artifact_pointers: { - trace: { - ref: 'agentv/artifacts/v1', - key: 'traces/demo/test-a/trace.json', - object_version: 'sha256:trace', - path: 'demo/test-a/trace.json', - sha256: 'trace', - size: 18, - schema_version: 'agentv.trace.v1', - media_type: 'application/vnd.agentv.trace.v1+json', - family: 'traces', - }, transcript: { ref: 'agentv/artifacts/v1', key: 'transcripts/demo/test-a/transcript.jsonl', @@ -263,23 +252,20 @@ describe('results combine', () => { const [record] = readIndex(combined.manifestPath); expect(record.artifact_dir).toBe('sources/source-1/demo/test-a'); - expect(record.trace_path).toBe('sources/source-1/demo/test-a/trace.json'); + 
expect(record).not.toHaveProperty('trace_path'); expect(record.transcript_path).toBe('sources/source-1/demo/test-a/transcript.jsonl'); expect(record.metrics_path).toBe('sources/source-1/demo/test-a/metrics.json'); expect(record.raw_provider_log_path).toBe('sources/source-1/demo/test-a/provider.log'); expect(record.artifact_pointers).toMatchObject({ - trace: { - key: 'traces/sources/source-1/demo/test-a/trace.json', - path: 'sources/source-1/demo/test-a/trace.json', - }, transcript: { key: 'transcripts/sources/source-1/demo/test-a/transcript.jsonl', path: 'sources/source-1/demo/test-a/transcript.jsonl', }, }); + expect(record.artifact_pointers).not.toHaveProperty('trace'); expect(record.artifact_pointers).not.toHaveProperty('metrics'); expect(existsSync(path.join(combined.runDir, 'sources/source-1/demo/test-a/trace.json'))).toBe( - true, + false, ); expect( existsSync(path.join(combined.runDir, 'sources/source-1/demo/test-a/transcript.jsonl')), diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index b39734d28..275c1d5be 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -362,11 +362,12 @@ describe('results export', () => { answer_path: 'privacy/test-private/run-1/outputs/answer.md', transcript_path: 'privacy/test-private/run-1/transcript.jsonl', transcript_raw_path: 'privacy/test-private/run-1/transcript-raw.jsonl', - trace_path: 'privacy/test-private/trace.json', }); + expect(bundle.entries[0].artifact_refs).not.toHaveProperty('trace_path'); expect(bundle.entries[0].artifact_refs).not.toHaveProperty('input_path'); - expect(bundle.entries[0].trace.envelope_ref).toBe('privacy/test-private/trace.json'); + expect(bundle.entries[0].trace).not.toHaveProperty('envelope_ref'); expect(bundle.entries[0].trace_envelope.artifacts).toBeDefined(); + expect(bundle.entries[0].trace_envelope.artifacts).not.toHaveProperty('trace_path'); 
expect(bundle.entries[0].feedback.grading_path).toBe('privacy/test-private/run-1/grading.json'); expect(bundle.entries[0].raw_content).toBeDefined(); expect(bundle.entries[0].feedback.scores?.[0]).toHaveProperty('evidence'); diff --git a/apps/cli/test/commands/results/remote-auto-export.test.ts b/apps/cli/test/commands/results/remote-auto-export.test.ts index cfbfadfe1..c3621b125 100644 --- a/apps/cli/test/commands/results/remote-auto-export.test.ts +++ b/apps/cli/test/commands/results/remote-auto-export.test.ts @@ -81,13 +81,10 @@ function writeRunArtifactsWithPointers(projectDir: string): string { const runDir = path.join(projectDir, '.agentv', 'results', 'default', 'run-002'); const artifactDir = path.join(runDir, 'alpha'); mkdirSync(artifactDir, { recursive: true }); - const traceContent = Buffer.from('{"schema_version":"agentv.trace.v1","spans":[]}\n'); const transcriptContent = Buffer.from( '{"schema_version":"agentv.transcript.v1","role":"assistant","content":"ok"}\n', ); - writeFileSync(path.join(artifactDir, 'trace.json'), traceContent); writeFileSync(path.join(artifactDir, 'transcript.jsonl'), transcriptContent); - const traceSha = sha256Hex(traceContent); const transcriptSha = sha256Hex(transcriptContent); writeFileSync( path.join(runDir, 'index.jsonl'), @@ -95,17 +92,6 @@ function writeRunArtifactsWithPointers(projectDir: string): string { test_id: 'alpha', score: 1, artifact_pointers: { - trace: { - ref: AGENTV_RESULTS_ARTIFACTS_REF, - key: 'traces/alpha/trace.json', - object_version: `sha256:${traceSha}`, - path: 'alpha/trace.json', - sha256: traceSha, - size: traceContent.byteLength, - schema_version: 'agentv.trace.v1', - media_type: 'application/vnd.agentv.trace.v1+json', - family: 'traces', - }, transcript: { ref: AGENTV_RESULTS_ARTIFACTS_REF, key: 'transcripts/alpha/transcript.jsonl', @@ -236,7 +222,7 @@ describe('maybeAutoExportRunArtifacts', () => { rootDir, ), ); - 
expect(index.artifact_pointers.trace.key).toBe('runs/default/run-002/alpha/trace.json'); + expect(index.artifact_pointers).not.toHaveProperty('trace'); expect(index.artifact_pointers.transcript.key).toBe( 'runs/default/run-002/alpha/transcript.jsonl', ); @@ -244,7 +230,7 @@ describe('maybeAutoExportRunArtifacts', () => { `git --git-dir "${remoteDir}" ls-tree -r --name-only ${AGENTV_RESULTS_ARTIFACTS_REF}`, rootDir, ); - expect(artifactTree).toContain('runs/default/run-002/alpha/trace.json'); + expect(artifactTree).not.toContain('runs/default/run-002/alpha/trace.json'); expect(artifactTree).toContain('runs/default/run-002/alpha/transcript.jsonl'); }, 20_000); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index ca5f6943b..95fd1ab48 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -3148,7 +3148,7 @@ describe('serve app', () => { }; expect(traceData.schema_version).toBe('agentv.dashboard.trace_artifact.v1'); expect(traceData.status).toBe('missing'); - expect(traceData.message).toContain('trace.json'); + expect(traceData.message).toContain('transcript.jsonl'); const detailRes = await app.request(`/api/runs/${encodeURIComponent(runId)}`); expect(detailRes.status).toBe(200); diff --git a/apps/cli/test/commands/results/validate.test.ts b/apps/cli/test/commands/results/validate.test.ts index 3836b90bc..4c68e016f 100644 --- a/apps/cli/test/commands/results/validate.test.ts +++ b/apps/cli/test/commands/results/validate.test.ts @@ -64,4 +64,57 @@ describe('results validate', () => { rmSync(tempDir, { recursive: true, force: true }); } }); + + it('rejects public trace artifact fields in index.jsonl', () => { + const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-validate-test-')); + + try { + const runDir = path.join( + tempDir, + '.agentv', + 'results', + 'with-skills', + '2026-03-27T12-42-24-429Z', + ); + mkdirSync(runDir, { recursive: true }); + 
writeFileSync( + path.join(runDir, 'index.jsonl'), + `${JSON.stringify({ + timestamp: '2026-03-27T12:42:24.429Z', + test_id: 'test-greeting', + score: 1, + target: 'gpt-4o', + scores: [{ name: 'quality', type: 'llm', score: 1, verdict: 'pass' }], + execution_status: 'ok', + summary_path: 'test-greeting/summary.json', + trace_path: 'test-greeting/run-1/trace.json', + artifact_pointers: { + trace: { + ref: 'agentv/artifacts/v1', + key: 'traces/test-greeting/run-1/trace.json', + path: 'test-greeting/run-1/trace.json', + }, + }, + })}\n`, + ); + mkdirSync(path.join(runDir, 'test-greeting'), { recursive: true }); + writeFileSync(path.join(runDir, 'test-greeting', 'summary.json'), '{}\n'); + writeFileSync(path.join(runDir, 'summary.json'), '{}\n'); + + const { diagnostics } = validateRunDirectory(runDir); + + expect(diagnostics).toContainEqual({ + severity: 'error', + message: + 'index.jsonl line 1 (test-greeting): trace_path is no longer supported; use transcript_path and metrics_path', + }); + expect(diagnostics).toContainEqual({ + severity: 'error', + message: + 'index.jsonl line 1 (test-greeting): artifact_pointers.trace is no longer supported', + }); + } finally { + rmSync(tempDir, { recursive: true, force: true }); + } + }); }); diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index 2596eedd8..80e624e9a 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -441,20 +441,20 @@ agentv eval evals/my-eval.yaml --transcript .agentv/transcripts/claude-.json See the [Import tool docs](/docs/tools/import/) for all providers and options. -## Transcript And Trace Artifacts +## Transcript And Result Artifacts Each result row's `artifact_dir` is a case-local folder under the timestamped -run bundle. 
It can include `trace.json`, `transcript.jsonl`, `provider.log`, -`grading.json`, `timing.json`, `metrics.json`, and generated outputs under -`outputs/`. The run root does not contain a mixed transcript artifact; use each -index row's `transcript_path` to find the per-result transcript. +run bundle. It can include `transcript.jsonl`, `transcript-raw.jsonl`, +`provider.log`, `grading.json`, `timing.json`, `metrics.json`, and generated +outputs under `outputs/`. The run root does not contain a mixed transcript +artifact; use each index row's `transcript_path` to find the per-result +transcript. Rows also include `artifact_pointers` for AgentV-owned artifact storage. Pointer -entries such as `artifact_pointers.trace` and `artifact_pointers.transcript` -carry the storage `ref`, artifact `key`, canonical run-relative `path`, -`object_version`, `sha256`, `size`, `schema_version`, and `media_type` so -viewers and exports can migrate from git refs to object storage without changing -the run record contract. +entries such as `artifact_pointers.transcript` carry the storage `ref`, artifact +`key`, canonical run-relative `path`, `object_version`, `sha256`, `size`, +`schema_version`, and `media_type` so viewers and exports can migrate from git +refs to object storage without changing the run record contract. When automatic remote publishing sees pointers whose `ref` is `agentv/artifacts/v1`, it also pushes those payload bytes to the @@ -462,17 +462,16 @@ When automatic remote publishing sees pointers whose `ref` is `runs//` and rewrites the published pointer `key` to that backend object key. The configured results branch is the metadata/control plane for `index.jsonl`, `summary.json`, tags, and pointers; it does not -duplicate canonical trace/transcript payload bodies when those rows name -`agentv/artifacts/v1`. 
Local pre-publish run workspaces can still contain the -files beside the manifest, and Dashboard resolves the published pointers lazily -when a transcript or trace view requests the payload. AgentV keeps this explicit -pointer/backend contract instead of using Git LFS as the core abstraction so -S3, B2, or other object stores can use the same `key`, `object_version`, -`sha256`, `size`, `media_type`, and `schema_version` fields later. - -`trace.json` is the full-fidelity `agentv.trace.v1` sidecar. -It stores the canonical span graph, source metadata, capture/redaction policy, -conversion warnings, score provenance, and opaque evidence references. +duplicate canonical transcript payload bodies when those rows name +`agentv/artifacts/v1`. Dashboard resolves the published pointers lazily when a +transcript view requests the payload. AgentV keeps this explicit pointer/backend +contract instead of using Git LFS as the core abstraction so S3, B2, or other +object stores can use the same `key`, `object_version`, `sha256`, `size`, +`media_type`, and `schema_version` fields later. + +AgentV does not persist a public `trace.json` sidecar in run bundles. Use +`external_trace` metadata for link-out correlation when another observability +system already owns spans. `transcript.jsonl` is the canonical AgentV transcript/timeline artifact. It uses provider-neutral `agentv.transcript.v1` rows with stable top-level fields diff --git a/apps/web/src/content/docs/docs/tools/import.mdx b/apps/web/src/content/docs/docs/tools/import.mdx index ad23a6303..aa4739722 100644 --- a/apps/web/src/content/docs/docs/tools/import.mdx +++ b/apps/web/src/content/docs/docs/tools/import.mdx @@ -174,12 +174,12 @@ row keys. Rows without `schema_version`, `capture`, or `trace` from older AgentV transcript exports remain replayable. New eval run artifacts write the v1 shape. 
-For eval run artifacts, `transcript.jsonl` is derived from -`trace.json`; it is a portable message/event projection, not a second -canonical trace source or a provider-native session dump. Provider-native -session or stream logs, when captured during an eval run, are separate raw -evidence artifacts referenced by `raw_provider_log_path`; Agent Skills import, -convert, transpile, and run paths do not require them. +For eval run artifacts, `transcript.jsonl` is the portable message/event +projection. AgentV does not persist a public `trace.json` run sidecar, and the +transcript is not a provider-native session dump. Provider-native session or +stream logs, when captured during an eval run, are separate raw evidence +artifacts referenced by `raw_provider_log_path`; Agent Skills import, convert, +transpile, and run paths do not require them. ## What Gets Parsed diff --git a/apps/web/src/content/docs/docs/tools/prepare.mdx b/apps/web/src/content/docs/docs/tools/prepare.mdx index 47defa41e..c93470fd1 100644 --- a/apps/web/src/content/docs/docs/tools/prepare.mdx +++ b/apps/web/src/content/docs/docs/tools/prepare.mdx @@ -61,7 +61,7 @@ Supported `--trace` inputs: | Format | Typical source | |--------|----------------| -| `agentv.trace.v1` JSON or JSONL | `trace.json` from an AgentV run or replay/export workflow | +| `agentv.trace.v1` JSON or JSONL | Explicit trace replay/export files | | AgentV transcript JSONL | `agentv import claude`, `agentv import codex`, or `agentv import copilot` output | Single-record trace files are accepted directly. Multi-record files are matched by `test_id` and target. The selected trace is projected into AgentV's normal `trace` and `messages` grader context, so `tool-trajectory`, execution-metrics, and code graders receive the same shape they see during eval runs. 
diff --git a/apps/web/src/content/docs/docs/tools/results.mdx b/apps/web/src/content/docs/docs/tools/results.mdx index ccc27f89c..544f20f9e 100644 --- a/apps/web/src/content/docs/docs/tools/results.mdx +++ b/apps/web/src/content/docs/docs/tools/results.mdx @@ -127,11 +127,13 @@ Every case uses aggregate `summary.json`, then stores attempt details under `transcript-raw.jsonl` preserves native provider or harness transcript bytes when they are available, while `transcript.jsonl` is the normalized -conversation transcript with joined `tool_use.result` blocks. Full trace detail -stays in `trace.json` (`agentv.trace.v1`) when emitted. `summary.json` remains -the run-level aggregate summary, and `index.jsonl` carries lightweight explicit -paths such as `transcript_path`, `transcript_raw_path`, and `metrics_path` plus -artifact pointers only when detached payload publishing needs them. +conversation transcript with joined `tool_use.result` blocks. AgentV does not +persist a public `trace.json` sidecar in run bundles; external observability +systems can be linked through safe `external_trace` metadata when available. +`summary.json` remains the run-level aggregate summary, and `index.jsonl` +carries lightweight explicit paths such as `transcript_path`, +`transcript_raw_path`, and `metrics_path` plus artifact pointers only when +detached payload publishing needs them. Duration, token, and cost usage remains in `timing.json`, including source labels such as `provider_reported`, `token_estimated`, `aggregate`, or `unavailable`. 
@@ -164,11 +166,11 @@ Agent Skills eval artifacts map into AgentV like this: |----------------------|--------------|-------------------| | Authored `evals/evals.json` cases | AgentV eval cases and task bundle paths | Eval source plus optional `task_dir`, `eval_path`, `targets_path`, `files_path`, and `graders_path` in `index.jsonl` | | Per-case answer | Generated target output artifact | `run-N/outputs/answer.md` | -| Per-attempt sidecars | Trace, normalized transcript, metrics, and raw provider evidence | `run-N/transcript.jsonl`, `run-N/transcript-raw.jsonl`, `run-N/metrics.json`, `provider.log` when present | +| Per-attempt sidecars | Normalized transcript, metrics, and raw provider evidence | `run-N/transcript.jsonl`, `run-N/transcript-raw.jsonl`, `run-N/metrics.json`, `provider.log` when present | | Per-attempt `timing.json` | Duration, token totals, cost, and usage source labels | `run-N/timing.json` | | Per-attempt `grading.json` | Assertions, graders, execution metrics, workspace changes | `run-N/grading.json`; summary fields can reference the same trace/result facts | | Iteration-level `summary.json` | Pass rate, time, tokens, tool calls, cost aggregates | Run-level `summary.json` | -| Transcript/log outlier analysis | Normalized transcript, raw evidence, and canonical trace | `transcript.jsonl` for portable review; `transcript-raw.jsonl` for native evidence; `trace.json` for full detail | +| Transcript/log outlier analysis | Normalized transcript, raw evidence, metrics, and optional external trace link | `transcript.jsonl` for portable review; `transcript-raw.jsonl` for native evidence; `metrics.json` for behavior summaries; `external_trace` for link-out correlation | | Aggregate pass rate/time/tokens/delta | Run summaries and comparison tooling | `summary.json`, result comparisons, and projection bundles | ### Vendor-neutral projection bundle @@ -206,9 +208,8 @@ export `index.jsonl` and use `artifact_refs.status: "emitted"`. 
Raw prompt text, final output, and tool arguments/results are excluded by default, and raw-bearing artifact refs such as `grading_path`, `input_path`, -`answer_path`, `transcript_path`, and `trace_path` are omitted from -metadata-only bundles. To include raw payloads and raw-bearing refs in the -bundle, opt in explicitly: +`answer_path`, and `transcript_path` are omitted from metadata-only bundles. To +include raw payloads and raw-bearing refs in the bundle, opt in explicitly: ```bash agentv results export --dry-run --include-raw-content diff --git a/docs/adr/0003-keep-opik-export-as-post-run-adapter-over-agentv-result-bundles.md b/docs/adr/0003-keep-opik-export-as-post-run-adapter-over-agentv-result-bundles.md index 6fee23bd8..c2370b02b 100644 --- a/docs/adr/0003-keep-opik-export-as-post-run-adapter-over-agentv-result-bundles.md +++ b/docs/adr/0003-keep-opik-export-as-post-run-adapter-over-agentv-result-bundles.md @@ -4,7 +4,12 @@ Date: 2026-06-18 ## Status -Proposed +Superseded by the 2026-06-20 Phoenix/observability boundary and ADR 0008. + +AgentV no longer persists or advertises public `outputs/trace.json` / +`trace_path` run-bundle sidecars. The durable public run artifact contract is +`index.jsonl`, grading/timing/metrics artifacts, normalized transcript +sidecars, answer outputs, and optional `external_trace` link metadata. ## Context @@ -14,7 +19,6 @@ AgentV already has the post-run artifacts an Opik exporter should consume: - per-test grading, timing, answer, and transcript artifacts from `packages/core/src/evaluation/run-artifacts.ts`, with the CLI wrapper in `apps/cli/src/commands/eval/artifact-writer.ts`; -- canonical trace sidecars in `outputs/trace.json` using `agentv.trace.v1`; - in-memory `EvaluationResult` and `TraceEnvelope` read models in `packages/core/src/evaluation/types.ts` and `packages/core/src/evaluation/trace-envelope.ts`. That is the correct product boundary. AgentV remains the runner, gate, and artifact source of truth. 
Opik should be a projection over completed AgentV runs, not the runtime owner of AgentV execution. @@ -24,10 +28,10 @@ Two existing constraints matter: 1. `av-vwa.16.4` is the planned vendor-neutral projection bundle that external adapters should consume. 2. `av-vwa.16.2` is the planned stable external identity and duplicate-policy work. -There is also a privacy mismatch in current artifact generation: the canonical -trace envelope builder defaults to metadata-only capture, but -`run-artifacts.ts` currently overrides that and writes `outputs/trace.json` -sidecars with full content capture. +There was also a privacy mismatch in an older artifact design: the canonical +trace envelope builder defaulted to metadata-only capture, while result +artifacts could persist full content capture. The current public run-bundle +contract avoids that persisted trace sidecar surface. ## Audit @@ -91,7 +95,8 @@ The future Opik adapter should consume one of these equivalent inputs: - `summary.json` - per-test `grading.json` - per-test `timing.json` - - per-test `outputs/trace.json` + - per-test `metrics.json` + - per-test `transcript.jsonl` The adapter should emit or upload Opik-native objects only after the AgentV run is complete. diff --git a/docs/adr/0008-normalized-transcript-artifact-contract.md b/docs/adr/0008-normalized-transcript-artifact-contract.md index 8c07e6588..b8711cc43 100644 --- a/docs/adr/0008-normalized-transcript-artifact-contract.md +++ b/docs/adr/0008-normalized-transcript-artifact-contract.md @@ -103,7 +103,9 @@ path fields: Do not use `artifact_pointers` as the discovery path for ordinary per-run transcript sidecars. `artifact_pointers` remains an offload indirection for large -detached payload bytes. +detached transcript payload bytes. AgentV run bundles do not persist or +advertise a public `trace.json`/`trace_path` sidecar; external observability +systems can be correlated through `external_trace` link metadata when available. 
AgentV may derive additional event-oriented projections from `transcript.jsonl` for Dashboard queries, tool-trajectory scoring, OpenTelemetry/OpenInference diff --git a/packages/core/src/evaluation/metrics.ts b/packages/core/src/evaluation/metrics.ts index 4c367d175..c66d1d17b 100644 --- a/packages/core/src/evaluation/metrics.ts +++ b/packages/core/src/evaluation/metrics.ts @@ -2,19 +2,15 @@ * AgentV metrics v1. * * This is a derived per-case executor metrics projection over `EvaluationResult` - * and `agentv.trace.v1`. It aligns with AgentV's case-local `metrics.json` + * and the internal trace envelope. It aligns with AgentV's case-local `metrics.json` * while carrying the compact Vercel-style observability fields. It is not the - * canonical trace store; full detail stays in `trace.json`, ordered - * transcript compatibility rows stay in `transcript.jsonl`, and + * canonical trace store; portable transcript detail stays in `transcript.jsonl`, and * duration/token/cost usage stays in `timing.json`. 
*/ import { z } from 'zod'; import type { Message, ToolCall } from './providers/types.js'; -import { - CANONICAL_TRACE_ARTIFACT_PATH, - METRICS_SCHEMA_VERSION, -} from './result-artifact-contract.js'; +import { METRICS_SCHEMA_VERSION } from './result-artifact-contract.js'; import { EXECUTION_TRACE_SCHEMA_VERSION, type TraceEnvelope } from './trace-envelope.js'; import type { TraceEvent } from './trace.js'; import type { EvaluationResult } from './types.js'; @@ -181,12 +177,10 @@ export const MetricsArtifactWireSchema = z artifact_id: z.string(), trace_id: z.string(), root_span_id: z.string(), - path: z.string(), }) .strict(), source_artifacts: z .object({ - trace_path: z.string().optional(), transcript_path: z.string().optional(), grading_path: z.string().optional(), timing_path: z.string().optional(), @@ -857,14 +851,12 @@ export function buildMetricsArtifact( result: EvaluationResult, envelope: TraceEnvelope, options: { - tracePath?: string; transcriptPath?: string; gradingPath?: string; timingPath?: string; generatedAt?: string; } = {}, ): MetricsArtifactWire { - const tracePath = options.tracePath; return MetricsArtifactWireSchema.parse( dropUndefined({ schema_version: METRICS_SCHEMA_VERSION, @@ -879,10 +871,8 @@ export function buildMetricsArtifact( artifact_id: envelope.artifactId, trace_id: envelope.trace.traceId, root_span_id: envelope.trace.rootSpanId, - path: tracePath ?? CANONICAL_TRACE_ARTIFACT_PATH, }, source_artifacts: dropUndefined({ - trace_path: tracePath, transcript_path: options.transcriptPath, grading_path: options.gradingPath, timing_path: options.timingPath, diff --git a/packages/core/src/evaluation/result-artifact-contract.ts b/packages/core/src/evaluation/result-artifact-contract.ts index 1a7166c21..b15312c77 100644 --- a/packages/core/src/evaluation/result-artifact-contract.ts +++ b/packages/core/src/evaluation/result-artifact-contract.ts @@ -10,7 +10,7 @@ * sidecars should use explicit path fields such as `metrics_path`. 
* * Git remote publishing treats the configured results branch as the - * metadata/control plane and stores transcript or trace payload bytes whose + * metadata/control plane and stores transcript payload bytes whose * `ref` is `agentv/artifacts/v1` on that artifact ref at the published pointer * `key` (`runs//` for the git backend). */ @@ -25,14 +25,12 @@ export const AGENTV_RESULTS_REFS = { oplog: AGENTV_RESULTS_OPLOG_REF, } as const; -export const CANONICAL_TRACE_ARTIFACT_PATH = 'trace.json' as const; export const CANONICAL_TRANSCRIPT_ARTIFACT_PATH = 'transcript.jsonl' as const; export const CANONICAL_METRICS_ARTIFACT_PATH = 'metrics.json' as const; export const TRANSCRIPT_SCHEMA_VERSION = 'agentv.transcript.v1' as const; export const METRICS_SCHEMA_VERSION = 'agentv.metrics.v1' as const; export const TRANSCRIPT_JSONL_MEDIA_TYPE = 'application/x-ndjson' as const; -export const TRACE_JSON_MEDIA_TYPE = 'application/vnd.agentv.trace.v1+json' as const; export const METRICS_JSON_MEDIA_TYPE = 'application/vnd.agentv.metrics.v1+json' as const; export type AgentVResultsRefName = (typeof AGENTV_RESULTS_REFS)[keyof typeof AGENTV_RESULTS_REFS]; @@ -81,7 +79,6 @@ export type TranscriptArtifactPointerWire = ResultArtifactPointerWire & { }; export interface ResultArtifactPointersWire { - readonly trace?: ResultArtifactPointerWire; readonly transcript?: TranscriptArtifactPointerWire; } diff --git a/packages/core/src/evaluation/results-repo.ts b/packages/core/src/evaluation/results-repo.ts index 4379b7e86..96171c666 100644 --- a/packages/core/src/evaluation/results-repo.ts +++ b/packages/core/src/evaluation/results-repo.ts @@ -2969,12 +2969,10 @@ function artifactSidecarPointers(record: unknown): ArtifactSidecarPointer[] { } const pointers: ArtifactSidecarPointer[] = []; - for (const pointer of Object.values(record.artifact_pointers)) { - if (!isRecord(pointer)) { - continue; - } + const pointer = record.artifact_pointers.transcript; + if (isRecord(pointer)) { if (pointer.ref 
!== AGENTV_RESULTS_ARTIFACTS_REF || typeof pointer.path !== 'string') { - continue; + return pointers; } pointers.push({ path: pointer.path, @@ -2995,6 +2993,10 @@ function artifactSidecarKey(destinationPath: string, pointerPath: string): strin ); } +function isDeprecatedTraceArtifactPath(relativePath: string): boolean { + return relativePath === 'trace.json' || relativePath.endsWith('/trace.json'); +} + function collectArtifactSidecarPointers(sourceDir: string): ArtifactSidecarPointer[] { const indexPath = path.join(sourceDir, RESULT_INDEX_FILENAME); if (!existsSync(indexPath)) { @@ -3098,18 +3100,27 @@ function rewritePublishedIndexLine(line: string, destinationPath: string): strin } let changed = false; - for (const pointer of Object.values(record.artifact_pointers)) { - if (!isRecord(pointer)) { - continue; - } - if (pointer.ref !== AGENTV_RESULTS_ARTIFACTS_REF || typeof pointer.path !== 'string') { - continue; - } + const pointer = record.artifact_pointers.transcript; + if ( + isRecord(pointer) && + pointer.ref === AGENTV_RESULTS_ARTIFACTS_REF && + typeof pointer.path === 'string' + ) { const key = artifactSidecarKey(destinationPath, pointer.path); if (pointer.key !== key) { pointer.key = key; changed = true; } + if ( + Object.keys(record.artifact_pointers).length !== 1 || + record.artifact_pointers.transcript !== pointer + ) { + changed = true; + } + record.artifact_pointers = { transcript: pointer }; + } else { + record.artifact_pointers = undefined; + changed = true; } return changed ? 
JSON.stringify(record) : line; @@ -3144,6 +3155,9 @@ async function preparePublishedResultsSource(params: { writeFileSync(destinationFile, rewritten); continue; } + if (isDeprecatedTraceArtifactPath(relativeFile)) { + continue; + } if (omittedPaths.has(relativeFile)) { continue; } diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index e4040aad1..041b91fe6 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -35,25 +35,14 @@ import { import type { Message } from './providers/types.js'; import { extractLastAssistantContent } from './providers/types.js'; import { - AGENTV_RESULTS_ARTIFACTS_REF, CANONICAL_METRICS_ARTIFACT_PATH, - CANONICAL_TRACE_ARTIFACT_PATH, CANONICAL_TRANSCRIPT_ARTIFACT_PATH, - type ResultArtifactFamily, - type ResultArtifactPointerWire, type ResultArtifactPointersWire, - TRACE_JSON_MEDIA_TYPE, - TRANSCRIPT_JSONL_MEDIA_TYPE, - TRANSCRIPT_SCHEMA_VERSION, - type TranscriptArtifactPointerWire, - toResultArtifactPointerWire, } from './result-artifact-contract.js'; import { normalizeResultRow } from './result-row-schema.js'; import { - EXECUTION_TRACE_SCHEMA_VERSION, type TraceEnvelope, buildTraceEnvelopeFromEvaluationResult, - toTraceEnvelopeWire, traceEnvelopeToTranscriptMessages, } from './trace-envelope.js'; import { type TokenUsage, type TraceSummary, buildTraceFromMessages } from './trace.js'; @@ -303,7 +292,6 @@ export interface IndexArtifactEntry { readonly summary_path?: string; readonly output_path?: string; readonly answer_path?: string; - readonly trace_path?: string; readonly transcript_path?: string; readonly transcript_raw_path?: string; readonly metrics_path?: string; @@ -1349,7 +1337,6 @@ function buildTraceEnvelopeSidecar(params: TraceEnvelopeSidecarParams): TraceEnv source: { path: RESULT_INDEX_FILENAME }, capture: { content: 'full', redactionLevel: 'none', redactedFields: [] }, artifacts: { - trace_path: 
CANONICAL_TRACE_ARTIFACT_PATH, answer_path: params.result.output.length > 0 ? 'outputs/answer.md' : undefined, transcript_path: hasTranscript ? CANONICAL_TRANSCRIPT_ARTIFACT_PATH : undefined, metrics_path: CANONICAL_METRICS_ARTIFACT_PATH, @@ -1359,84 +1346,6 @@ function buildTraceEnvelopeSidecar(params: TraceEnvelopeSidecarParams): TraceEnv }); } -async function writeTraceEnvelopeSidecar( - params: TraceEnvelopeSidecarParams, -): Promise { - const envelope = buildTraceEnvelopeSidecar(params); - await writeFile( - path.join(params.testDir, CANONICAL_TRACE_ARTIFACT_PATH), - `${JSON.stringify(toTraceEnvelopeWire(envelope), null, 2)}\n`, - 'utf8', - ); - return envelope; -} - -function buildSidecarArtifactKey(family: ResultArtifactFamily, runRelativePath: string): string { - return path.posix.join(family, runRelativePath); -} - -async function buildArtifactPointer(params: { - readonly filePath: string; - readonly runRelativePath: string; - readonly family: ResultArtifactFamily; - readonly schemaVersion: string; - readonly mediaType: string; -}): Promise { - const content = await readFile(params.filePath); - const sha256 = createHash('sha256').update(content).digest('hex'); - return toResultArtifactPointerWire({ - ref: AGENTV_RESULTS_ARTIFACTS_REF, - key: buildSidecarArtifactKey(params.family, params.runRelativePath), - objectVersion: `sha256:${sha256}`, - path: params.runRelativePath, - sha256, - size: content.byteLength, - schemaVersion: params.schemaVersion, - mediaType: params.mediaType, - family: params.family, - }); -} - -async function buildTracePointer( - outputDir: string, - tracePath: string, -): Promise { - return buildArtifactPointer({ - filePath: tracePath, - runRelativePath: toRelativeArtifactPath(outputDir, tracePath), - family: 'traces', - schemaVersion: EXECUTION_TRACE_SCHEMA_VERSION, - mediaType: TRACE_JSON_MEDIA_TYPE, - }); -} - -async function buildTranscriptPointer( - outputDir: string, - transcriptPath: string, -): Promise { - const pointer = await 
buildArtifactPointer({ - filePath: transcriptPath, - runRelativePath: toRelativeArtifactPath(outputDir, transcriptPath), - family: 'transcripts', - schemaVersion: TRANSCRIPT_SCHEMA_VERSION, - mediaType: TRANSCRIPT_JSONL_MEDIA_TYPE, - }); - return pointer as TranscriptArtifactPointerWire; -} - -async function buildArtifactPointers(params: { - readonly outputDir: string; - readonly tracePath: string; - readonly transcriptPath?: string; -}): Promise { - return { - trace: await buildTracePointer(params.outputDir, params.tracePath), - ...(params.transcriptPath - ? { transcript: await buildTranscriptPointer(params.outputDir, params.transcriptPath) } - : {}), - }; -} - export function buildIndexArtifactEntry( result: EvaluationResult, options: { @@ -1447,7 +1356,6 @@ export function buildIndexArtifactEntry( summaryPath?: string; outputPath?: string; answerPath?: string; - tracePath?: string; transcriptPath?: string; transcriptRawPath?: string; metricsPath?: string; @@ -1497,9 +1405,6 @@ export function buildIndexArtifactEntry( answer_path: options.answerPath ? toRelativeArtifactPath(options.outputDir, options.answerPath) : undefined, - trace_path: options.tracePath - ? toRelativeArtifactPath(options.outputDir, options.tracePath) - : undefined, transcript_path: options.transcriptPath ? toRelativeArtifactPath(options.outputDir, options.transcriptPath) : undefined, @@ -1640,14 +1545,12 @@ function buildMetricsArtifactPayload(params: { readonly result: EvaluationResult; readonly envelope: TraceEnvelope; readonly transcriptPath?: string; - readonly traceArtifactPath?: string; readonly transcriptArtifactPath?: string; readonly gradingArtifactPath?: string; readonly timingArtifactPath?: string | null; readonly timing?: TimingArtifact; }): ReturnType & { readonly timing?: TimingArtifact } { const artifact = buildMetricsArtifact(params.result, params.envelope, { - tracePath: params.traceArtifactPath, transcriptPath: params.transcriptArtifactPath ?? (params.transcriptPath ? 
CANONICAL_TRANSCRIPT_ARTIFACT_PATH : undefined), @@ -1663,7 +1566,6 @@ async function writeMetricsArtifact(params: { readonly result: EvaluationResult; readonly envelope: TraceEnvelope; readonly transcriptPath?: string; - readonly traceArtifactPath?: string; readonly transcriptArtifactPath?: string; readonly gradingArtifactPath?: string; readonly timingArtifactPath?: string | null; diff --git a/packages/core/test/evaluation/results-repo.test.ts b/packages/core/test/evaluation/results-repo.test.ts index 5e7ac501f..bfb418fc9 100644 --- a/packages/core/test/evaluation/results-repo.test.ts +++ b/packages/core/test/evaluation/results-repo.test.ts @@ -266,16 +266,17 @@ function writeRunArtifactsWithPointers( runDir: string, experiment: string, timestamp: string, + options: { readonly includeLegacyTracePointer?: boolean } = {}, ): void { writeRunArtifacts(runDir, experiment, timestamp); const artifactDir = path.join(runDir, 'alpha'); mkdirSync(artifactDir, { recursive: true }); - const traceContent = Buffer.from( - JSON.stringify({ + const legacyTraceContent = Buffer.from( + `${JSON.stringify({ schema_version: 'agentv.trace.v1', test_id: 'alpha', spans: [], - }), + })}\n`, ); const transcriptContent = Buffer.from( `${JSON.stringify({ @@ -285,10 +286,12 @@ function writeRunArtifactsWithPointers( content: 'sidecar transcript', })}\n`, ); - writeFileSync(path.join(artifactDir, 'trace.json'), traceContent); + if (options.includeLegacyTracePointer) { + writeFileSync(path.join(artifactDir, 'trace.json'), legacyTraceContent); + } writeFileSync(path.join(artifactDir, 'transcript.jsonl'), transcriptContent); - const traceSha = sha256Hex(traceContent); + const legacyTraceSha = sha256Hex(legacyTraceContent); const transcriptSha = sha256Hex(transcriptContent); writeFileSync( path.join(runDir, 'index.jsonl'), @@ -296,17 +299,21 @@ function writeRunArtifactsWithPointers( test_id: 'alpha', score: 1, artifact_pointers: { - trace: { - ref: AGENTV_RESULTS_REFS.artifacts, - key: 
'traces/alpha/trace.json', - object_version: `sha256:${traceSha}`, - path: 'alpha/trace.json', - sha256: traceSha, - size: traceContent.byteLength, - schema_version: 'agentv.trace.v1', - media_type: 'application/vnd.agentv.trace.v1+json', - family: 'traces', - }, + ...(options.includeLegacyTracePointer + ? { + trace: { + ref: AGENTV_RESULTS_REFS.artifacts, + key: 'traces/alpha/trace.json', + object_version: `sha256:${legacyTraceSha}`, + path: 'alpha/trace.json', + sha256: legacyTraceSha, + size: legacyTraceContent.byteLength, + schema_version: 'agentv.trace.v1', + media_type: 'application/vnd.agentv.trace.v1+json', + family: 'traces', + }, + } + : {}), transcript: { ref: AGENTV_RESULTS_REFS.artifacts, key: 'transcripts/alpha/transcript.jsonl', @@ -1581,7 +1588,9 @@ describe('results repo write path', () => { ...createResultsConfig(remoteDir, cloneDir), branch: storageBranch, }; - writeRunArtifactsWithPointers(sourceDir, 'sidecar', '2026-06-21T12:00:00.000Z'); + writeRunArtifactsWithPointers(sourceDir, 'sidecar', '2026-06-21T12:00:00.000Z', { + includeLegacyTracePointer: true, + }); await expect( directPushResults({ @@ -1605,7 +1614,7 @@ describe('results repo write path', () => { `git --git-dir "${remoteDir}" ls-tree -r --name-only ${AGENTV_RESULTS_REFS.artifacts}`, rootDir, ); - expect(artifactTree).toContain(`runs/${destinationPath}/alpha/trace.json`); + expect(artifactTree).not.toContain(`runs/${destinationPath}/alpha/trace.json`); expect(artifactTree).toContain(`runs/${destinationPath}/alpha/transcript.jsonl`); expect(artifactTree).not.toContain(`runs/${destinationPath}/summary.json`); expect(artifactTree).not.toContain(`runs/${destinationPath}/index.jsonl`); @@ -1616,6 +1625,8 @@ describe('results repo write path', () => { rootDir, ).toString('utf8'), ); + expect(index.artifact_pointers).not.toHaveProperty('trace'); + expect(index.artifact_pointers).toHaveProperty('transcript'); for (const pointer of Object.values(index.artifact_pointers) as Array<{ key: 
string; path: string; @@ -1705,7 +1716,7 @@ describe('results repo write path', () => { `git --git-dir "${remoteDir}" ls-tree -r --name-only ${AGENTV_RESULTS_REFS.artifacts}`, rootDir, ); - expect(artifactTree).toContain(`runs/${destinationPath}/alpha/trace.json`); + expect(artifactTree).not.toContain(`runs/${destinationPath}/alpha/trace.json`); expect(artifactTree).toContain(`runs/${destinationPath}/alpha/transcript.jsonl`); await expect(