From 25eeb69f81508ef8753aa1004c72c01d7f5c2f88 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 26 Jun 2026 13:32:24 +0200 Subject: [PATCH] fix(results): avoid duplicate raw provider logs --- .../commands/eval/artifact-writer.test.ts | 4 ++-- .../docs/docs/evaluation/running-evals.mdx | 19 +++++++++---------- .../src/content/docs/docs/tools/import.mdx | 8 +++++--- .../src/content/docs/docs/tools/results.mdx | 2 +- .../src/evaluation/providers/log-directory.ts | 9 +++++++-- packages/core/src/evaluation/run-artifacts.ts | 16 ---------------- packages/core/src/evaluation/types.ts | 4 ++-- .../core/test/evaluation/orchestrator.test.ts | 7 ++++--- .../providers/log-directory.test.ts | 16 +++++++++------- 9 files changed, 39 insertions(+), 46 deletions(-) diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 0070b2211..c797e0418 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -1565,7 +1565,7 @@ describe('writeArtifactsFromResults', () => { }); }); - it('copies optional raw provider logs as raw transcript evidence', async () => { + it('writes optional raw provider logs only as raw transcript evidence', async () => { const rawLogPath = path.join(testDir, 'provider-source.log'); const rawLog = [ '# provider-native stream log', @@ -1587,7 +1587,7 @@ describe('writeArtifactsFromResults', () => { await writeArtifactsFromResults(results, testDir); const copiedRawLogPath = path.join(testDir, 'raw-log-case', 'run-1', 'provider.log'); - expect(await readFile(copiedRawLogPath, 'utf8')).toBe(rawLog); + await expect(readFile(copiedRawLogPath, 'utf8')).rejects.toThrow(); const transcriptPath = path.join(testDir, 'raw-log-case', 'run-1', 'transcript-raw.jsonl'); await expect(readFile(transcriptPath, 'utf8')).resolves.toBe(rawLog); diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index 80e624e9a..92b577f9f 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -445,10 +445,9 @@ See the [Import tool docs](/docs/tools/import/) for all providers and options. Each result row's `artifact_dir` is a case-local folder under the timestamped run bundle. It can include `transcript.jsonl`, `transcript-raw.jsonl`, -`provider.log`, `grading.json`, `timing.json`, `metrics.json`, and generated -outputs under `outputs/`. The run root does not contain a mixed transcript -artifact; use each index row's `transcript_path` to find the per-result -transcript. +`grading.json`, `timing.json`, `metrics.json`, and generated outputs under +`outputs/`. The run root does not contain a mixed transcript artifact; use each +index row's `transcript_path` to find the per-result transcript. Rows also include `artifact_pointers` for AgentV-owned artifact storage. Pointer entries such as `artifact_pointers.transcript` carry the storage `ref`, artifact @@ -480,12 +479,12 @@ usage, cost, source metadata, capture state, and trace pointers. Provider-native payloads can appear only inside opaque nested fields such as `metadata`, `source.metadata`, tool `input`, or tool `output`. -When an agent provider captures a native stream or session log, the result row -may also include `raw_provider_log_path`, pointing at -`provider.log`. That file is raw evidence copied byte-for-byte from -the provider log and is not parsed, normalized, or required for replay, import, -Agent Skills conversion, or grading. AgentV does not write or maintain a -parallel `outputs/transcript.json` source of truth. +When an agent provider captures a native stream or session log, AgentV writes +that byte-for-byte evidence to `transcript-raw.jsonl` and records it with +`transcript_raw_path`. New eval runs do not also copy the same stream to +`provider.log`; `raw_provider_log_path` is only a legacy/imported pointer when +older bundles or external sources already provide one. AgentV does not write or +maintain a parallel `outputs/transcript.json` source of truth. Use the transcript when you need a compact portable message/event projection over the trace, including exports to role/content arrays for chat-template or diff --git a/apps/web/src/content/docs/docs/tools/import.mdx b/apps/web/src/content/docs/docs/tools/import.mdx index aa4739722..1846a01e4 100644 --- a/apps/web/src/content/docs/docs/tools/import.mdx +++ b/apps/web/src/content/docs/docs/tools/import.mdx @@ -177,9 +177,11 @@ exports remain replayable. New eval run artifacts write the v1 shape. For eval run artifacts, `transcript.jsonl` is the portable message/event projection. AgentV does not persist a public `trace.json` run sidecar, and the transcript is not a provider-native session dump. Provider-native session or -stream logs, when captured during an eval run, are separate raw evidence -artifacts referenced by `raw_provider_log_path`; Agent Skills import, convert, -transpile, and run paths do not require them. +stream logs, when captured during a new eval run, are preserved in +`transcript-raw.jsonl` and referenced by `transcript_raw_path`; +`raw_provider_log_path` is a legacy/imported pointer when older bundles or +external sources already provide one. Agent Skills import, convert, transpile, +and run paths do not require those legacy log pointers. ## What Gets Parsed diff --git a/apps/web/src/content/docs/docs/tools/results.mdx b/apps/web/src/content/docs/docs/tools/results.mdx index 544f20f9e..920198aa3 100644 --- a/apps/web/src/content/docs/docs/tools/results.mdx +++ b/apps/web/src/content/docs/docs/tools/results.mdx @@ -166,7 +166,7 @@ Agent Skills eval artifacts map into AgentV like this: |----------------------|--------------|-------------------| | Authored `evals/evals.json` cases | AgentV eval cases and task bundle paths | Eval source plus optional `task_dir`, `eval_path`, `targets_path`, `files_path`, and `graders_path` in `index.jsonl` | | Per-case answer | Generated target output artifact | `run-N/outputs/answer.md` | -| Per-attempt sidecars | Normalized transcript, metrics, and raw provider evidence | `run-N/transcript.jsonl`, `run-N/transcript-raw.jsonl`, `run-N/metrics.json`, `provider.log` when present | +| Per-attempt sidecars | Normalized transcript, metrics, and raw provider evidence | `run-N/transcript.jsonl`, `run-N/transcript-raw.jsonl`, `run-N/metrics.json` | | Per-attempt `timing.json` | Duration, token totals, cost, and usage source labels | `run-N/timing.json` | | Per-attempt `grading.json` | Assertions, graders, execution metrics, workspace changes | `run-N/grading.json`; summary fields can reference the same trace/result facts | | Iteration-level `summary.json` | Pass rate, time, tokens, tool calls, cost aggregates | Run-level `summary.json` | diff --git a/packages/core/src/evaluation/providers/log-directory.ts b/packages/core/src/evaluation/providers/log-directory.ts index 110ccdab7..4aa672673 100644 --- a/packages/core/src/evaluation/providers/log-directory.ts +++ b/packages/core/src/evaluation/providers/log-directory.ts @@ -1,3 +1,5 @@ +import { createHash } from 'node:crypto'; +import { tmpdir } from 'node:os'; import path from 'node:path'; import type { ProviderRequest } from './types.js'; @@ -16,6 +18,9 @@ export function resolveDefaultProviderLogDir( ): string | undefined { const runDir = process.env.AGENTV_RUN_DIR?.trim(); if (runDir) { + const runSegment = safePathSegment(path.basename(runDir), 'run'); + const runHash = createHash('sha256').update(path.resolve(runDir)).digest('hex').slice(0, 12); + const captureRoot = path.join(tmpdir(), 'agentv-provider-streams', `${runSegment}-${runHash}`); if (request?.evalCaseId) { const segments = [ request.suite ? safePathSegment(request.suite, 'default') : undefined, @@ -23,9 +28,9 @@ export function resolveDefaultProviderLogDir( 'logs', providerName, ].filter((segment): segment is string => segment !== undefined); - return path.join(path.resolve(runDir), ...segments); + return path.join(captureRoot, ...segments); } - return path.join(path.resolve(runDir), '_logs', providerName); + return path.join(captureRoot, '_logs', providerName); } return undefined; } diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index 0676e7637..ef13c67b0 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -842,10 +842,6 @@ async function writeTrialRunArtifacts(params: { if (answerOutputPath) { await writeFile(answerOutputPath, result.output, 'utf8'); } - const rawProviderLogSource = rawProviderLogSourcePath(result); - if (rawProviderLogSource) { - await copyRawProviderLogArtifact(rawProviderLogSource, runDir); - } if (transcriptPath && transcriptRawPath) { await writeNormalizedTranscriptJsonl(transcriptPath, envelope); await writeRawTranscriptJsonl(transcriptRawPath, result, envelope); @@ -1355,17 +1351,6 @@ function rawProviderLogSourcePath(result: EvaluationResult): string | undefined return sourcePath ? sourcePath : undefined; } -async function copyRawProviderLogArtifact(sourcePath: string, testDir: string): Promise { - const destinationPath = path.join(testDir, 'provider.log'); - if (path.resolve(sourcePath) === path.resolve(destinationPath)) { - return destinationPath; - } - - await mkdir(path.dirname(destinationPath), { recursive: true }); - await copyFile(sourcePath, destinationPath); - return destinationPath; -} - interface TraceEnvelopeSidecarParams { readonly result: EvaluationResult; readonly outputDir: string; @@ -1388,7 +1373,6 @@ function buildTraceEnvelopeSidecar(params: TraceEnvelopeSidecarParams): TraceEnv answer_path: params.result.output.length > 0 ? 'outputs/answer.md' : undefined, transcript_path: hasTranscript ? CANONICAL_TRANSCRIPT_ARTIFACT_PATH : undefined, metrics_path: CANONICAL_METRICS_ARTIFACT_PATH, - raw_provider_log_path: rawProviderLogSourcePath(params.result) ? 'provider.log' : undefined, }, duplicatePolicy: params.duplicatePolicy, }); diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index c5351b976..15a501560 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -1218,8 +1218,8 @@ export interface EvaluationResult { readonly trace: Trace; /** * Optional local provider-native session/stream log captured by a provider. - * Artifact writers copy this byte-for-byte into the run bundle as raw, - * non-canonical evidence and expose only the run-local pointer. + * Artifact writers copy this byte-for-byte into `transcript-raw.jsonl` + * as raw, non-canonical evidence. */ readonly rawProviderLogPath?: string; /** Path to the temporary workspace directory (included on failure for debugging) */ diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 64b2cb6b7..4cb3000d8 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -723,7 +723,7 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, expect(result.failureReasonCode).toBe('provider_error'); }); - it('copies raw provider logs from normal per-case evaluation artifacts', async () => { + it('stores raw provider logs once as transcript-raw evidence', async () => { const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-raw-provider-log-')); const rawLogPath = path.join(tempDir, 'provider-native-session.jsonl'); writeFileSync(rawLogPath, '{"event":"provider-native"}\n', 'utf8'); @@ -752,10 +752,11 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, const artifactDir = path.join(outputDir, 'test-dataset', 'case-1'); const runDir = path.join(artifactDir, 'run-1'); const outputsDir = path.join(runDir, 'outputs'); - expect(readFileSync(path.join(runDir, 'provider.log'), 'utf8')).toBe( + expect(readdirSync(runDir)).not.toContain('provider.log'); + expect(readdirSync(runDir)).toContain('transcript-raw.jsonl'); + expect(readFileSync(path.join(runDir, 'transcript-raw.jsonl'), 'utf8')).toBe( '{"event":"provider-native"}\n', ); - expect(readdirSync(runDir)).toContain('transcript-raw.jsonl'); expect(readdirSync(runDir)).toContain('transcript.jsonl'); expect(readdirSync(outputsDir)).not.toContain('transcript.jsonl'); expect(readdirSync(outputsDir)).not.toContain('transcript.json'); diff --git a/packages/core/test/evaluation/providers/log-directory.test.ts b/packages/core/test/evaluation/providers/log-directory.test.ts index 627140d62..fc9ec1a68 100644 --- a/packages/core/test/evaluation/providers/log-directory.test.ts +++ b/packages/core/test/evaluation/providers/log-directory.test.ts @@ -1,4 +1,6 @@ import { afterEach, describe, expect, it } from 'bun:test'; +import { createHash } from 'node:crypto'; +import { tmpdir } from 'node:os'; import path from 'node:path'; import { resolveDefaultProviderLogDir } from '../../../src/evaluation/providers/log-directory.js'; @@ -14,8 +16,10 @@ describe('resolveDefaultProviderLogDir', () => { } }); - it('places default provider logs inside the case folder for the active run', () => { - process.env.AGENTV_RUN_DIR = path.join('/repo', '.agentv', 'results', 'default', 'run-001'); + it('places default provider stream captures outside the active run bundle', () => { + const runDir = path.join('/repo', '.agentv', 'results', 'default', 'run-001'); + process.env.AGENTV_RUN_DIR = runDir; + const runHash = createHash('sha256').update(path.resolve(runDir)).digest('hex').slice(0, 12); expect( resolveDefaultProviderLogDir('copilot-cli', { @@ -24,11 +28,9 @@ describe('resolveDefaultProviderLogDir', () => { }), ).toBe( path.join( - '/repo', - '.agentv', - 'results', - 'default', - 'run-001', + tmpdir(), + 'agentv-provider-streams', + `run-001-${runHash}`, 'demo-suite', 'case_one', 'logs',