From 25eeb69f81508ef8753aa1004c72c01d7f5c2f88 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 26 Jun 2026 13:32:24 +0200
Subject: [PATCH] fix(results): avoid duplicate raw provider logs

---
 .../commands/eval/artifact-writer.test.ts     |  4 ++--
 .../docs/docs/evaluation/running-evals.mdx    | 19 +++++++++----------
 .../src/content/docs/docs/tools/import.mdx    |  8 +++++---
 .../src/content/docs/docs/tools/results.mdx   |  2 +-
 .../src/evaluation/providers/log-directory.ts |  9 +++++++--
 packages/core/src/evaluation/run-artifacts.ts | 16 ----------------
 packages/core/src/evaluation/types.ts         |  4 ++--
 .../core/test/evaluation/orchestrator.test.ts |  7 ++++---
 .../providers/log-directory.test.ts           | 16 +++++++++-------
 9 files changed, 39 insertions(+), 46 deletions(-)

diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts
index 0070b2211..c797e0418 100644
--- a/apps/cli/test/commands/eval/artifact-writer.test.ts
+++ b/apps/cli/test/commands/eval/artifact-writer.test.ts
@@ -1565,7 +1565,7 @@ describe('writeArtifactsFromResults', () => {
     });
   });
 
-  it('copies optional raw provider logs as raw transcript evidence', async () => {
+  it('writes optional raw provider logs only as raw transcript evidence', async () => {
     const rawLogPath = path.join(testDir, 'provider-source.log');
     const rawLog = [
       '# provider-native stream log',
@@ -1587,7 +1587,7 @@ describe('writeArtifactsFromResults', () => {
     await writeArtifactsFromResults(results, testDir);
 
     const copiedRawLogPath = path.join(testDir, 'raw-log-case', 'run-1', 'provider.log');
-    expect(await readFile(copiedRawLogPath, 'utf8')).toBe(rawLog);
+    await expect(readFile(copiedRawLogPath, 'utf8')).rejects.toThrow();
 
     const transcriptPath = path.join(testDir, 'raw-log-case', 'run-1', 'transcript-raw.jsonl');
     await expect(readFile(transcriptPath, 'utf8')).resolves.toBe(rawLog);
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index 80e624e9a..92b577f9f 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -445,10 +445,9 @@ See the [Import tool docs](/docs/tools/import/) for all providers and options.
 
 Each result row's `artifact_dir` is a case-local folder under the timestamped
 run bundle. It can include `transcript.jsonl`, `transcript-raw.jsonl`,
-`provider.log`, `grading.json`, `timing.json`, `metrics.json`, and generated
-outputs under `outputs/`. The run root does not contain a mixed transcript
-artifact; use each index row's `transcript_path` to find the per-result
-transcript.
+`grading.json`, `timing.json`, `metrics.json`, and generated outputs under
+`outputs/`. The run root does not contain a mixed transcript artifact; use each
+index row's `transcript_path` to find the per-result transcript.
 
 Rows also include `artifact_pointers` for AgentV-owned artifact storage. Pointer
 entries such as `artifact_pointers.transcript` carry the storage `ref`, artifact
@@ -480,12 +479,12 @@ usage, cost, source metadata, capture state, and trace pointers.
 Provider-native payloads can appear only inside opaque nested fields such as
 `metadata`, `source.metadata`, tool `input`, or tool `output`.
 
-When an agent provider captures a native stream or session log, the result row
-may also include `raw_provider_log_path`, pointing at
-`provider.log`. That file is raw evidence copied byte-for-byte from
-the provider log and is not parsed, normalized, or required for replay, import,
-Agent Skills conversion, or grading. AgentV does not write or maintain a
-parallel `outputs/transcript.json` source of truth.
+When an agent provider captures a native stream or session log, AgentV writes
+that byte-for-byte evidence to `transcript-raw.jsonl` and records it with
+`transcript_raw_path`. New eval runs do not also copy the same stream to
+`provider.log`; `raw_provider_log_path` is a legacy/imported pointer, present
+only when older bundles or external sources supply it. AgentV does not write or
+maintain a parallel `outputs/transcript.json` source of truth.
 
 Use the transcript when you need a compact portable message/event projection
 over the trace, including exports to role/content arrays for chat-template or
diff --git a/apps/web/src/content/docs/docs/tools/import.mdx b/apps/web/src/content/docs/docs/tools/import.mdx
index aa4739722..1846a01e4 100644
--- a/apps/web/src/content/docs/docs/tools/import.mdx
+++ b/apps/web/src/content/docs/docs/tools/import.mdx
@@ -177,9 +177,11 @@ exports remain replayable. New eval run artifacts write the v1 shape.
 For eval run artifacts, `transcript.jsonl` is the portable message/event
 projection. AgentV does not persist a public `trace.json` run sidecar, and the
 transcript is not a provider-native session dump. Provider-native session or
-stream logs, when captured during an eval run, are separate raw evidence
-artifacts referenced by `raw_provider_log_path`; Agent Skills import, convert,
-transpile, and run paths do not require them.
+stream logs, when captured during a new eval run, are preserved in
+`transcript-raw.jsonl` and referenced by `transcript_raw_path`;
+`raw_provider_log_path` is a legacy/imported pointer, present only when older
+bundles or external sources supply it. Agent Skills import, convert, transpile,
+and run paths do not require that legacy pointer.
 
 ## What Gets Parsed
 
diff --git a/apps/web/src/content/docs/docs/tools/results.mdx b/apps/web/src/content/docs/docs/tools/results.mdx
index 544f20f9e..920198aa3 100644
--- a/apps/web/src/content/docs/docs/tools/results.mdx
+++ b/apps/web/src/content/docs/docs/tools/results.mdx
@@ -166,7 +166,7 @@ Agent Skills eval artifacts map into AgentV like this:
 |----------------------|--------------|-------------------|
 | Authored `evals/evals.json` cases | AgentV eval cases and task bundle paths | Eval source plus optional `task_dir`, `eval_path`, `targets_path`, `files_path`, and `graders_path` in `index.jsonl` |
 | Per-case answer | Generated target output artifact | `run-N/outputs/answer.md` |
-| Per-attempt sidecars | Normalized transcript, metrics, and raw provider evidence | `run-N/transcript.jsonl`, `run-N/transcript-raw.jsonl`, `run-N/metrics.json`, `provider.log` when present |
+| Per-attempt sidecars | Normalized transcript, metrics, and raw provider evidence | `run-N/transcript.jsonl`, `run-N/transcript-raw.jsonl`, `run-N/metrics.json` |
 | Per-attempt `timing.json` | Duration, token totals, cost, and usage source labels | `run-N/timing.json` |
 | Per-attempt `grading.json` | Assertions, graders, execution metrics, workspace changes | `run-N/grading.json`; summary fields can reference the same trace/result facts |
 | Iteration-level `summary.json` | Pass rate, time, tokens, tool calls, cost aggregates | Run-level `summary.json` |
diff --git a/packages/core/src/evaluation/providers/log-directory.ts b/packages/core/src/evaluation/providers/log-directory.ts
index 110ccdab7..4aa672673 100644
--- a/packages/core/src/evaluation/providers/log-directory.ts
+++ b/packages/core/src/evaluation/providers/log-directory.ts
@@ -1,3 +1,5 @@
+import { createHash } from 'node:crypto';
+import { tmpdir } from 'node:os';
 import path from 'node:path';
 
 import type { ProviderRequest } from './types.js';
@@ -16,6 +18,9 @@ export function resolveDefaultProviderLogDir(
 ): string | undefined {
   const runDir = process.env.AGENTV_RUN_DIR?.trim();
   if (runDir) {
+    const runSegment = safePathSegment(path.basename(runDir), 'run');
+    const runHash = createHash('sha256').update(path.resolve(runDir)).digest('hex').slice(0, 12);
+    const captureRoot = path.join(tmpdir(), 'agentv-provider-streams', `${runSegment}-${runHash}`);
     if (request?.evalCaseId) {
       const segments = [
         request.suite ? safePathSegment(request.suite, 'default') : undefined,
@@ -23,9 +28,9 @@ export function resolveDefaultProviderLogDir(
         'logs',
         providerName,
       ].filter((segment): segment is string => segment !== undefined);
-      return path.join(path.resolve(runDir), ...segments);
+      return path.join(captureRoot, ...segments);
     }
-    return path.join(path.resolve(runDir), '_logs', providerName);
+    return path.join(captureRoot, '_logs', providerName);
   }
   return undefined;
 }
diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts
index 0676e7637..ef13c67b0 100644
--- a/packages/core/src/evaluation/run-artifacts.ts
+++ b/packages/core/src/evaluation/run-artifacts.ts
@@ -842,10 +842,6 @@ async function writeTrialRunArtifacts(params: {
   if (answerOutputPath) {
     await writeFile(answerOutputPath, result.output, 'utf8');
   }
-  const rawProviderLogSource = rawProviderLogSourcePath(result);
-  if (rawProviderLogSource) {
-    await copyRawProviderLogArtifact(rawProviderLogSource, runDir);
-  }
   if (transcriptPath && transcriptRawPath) {
     await writeNormalizedTranscriptJsonl(transcriptPath, envelope);
     await writeRawTranscriptJsonl(transcriptRawPath, result, envelope);
@@ -1355,17 +1351,6 @@ function rawProviderLogSourcePath(result: EvaluationResult): string | undefined
   return sourcePath ? sourcePath : undefined;
 }
 
-async function copyRawProviderLogArtifact(sourcePath: string, testDir: string): Promise<string> {
-  const destinationPath = path.join(testDir, 'provider.log');
-  if (path.resolve(sourcePath) === path.resolve(destinationPath)) {
-    return destinationPath;
-  }
-
-  await mkdir(path.dirname(destinationPath), { recursive: true });
-  await copyFile(sourcePath, destinationPath);
-  return destinationPath;
-}
-
 interface TraceEnvelopeSidecarParams {
   readonly result: EvaluationResult;
   readonly outputDir: string;
@@ -1388,7 +1373,6 @@ function buildTraceEnvelopeSidecar(params: TraceEnvelopeSidecarParams): TraceEnv
       answer_path: params.result.output.length > 0 ? 'outputs/answer.md' : undefined,
       transcript_path: hasTranscript ? CANONICAL_TRANSCRIPT_ARTIFACT_PATH : undefined,
       metrics_path: CANONICAL_METRICS_ARTIFACT_PATH,
-      raw_provider_log_path: rawProviderLogSourcePath(params.result) ? 'provider.log' : undefined,
     },
     duplicatePolicy: params.duplicatePolicy,
   });
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index c5351b976..15a501560 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -1218,8 +1218,8 @@ export interface EvaluationResult {
   readonly trace: Trace;
   /**
    * Optional local provider-native session/stream log captured by a provider.
-   * Artifact writers copy this byte-for-byte into the run bundle as raw,
-   * non-canonical evidence and expose only the run-local pointer.
+   * Artifact writers copy this byte-for-byte into `transcript-raw.jsonl`
+   * as raw, non-canonical evidence.
    */
   readonly rawProviderLogPath?: string;
   /** Path to the temporary workspace directory (included on failure for debugging) */
diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts
index 64b2cb6b7..4cb3000d8 100644
--- a/packages/core/test/evaluation/orchestrator.test.ts
+++ b/packages/core/test/evaluation/orchestrator.test.ts
@@ -723,7 +723,7 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`,
     expect(result.failureReasonCode).toBe('provider_error');
   });
 
-  it('copies raw provider logs from normal per-case evaluation artifacts', async () => {
+  it('stores raw provider logs once as transcript-raw evidence', async () => {
     const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-raw-provider-log-'));
     const rawLogPath = path.join(tempDir, 'provider-native-session.jsonl');
     writeFileSync(rawLogPath, '{"event":"provider-native"}\n', 'utf8');
@@ -752,10 +752,11 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`,
     const artifactDir = path.join(outputDir, 'test-dataset', 'case-1');
     const runDir = path.join(artifactDir, 'run-1');
     const outputsDir = path.join(runDir, 'outputs');
-    expect(readFileSync(path.join(runDir, 'provider.log'), 'utf8')).toBe(
+    expect(readdirSync(runDir)).not.toContain('provider.log');
+    expect(readdirSync(runDir)).toContain('transcript-raw.jsonl');
+    expect(readFileSync(path.join(runDir, 'transcript-raw.jsonl'), 'utf8')).toBe(
       '{"event":"provider-native"}\n',
     );
-    expect(readdirSync(runDir)).toContain('transcript-raw.jsonl');
     expect(readdirSync(runDir)).toContain('transcript.jsonl');
     expect(readdirSync(outputsDir)).not.toContain('transcript.jsonl');
     expect(readdirSync(outputsDir)).not.toContain('transcript.json');
diff --git a/packages/core/test/evaluation/providers/log-directory.test.ts b/packages/core/test/evaluation/providers/log-directory.test.ts
index 627140d62..fc9ec1a68 100644
--- a/packages/core/test/evaluation/providers/log-directory.test.ts
+++ b/packages/core/test/evaluation/providers/log-directory.test.ts
@@ -1,4 +1,6 @@
 import { afterEach, describe, expect, it } from 'bun:test';
+import { createHash } from 'node:crypto';
+import { tmpdir } from 'node:os';
 import path from 'node:path';
 
 import { resolveDefaultProviderLogDir } from '../../../src/evaluation/providers/log-directory.js';
@@ -14,8 +16,10 @@ describe('resolveDefaultProviderLogDir', () => {
     }
   });
 
-  it('places default provider logs inside the case folder for the active run', () => {
-    process.env.AGENTV_RUN_DIR = path.join('/repo', '.agentv', 'results', 'default', 'run-001');
+  it('places default provider stream captures outside the active run bundle', () => {
+    const runDir = path.join('/repo', '.agentv', 'results', 'default', 'run-001');
+    process.env.AGENTV_RUN_DIR = runDir;
+    const runHash = createHash('sha256').update(path.resolve(runDir)).digest('hex').slice(0, 12);
 
     expect(
       resolveDefaultProviderLogDir('copilot-cli', {
@@ -24,11 +28,9 @@ describe('resolveDefaultProviderLogDir', () => {
       }),
     ).toBe(
       path.join(
-        '/repo',
-        '.agentv',
-        'results',
-        'default',
-        'run-001',
+        tmpdir(),
+        'agentv-provider-streams',
+        `run-001-${runHash}`,
         'demo-suite',
         'case_one',
         'logs',