Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .agents/verification.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ Use live dogfood before marking PRs ready when they affect eval execution, exper
- Live means both sides are real: a live agent/provider target and a live grader target. Do not count `mock`, `--dry-run`, or deterministic-only assertions as dogfood for these changes.
- Prefer the smallest realistic eval: one or two cases, bounded timeouts, and `workers: 1` for heavyweight agent providers.
- For native experiment changes, run through `agentv eval run ... --experiment <experiment.yaml|ts>` so resolution, setup, scripts, target selection, run knobs, and artifact metadata are exercised together.
- For repeat-run changes, use an experiment-level repeat config with `count >= 2` and `early_exit: false` when validating that all attempts are persisted. Inspect root `index.jsonl`, root `benchmark.json`, and the repeated case folder. The repeated case folder should carry aggregate `summary.json` with flattened snake_case timing fields plus AgentV aggregate `grading.json`; attempt-specific outputs and transcripts live under `run-N/`. Each `run-N/` folder should contain `result.json`, `grading.json`, `transcript.json`, `transcript-raw.jsonl`, and `outputs/answer.md`. Do not write per-run `metrics.json`; timing and observability (o11y) fields belong in `result.json`, and `result.json` points at `./grading.json` through `grading_path`.
- For repeat-run changes, use an experiment-level repeat config with `count >= 2` and `early_exit: false` when validating that all attempts are persisted. Inspect root `index.jsonl`, root `benchmark.json`, and the repeated case folder. The repeated case folder should carry aggregate `summary.json` with flattened snake_case timing fields plus AgentV aggregate `grading.json`; attempt-specific outputs, transcripts, and metrics live under `run-N/`. Each `run-N/` folder should contain `result.json`, `grading.json`, `metrics.json`, `transcript.jsonl`, `transcript-raw.jsonl`, and, when answer output is available, `outputs/answer.md`. `result.json` should point at `./grading.json`, `./metrics.json`, `./transcript.jsonl`, and `./transcript-raw.jsonl` through the corresponding path fields.
- For local OpenAI-compatible grading through the OAuth proxy, use `endpoint: http://127.0.0.1:10531/v1`, but still route `api_key` and `model` through environment references such as `${{ LOCAL_OPENAI_PROXY_API_KEY }}` and `${{ LOCAL_OPENAI_PROXY_MODEL }}`. Literal secrets and literal model values are intentionally rejected by target validation unless a resolver explicitly allows them.
- Preserve review evidence in `agentv-private` on an `evidence/<bead-or-feature-slug>` branch. Include the run bundle, source eval/experiment/targets files, a short README, an artifact tree, and screenshots when folder structure or UI behavior is under review.
- If comparing against an external convention such as Vercel `agent-eval`, verify both semantic provenance and the physical `run-N` artifact layout for repeat runs.
Expand Down
1 change: 1 addition & 0 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ export function buildIndexArtifactEntry(
answerPath?: string;
tracePath?: string;
transcriptPath?: string;
transcriptRawPath?: string;
metricsPath?: string;
rawProviderLogPath?: string;
responsePath?: string;
Expand Down
1 change: 1 addition & 0 deletions apps/cli/src/commands/results/combine-run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,7 @@ const MANIFEST_PATH_FIELDS = [
'response_path',
'trace_path',
'transcript_path',
'transcript_raw_path',
'metrics_path',
'raw_provider_log_path',
'task_dir',
Expand Down
83 changes: 39 additions & 44 deletions apps/cli/src/commands/results/manifest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@ import {
type ExternalTraceMetadataWire,
type ResultArtifactPointersWire,
type TraceSummary,
type TranscriptJsonLine,
buildTraceFromMessages,
fromTraceEnvelopeWire,
toCamelCaseDeep,
traceFromTranscriptJsonLines,
traceEnvelopeToTraceSummary,
traceEnvelopeToTranscriptMessages,
} from '@agentv/core';

import type { GradingArtifact, TimingArtifact } from '../eval/artifact-writer.js';
Expand Down Expand Up @@ -55,6 +56,7 @@ export interface ResultManifestRecord {
readonly answer_path?: string;
readonly trace_path?: string;
readonly transcript_path?: string;
readonly transcript_raw_path?: string;
readonly metrics_path?: string;
readonly raw_provider_log_path?: string;
readonly artifact_pointers?: ResultArtifactPointersWire;
Expand Down Expand Up @@ -86,6 +88,7 @@ export type ArtifactPointer =

export interface ArtifactPointerMap {
readonly transcript_path?: string;
readonly transcript_raw_path?: string;
readonly answer_path?: string;
readonly transcript?: ArtifactPointer;
readonly answer?: ArtifactPointer;
Expand All @@ -101,14 +104,6 @@ export interface ManifestHydrationOptions {
readonly hydrateTranscriptTrace?: boolean;
}

/**
 * Parses JSON Lines text into an array with one entry per non-blank line.
 *
 * The input is split on LF or CRLF, each line is trimmed, and lines that are
 * empty after trimming are skipped. Every remaining line is handed to
 * `JSON.parse`; the parsed value is asserted to `T` without any runtime
 * validation, and malformed JSON propagates the `SyntaxError` to the caller.
 */
function parseJsonlLines<T>(content: string): T[] {
  const parsedRows: T[] = [];
  for (const rawLine of content.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      continue;
    }
    parsedRows.push(JSON.parse(candidate) as T);
  }
  return parsedRows;
}

function parseResultRows(content: string, sourceLabel?: string): ResultManifestRecord[] {
return content
.split(/\r?\n/)
Expand Down Expand Up @@ -162,33 +157,6 @@ function readOptionalJson<T>(baseDir: string, relativePath: string | undefined):
}
}

/**
 * Returns `value` unchanged (not trimmed) when it is a string containing at
 * least one non-whitespace character; returns `undefined` for every other input.
 */
function nonEmptyString(value: unknown): string | undefined {
  if (typeof value !== 'string') {
    return undefined;
  }
  const hasContent = value.trim().length > 0;
  return hasContent ? value : undefined;
}

/**
 * Extracts a usable path string from an artifact pointer.
 *
 * A string pointer is returned as-is when `nonEmptyString` accepts it. For an
 * object pointer the `path`, `artifact_path`, and `relative_path` fields are
 * tried in that order and the first one accepted by `nonEmptyString` wins.
 * Returns `undefined` when the pointer is missing or no candidate qualifies.
 */
function artifactPointerPath(pointer: ArtifactPointer | undefined): string | undefined {
  if (typeof pointer === 'string') {
    return nonEmptyString(pointer);
  }
  if (!pointer) {
    return undefined;
  }

  const fromPath = nonEmptyString(pointer.path);
  if (fromPath !== undefined) {
    return fromPath;
  }

  const fromArtifactPath = nonEmptyString(pointer.artifact_path);
  if (fromArtifactPath !== undefined) {
    return fromArtifactPath;
  }

  return nonEmptyString(pointer.relative_path);
}

/**
 * Picks the transcript path for a manifest record.
 *
 * Sources are consulted in priority order, and the first one that is neither
 * `null` nor `undefined` is returned:
 *   1. `record.transcript_path`
 *   2. `record.artifact_pointers.transcript.path`
 *   3. `record.artifacts.transcript_path`
 *   4. `artifactPointerPath` applied to `record.transcript`, or to
 *      `record.artifacts.transcript` when the former is absent.
 */
function resolveTranscriptPath(record: ResultManifestRecord): string | undefined {
  const directPath = record.transcript_path;
  if (directPath != null) {
    return directPath;
  }

  const pointerPath = record.artifact_pointers?.transcript?.path;
  if (pointerPath != null) {
    return pointerPath;
  }

  const artifactsPath = record.artifacts?.transcript_path;
  if (artifactsPath != null) {
    return artifactsPath;
  }

  const legacyPointer = record.transcript ?? record.artifacts?.transcript;
  return artifactPointerPath(legacyPointer);
}

function hydrateInput(
baseDir: string,
record: ResultManifestRecord,
Expand Down Expand Up @@ -217,19 +185,46 @@ function hydrateOutput(
return responseText.trimEnd();
}

/**
 * Best-effort reconstruction of an evaluation trace from the trace envelope
 * referenced by `record.trace_path`.
 *
 * The envelope is loaded with `readOptionalJson`; a falsy result yields
 * `undefined`. Otherwise the wire value is decoded, summarized, and fed to
 * `buildTraceFromMessages` together with the hydrated final output. Target and
 * test id prefer the manifest record and fall back to the envelope's `eval`
 * metadata. Any error thrown while decoding or building is swallowed and
 * reported as `undefined` so the caller can fall back to another trace source.
 */
function hydrateTraceEnvelope(
  baseDir: string,
  record: ResultManifestRecord,
): EvaluationResult['trace'] | undefined {
  const rawEnvelope = readOptionalJson<unknown>(baseDir, record.trace_path);
  if (!rawEnvelope) {
    return undefined;
  }

  try {
    const decoded = fromTraceEnvelopeWire(rawEnvelope);
    const totals = traceEnvelopeToTraceSummary(decoded);
    const messages = traceEnvelopeToTranscriptMessages(decoded);
    const finalOutput = hydrateOutput(baseDir, record);

    return buildTraceFromMessages({
      output: messages,
      summary: totals.trace,
      finalOutput,
      tokenUsage: totals.tokenUsage,
      costUsd: totals.costUsd,
      durationMs: totals.durationMs,
      startTime: totals.startTime,
      endTime: totals.endTime,
      provider: decoded.source.provider,
      target: record.target ?? decoded.eval.target,
      testId: record.test_id ?? decoded.eval.testId,
      conversationId: decoded.eval.runId,
    });
  } catch {
    // Deliberate best-effort: an unreadable or malformed envelope means "no trace".
    return undefined;
  }
}

function hydrateTrace(
baseDir: string,
record: ResultManifestRecord,
options: ManifestHydrationOptions,
): EvaluationResult['trace'] {
if (options.hydrateTranscriptTrace !== false) {
const transcriptText = readOptionalText(baseDir, resolveTranscriptPath(record));
if (transcriptText) {
try {
return traceFromTranscriptJsonLines(parseJsonlLines<TranscriptJsonLine>(transcriptText));
} catch {
// Fall through to a minimal trace below.
}
const trace = hydrateTraceEnvelope(baseDir, record);
if (trace) {
return trace;
}
}

Expand Down
2 changes: 2 additions & 0 deletions apps/cli/src/commands/results/projection-bundle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ export type ProjectionBundleArtifactRefs = Partial<
| 'output_path'
| 'answer_path'
| 'transcript_path'
| 'transcript_raw_path'
| 'metrics_path'
| 'task_dir'
| 'eval_path'
Expand Down Expand Up @@ -178,6 +179,7 @@ function artifactRefs(
output_path: indexEntry.output_path,
answer_path: indexEntry.answer_path,
transcript_path: indexEntry.transcript_path,
transcript_raw_path: indexEntry.transcript_raw_path,
metrics_path: indexEntry.metrics_path,
trace_path: tracePathFor(indexEntry),
task_dir: indexEntry.task_dir,
Expand Down
6 changes: 5 additions & 1 deletion apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -858,6 +858,7 @@ function buildResultArtifactCatalog(
addDirectArtifactCatalogEntry(entries, seen, record.response_path, 'artifact');
addDirectArtifactCatalogEntry(entries, seen, record.answer_path, 'answer');
addDirectArtifactCatalogEntry(entries, seen, record.transcript_path, 'transcript');
addDirectArtifactCatalogEntry(entries, seen, record.transcript_raw_path, 'artifact');
addDirectArtifactCatalogEntry(entries, seen, recordWithTrace.trace_path, 'trace');
addDirectArtifactCatalogEntry(entries, seen, record.eval_path, 'artifact');
addDirectArtifactCatalogEntry(entries, seen, record.targets_path, 'artifact');
Expand Down Expand Up @@ -1153,7 +1154,8 @@ function buildRepeatTrialReadModels(
const metricsPath = caseTrialArtifactPath(artifactDir, runPath, 'metrics.json');
const timingPath = caseTrialArtifactPath(artifactDir, runPath, 'timing.json');
const gradingPath = caseTrialArtifactPath(artifactDir, runPath, 'grading.json');
const transcriptPath = caseTrialArtifactPath(artifactDir, runPath, 'transcript-raw.jsonl');
const transcriptPath = caseTrialArtifactPath(artifactDir, runPath, 'transcript.jsonl');
const transcriptRawPath = caseTrialArtifactPath(artifactDir, runPath, 'transcript-raw.jsonl');
const answerPath = caseTrialArtifactPath(artifactDir, runPath, 'outputs/answer.md');
const metrics = readArtifactJsonObject(baseDir, metricsPath);
const timing = readArtifactJsonObject(baseDir, timingPath);
Expand All @@ -1180,6 +1182,7 @@ function buildRepeatTrialReadModels(
...(timingPath && { timing_path: timingPath }),
...(gradingPath && { grading_path: gradingPath }),
...(transcriptPath && { transcript_path: transcriptPath }),
...(transcriptRawPath && { transcript_raw_path: transcriptRawPath }),
...(answerPath && { answer_path: answerPath }),
};
});
Expand All @@ -1203,6 +1206,7 @@ function attachRunDetailReadModelFields<T extends Record<string, unknown>>(
...(record.timing_path && { timing_path: record.timing_path }),
...(record.metrics_path && { metrics_path: record.metrics_path }),
...(record.transcript_path && { transcript_path: record.transcript_path }),
...(record.transcript_raw_path && { transcript_raw_path: record.transcript_raw_path }),
...(record.output_path && { output_path: record.output_path }),
...(record.answer_path && { answer_path: record.answer_path }),
...(trials && { trials }),
Expand Down
Loading
Loading