Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
},
"dependencies": {
"@anthropic-ai/claude-agent-sdk": "^0.2.49",
"@github/copilot-sdk": "^0.1.25",
"@github/copilot-sdk": "^1.0.3",
"@hono/node-server": "^1.19.11",
"@inquirer/prompts": "^8.2.1",
"@earendil-works/pi-ai": "^0.74.0",
Expand Down
24 changes: 14 additions & 10 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"files": ["dist", "README.md"],
"dependencies": {
"@agentclientprotocol/sdk": "^0.14.1",
"@github/copilot-sdk": "^0.1.25",
"@github/copilot-sdk": "^1.0.3",
"@earendil-works/pi-ai": "^0.74.0",
"@openai/codex-sdk": "^0.136.0",
"fast-glob": "^3.3.3",
Expand Down
137 changes: 71 additions & 66 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2029,78 +2029,81 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati

const providerError = extractProviderError(providerResponse);

// Execute target after_each hook (runs before workspace after_each)
const targetAfterEachHook = options.targetHooks?.after_each;
if (workspacePath && hasHookCommand(targetAfterEachHook)) {
const scriptContext: ScriptExecutionContext = {
workspacePath,
testId: evalCase.id,
evalRunId: evalRunId ?? '',
caseInput: evalCase.question,
caseMetadata: evalCase.metadata,
evalDir,
workspaceFileDir: evalCase.workspace?.workspaceFileDir,
};
try {
await executeWorkspaceScript(
toScriptConfig(targetAfterEachHook, 'after_each', `target hook for '${evalCase.id}'`),
scriptContext,
'warn',
);
} catch {
// target after_each failures are non-fatal
const runAfterEachHooks = async () => {
// Execute target after_each hook before workspace after_each/reset.
const targetAfterEachHook = options.targetHooks?.after_each;
if (workspacePath && hasHookCommand(targetAfterEachHook)) {
const scriptContext: ScriptExecutionContext = {
workspacePath,
testId: evalCase.id,
evalRunId: evalRunId ?? '',
caseInput: evalCase.question,
caseMetadata: evalCase.metadata,
evalDir,
workspaceFileDir: evalCase.workspace?.workspaceFileDir,
};
try {
await executeWorkspaceScript(
toScriptConfig(targetAfterEachHook, 'after_each', `target hook for '${evalCase.id}'`),
scriptContext,
'warn',
);
} catch {
// target after_each failures are non-fatal
}
}
}

// Reset workspace state before after_each hook (if configured)
if (
caseHooksEnabled &&
workspacePath &&
evalCase.workspace?.hooks?.after_each?.reset &&
evalCase.workspace.hooks.after_each.reset !== 'none'
) {
try {
if (repoManager && evalCase.workspace.repos?.length) {
await repoManager.reset(
evalCase.workspace.repos,
workspacePath,
evalCase.workspace.hooks.after_each.reset,
);
} else {
await resetWorkspaceRoot(
workspacePath,
evalCase.workspace.hooks.after_each.reset,
baselineCommit,
);
// Reset workspace state before after_each hook (if configured), but only
// after graders have inspected the agent-modified workspace.
if (
caseHooksEnabled &&
workspacePath &&
evalCase.workspace?.hooks?.after_each?.reset &&
evalCase.workspace.hooks.after_each.reset !== 'none'
) {
try {
if (repoManager && evalCase.workspace.repos?.length) {
await repoManager.reset(
evalCase.workspace.repos,
workspacePath,
evalCase.workspace.hooks.after_each.reset,
);
} else {
await resetWorkspaceRoot(
workspacePath,
evalCase.workspace.hooks.after_each.reset,
baselineCommit,
);
}
} catch {
// Reset failures are non-fatal (like after_each)
}
} catch {
// Reset failures are non-fatal (like after_each)
}
}

// Execute after_each hook (runs after evaluation, before cleanup)
const caseAfterEachHook = evalCase.workspace?.hooks?.after_each;
if (workspacePath && caseHooksEnabled && hasHookCommand(caseAfterEachHook)) {
const afterEachHook = caseAfterEachHook;
const scriptContext: ScriptExecutionContext = {
workspacePath,
testId: evalCase.id,
evalRunId: evalRunId ?? '',
caseInput: evalCase.question,
caseMetadata: evalCase.metadata,
evalDir,
workspaceFileDir: evalCase.workspace?.workspaceFileDir,
};
try {
afterEachOutput = await executeWorkspaceScript(
toScriptConfig(afterEachHook, 'after_each', `test '${evalCase.id}'`),
scriptContext,
'warn',
);
} catch {
// after_each failures are non-fatal
// Execute after_each hook (runs after grading, before cleanup)
const caseAfterEachHook = evalCase.workspace?.hooks?.after_each;
if (workspacePath && caseHooksEnabled && hasHookCommand(caseAfterEachHook)) {
const afterEachHook = caseAfterEachHook;
const scriptContext: ScriptExecutionContext = {
workspacePath,
testId: evalCase.id,
evalRunId: evalRunId ?? '',
caseInput: evalCase.question,
caseMetadata: evalCase.metadata,
evalDir,
workspaceFileDir: evalCase.workspace?.workspaceFileDir,
};
try {
afterEachOutput = await executeWorkspaceScript(
toScriptConfig(afterEachHook, 'after_each', `test '${evalCase.id}'`),
scriptContext,
'warn',
);
} catch {
// after_each failures are non-fatal
}
}
}
};

try {
const result = await evaluateCandidate({
Expand Down Expand Up @@ -2133,6 +2136,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
threshold: evalCase.threshold ?? caseThreshold,
dependencyResults,
});
await runAfterEachHooks();

const effectiveThreshold = evalCase.threshold ?? caseThreshold;
const totalDurationMs = Date.now() - caseStartMs;
Expand Down Expand Up @@ -2236,6 +2240,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati

return finalResult;
} catch (error) {
await runAfterEachHooks().catch(() => {});
const evalRun = { durationMs: Date.now() - caseStartMs };
const errorResult = buildErrorResult(
evalCase,
Expand Down
41 changes: 39 additions & 2 deletions packages/core/src/evaluation/providers/codex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,12 @@ export class CodexProvider implements Provider {
if (this.config.executable) {
codexOptions.codexPathOverride = this.config.executable;
}
if (this.config.model) {
codexOptions.config = { model: this.config.model };
if (this.config.apiKey) {
codexOptions.apiKey = this.config.apiKey;
}
const codexConfig = this.buildCodexConfig();
if (Object.keys(codexConfig).length > 0) {
codexOptions.config = codexConfig;
}

const codex = new sdk.Codex(codexOptions);
Expand All @@ -92,6 +96,12 @@ export class CodexProvider implements Provider {
if (this.config.modelReasoningEffort) {
threadOptions.modelReasoningEffort = this.config.modelReasoningEffort;
}
if (this.config.sandboxMode) {
threadOptions.sandboxMode = this.config.sandboxMode;
}
if (this.config.approvalPolicy) {
threadOptions.approvalPolicy = this.config.approvalPolicy;
}

const thread = codex.startThread(threadOptions);

Expand Down Expand Up @@ -172,6 +182,33 @@ export class CodexProvider implements Provider {
}
}

/**
 * Collects the Codex config overrides derived from this provider's settings.
 *
 * Keys are only written for settings that are truthy on `this.config`:
 * - `model` is copied to `model`.
 * - `modelVerbosity` is copied to `model_verbosity`.
 * - `baseUrl` without `apiFormat` is copied to `openai_base_url`.
 * - `baseUrl` together with `apiFormat` registers a custom provider entry
 *   named `agentv-openai` under `model_providers` and selects it through
 *   `model_provider`.
 *
 * `apiFormat` on its own (no `baseUrl`) contributes nothing.
 *
 * @returns A plain record of overrides; empty when no relevant setting is present.
 */
private buildCodexConfig(): Record<string, unknown> {
  const { model, modelVerbosity, baseUrl, apiFormat } = this.config;
  const overrides: Record<string, unknown> = {};

  if (model) {
    overrides.model = model;
  }
  if (modelVerbosity) {
    overrides.model_verbosity = modelVerbosity;
  }

  // Endpoint settings only apply when a base URL was supplied.
  if (!baseUrl) {
    return overrides;
  }

  // No explicit wire format: just override the base URL.
  if (!apiFormat) {
    overrides.openai_base_url = baseUrl;
    return overrides;
  }

  // Explicit wire format: declare a dedicated provider entry and select it.
  const providerId = 'agentv-openai';
  overrides.model_provider = providerId;
  overrides.model_providers = {
    [providerId]: {
      name: 'OpenAI-compatible endpoint',
      base_url: baseUrl,
      // NOTE(review): presumably the env var Codex reads the API key from — confirm against the Codex config reference.
      env_key: 'CODEX_API_KEY',
      wire_api: apiFormat,
    },
  };
  return overrides;
}

private async runStreamedWithEvents(
// biome-ignore lint/suspicious/noExplicitAny: SDK thread type is dynamically loaded
thread: any,
Expand Down
Loading
Loading