From f9fcad947803ab03d0d2b390502b08a5a3fd380d Mon Sep 17 00:00:00 2001 From: AgentV Results Date: Mon, 22 Jun 2026 09:02:35 +0200 Subject: [PATCH 1/2] fix(providers): support copilot sdk openai endpoint runs Entire-Checkpoint: eaa4959833ab --- apps/cli/package.json | 2 +- bun.lock | 24 +-- packages/core/package.json | 2 +- packages/core/src/evaluation/orchestrator.ts | 137 +++++++++-------- .../core/src/evaluation/providers/codex.ts | 41 ++++- .../src/evaluation/providers/copilot-sdk.ts | 73 ++++++++- .../core/src/evaluation/providers/targets.ts | 141 ++++++++++++++++++ .../core/src/evaluation/providers/types.ts | 5 + .../validation/targets-validator.ts | 11 ++ .../evaluation/providers/codex-sdk.test.ts | 58 +++++++ .../evaluation/providers/copilot-sdk.test.ts | 74 ++++++++- .../test/evaluation/providers/targets.test.ts | 75 ++++++++++ .../validation/targets-validator.test.ts | 27 ++++ 13 files changed, 578 insertions(+), 92 deletions(-) diff --git a/apps/cli/package.json b/apps/cli/package.json index 6572d7193..3a2fa64d1 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -29,7 +29,7 @@ }, "dependencies": { "@anthropic-ai/claude-agent-sdk": "^0.2.49", - "@github/copilot-sdk": "^0.1.25", + "@github/copilot-sdk": "^1.0.3", "@hono/node-server": "^1.19.11", "@inquirer/prompts": "^8.2.1", "@earendil-works/pi-ai": "^0.74.0", diff --git a/bun.lock b/bun.lock index 366f850c9..50d9c3a85 100644 --- a/bun.lock +++ b/bun.lock @@ -27,7 +27,7 @@ "dependencies": { "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@earendil-works/pi-ai": "^0.74.0", - "@github/copilot-sdk": "^0.1.25", + "@github/copilot-sdk": "^1.0.3", "@hono/node-server": "^1.19.11", "@inquirer/prompts": "^8.2.1", "@openai/codex-sdk": "^0.136.0", @@ -89,7 +89,7 @@ "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", "@earendil-works/pi-ai": "^0.74.0", - "@github/copilot-sdk": "^0.1.25", + "@github/copilot-sdk": "^1.0.3", "@openai/codex-sdk": "^0.136.0", "fast-glob": "^3.3.3", "json5": "^2.2.3", @@ -398,21 +398,25 @@ "@expressive-code/plugin-text-markers": ["@expressive-code/plugin-text-markers@0.41.6", "", { "dependencies": { "@expressive-code/core": "^0.41.6" } }, "sha512-PBFa1wGyYzRExMDzBmAWC6/kdfG1oLn4pLpBeTfIRrALPjcGA/59HP3e7q9J0Smk4pC7U+lWkA2LHR8FYV8U7Q=="], - "@github/copilot": ["@github/copilot@0.0.411", "", { "optionalDependencies": { "@github/copilot-darwin-arm64": "0.0.411", "@github/copilot-darwin-x64": "0.0.411", "@github/copilot-linux-arm64": "0.0.411", "@github/copilot-linux-x64": "0.0.411", "@github/copilot-win32-arm64": "0.0.411", "@github/copilot-win32-x64": "0.0.411" }, "bin": { "copilot": "npm-loader.js" } }, "sha512-I3/7gw40Iu1O+kTyNPKJHNqDRyOebjsUW6wJsvSVrOpT0TNa3/lfm8xdS2XUuJWkp+PgEG/PRwF7u3DVNdP7bQ=="], + "@github/copilot": ["@github/copilot@1.0.64-1", "", { "dependencies": { "detect-libc": "^2.1.2" }, "optionalDependencies": { "@github/copilot-darwin-arm64": "1.0.64-1", "@github/copilot-darwin-x64": "1.0.64-1", "@github/copilot-linux-arm64": "1.0.64-1", "@github/copilot-linux-x64": "1.0.64-1", "@github/copilot-linuxmusl-arm64": "1.0.64-1", "@github/copilot-linuxmusl-x64": "1.0.64-1", "@github/copilot-win32-arm64": "1.0.64-1", "@github/copilot-win32-x64": "1.0.64-1" }, "bin": { "copilot": "npm-loader.js" } }, "sha512-lojV4Cb7oT4VJnYPEKBRH8KI3W43Q4Lh0Pc+V6sej+xjPJkoqwm68sNKn73/p3wXPBSTVTzPeCm9WhIisgf1Jw=="], - "@github/copilot-darwin-arm64": ["@github/copilot-darwin-arm64@0.0.411", "", { "os": "darwin", "cpu": "arm64", "bin": { "copilot-darwin-arm64": "copilot" } }, "sha512-dtr+iHxTS4f8HlV2JT9Fp0FFoxuiPWCnU3XGmrHK+rY6bX5okPC2daU5idvs77WKUGcH8yHTZtfbKYUiMxKosw=="], + "@github/copilot-darwin-arm64": ["@github/copilot-darwin-arm64@1.0.64-1", "", { "os": "darwin", "cpu": "arm64", "bin": { "copilot-darwin-arm64": "copilot" } }, "sha512-MQHZT9LhmCiq+ogO1E8cPCWrurZ6x+r9lPJfYUSnOyMO+EHbREpiJwOOChxtLHgL2/tKJSZdId2pg3tDgUlcsw=="], - "@github/copilot-darwin-x64": ["@github/copilot-darwin-x64@0.0.411", "", { "os": "darwin", "cpu": "x64", "bin": { "copilot-darwin-x64": "copilot" } }, "sha512-zhdbQCbPi1L4iHClackSLx8POfklA+NX9RQLuS48HlKi/0KI/JlaDA/bdbIeMR79wjif5t9gnc/m+RTVmHlRtA=="], + "@github/copilot-darwin-x64": ["@github/copilot-darwin-x64@1.0.64-1", "", { "os": "darwin", "cpu": "x64", "bin": { "copilot-darwin-x64": "copilot" } }, "sha512-kOQY7CvI7He0eO3ObQAHePWdkNLWAOegCSzUqUmdcpa1SNVqbZ3GBMsQ7uAZQip2cQxnGZ7pS1v6tKQ0HJdkYw=="], - "@github/copilot-linux-arm64": ["@github/copilot-linux-arm64@0.0.411", "", { "os": "linux", "cpu": "arm64", "bin": { "copilot-linux-arm64": "copilot" } }, "sha512-oZYZ7oX/7O+jzdTUcHkfD1A8YnNRW6mlUgdPjUg+5rXC43bwIdyatAnc0ObY21m9h8ghxGqholoLhm5WnGv1LQ=="], + "@github/copilot-linux-arm64": ["@github/copilot-linux-arm64@1.0.64-1", "", { "os": "linux", "cpu": "arm64", "bin": { "copilot-linux-arm64": "copilot" } }, "sha512-hIfuO7Q+pWs0SKfIRYqT+CjMaupudnhp4RMS6XoJ5s/e33rvpj2tkTkXYlHJo1PMDI823vvbqgpEdr+KeewMwg=="], - "@github/copilot-linux-x64": ["@github/copilot-linux-x64@0.0.411", "", { "os": "linux", "cpu": "x64", "bin": { "copilot-linux-x64": "copilot" } }, "sha512-nnXrKANmmGnkwa3ROlKdAhVNOx8daeMSE8Xh0o3ybKckFv4s38blhKdcxs0RJQRxgAk4p7XXGlDDKNRhurqF1g=="], + "@github/copilot-linux-x64": ["@github/copilot-linux-x64@1.0.64-1", "", { "os": "linux", "cpu": "x64", "bin": { "copilot-linux-x64": "copilot" } }, "sha512-VHaE62pha0rDDvuNN3bd97gf0EZ+EJebstM1ejHsMYoPT1IOUkYEXlNfGGHY+GfUGYxAiy/+Uew4xw5mJyy/Sw=="], - "@github/copilot-sdk": ["@github/copilot-sdk@0.1.25", "", { "dependencies": { "@github/copilot": "^0.0.411", "vscode-jsonrpc": "^8.2.1", "zod": "^4.3.6" } }, "sha512-hIgYLPXzWw9bNgrsD5BLKmgVH20ow5Or5UyVXfVe3YgeiaTgFxC4jWSAVHLGB6ufHZUrvbjppcq2dWK63FmDRA=="], + "@github/copilot-linuxmusl-arm64": ["@github/copilot-linuxmusl-arm64@1.0.64-1", "", { "os": "linux", "cpu": "arm64", "bin": { "copilot-linuxmusl-arm64": "copilot" } }, "sha512-L/YrZPotRujAP0QERq+DlkR1SLr7abbTSz/56JqKKOqEdjKZPdQW1bUlhL/w1CZg1gXlTNUsNVyKz/fUfrEBgw=="], - "@github/copilot-win32-arm64": ["@github/copilot-win32-arm64@0.0.411", "", { "os": "win32", "cpu": "arm64", "bin": { "copilot-win32-arm64": "copilot.exe" } }, "sha512-h+Bovb2YVCQSeELZOO7zxv8uht45XHcvAkFbRsc1gf9dl109sSUJIcB4KAhs8Aznk28qksxz7kvdSgUWyQBlIA=="], + "@github/copilot-linuxmusl-x64": ["@github/copilot-linuxmusl-x64@1.0.64-1", "", { "os": "linux", "cpu": "x64", "bin": { "copilot-linuxmusl-x64": "copilot" } }, "sha512-AGMjXqR128oyjiJhoI6Gd7JP5ddWkib+P4YH/JoHm05iNn23ZYl4tSc0XihHzeyMI1ix7Aacn8UINYB7lGOGOA=="], - "@github/copilot-win32-x64": ["@github/copilot-win32-x64@0.0.411", "", { "os": "win32", "cpu": "x64", "bin": { "copilot-win32-x64": "copilot.exe" } }, "sha512-xmOgi1lGvUBHQJWmq5AK1EP95+Y8xR4TFoK9OCSOaGbQ+LFcX2jF7iavnMolfWwddabew/AMQjsEHlXvbgMG8Q=="], + "@github/copilot-sdk": ["@github/copilot-sdk@1.0.3", "", { "dependencies": { "@github/copilot": "^1.0.64-1", "vscode-jsonrpc": "^8.2.1", "zod": "^4.3.6" } }, "sha512-ujnH2QVw3+xvjgo9cbpY0wik4fNxAmdMDSFnxGScDSvRuK2vUCL2xWW4V2ANc9pWwRHPBpEpMuNJMtmydmLCIQ=="], + + "@github/copilot-win32-arm64": ["@github/copilot-win32-arm64@1.0.64-1", "", { "os": "win32", "cpu": "arm64", "bin": { "copilot-win32-arm64": "copilot.exe" } }, "sha512-vvv+gnemi9WKaxF41zz7Xmq6a493n8Yjps5UFaOY6a3WR222kKXZXfOpeRvIYsDgnIPHGBHIj1TBOmnHQT4V4w=="], + + "@github/copilot-win32-x64": ["@github/copilot-win32-x64@1.0.64-1", "", { "os": "win32", "cpu": "x64", "bin": { "copilot-win32-x64": "copilot.exe" } }, "sha512-mcHvD0fjGDuqr/YXzy8mKuDmah1F+qjPujxoFuGmabmTJZ33cSIJ3nq7RRvxZNIdp8YJ57NkbcW30WvIcOeJ3w=="], "@google/genai": ["@google/genai@1.51.0", "", { "dependencies": { "google-auth-library": "^10.3.0", "p-retry": "^4.6.2", "protobufjs": "^7.5.4", "ws": "^8.18.0" }, "peerDependencies": { "@modelcontextprotocol/sdk": "^1.25.2" }, "optionalPeers": ["@modelcontextprotocol/sdk"] }, "sha512-vTZZF3CSimN7cn2zsLpW2p5WF0eZa5Gz69ITMPCNHpPrDlAstOfGifSfi0p/s9Z9400f7xJRkgvkQNrcM7pJ6w=="], diff --git a/packages/core/package.json b/packages/core/package.json index bbb7db741..85093820c 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -41,7 +41,7 @@ "files": ["dist", "README.md"], "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", - "@github/copilot-sdk": "^0.1.25", + "@github/copilot-sdk": "^1.0.3", "@earendil-works/pi-ai": "^0.74.0", "@openai/codex-sdk": "^0.136.0", "fast-glob": "^3.3.3", diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 213940190..119359250 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -2029,78 +2029,81 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise { + // Execute target after_each hook before workspace after_each/reset. + const targetAfterEachHook = options.targetHooks?.after_each; + if (workspacePath && hasHookCommand(targetAfterEachHook)) { + const scriptContext: ScriptExecutionContext = { + workspacePath, + testId: evalCase.id, + evalRunId: evalRunId ?? '', + caseInput: evalCase.question, + caseMetadata: evalCase.metadata, + evalDir, + workspaceFileDir: evalCase.workspace?.workspaceFileDir, + }; + try { + await executeWorkspaceScript( + toScriptConfig(targetAfterEachHook, 'after_each', `target hook for '${evalCase.id}'`), + scriptContext, + 'warn', + ); + } catch { + // target after_each failures are non-fatal + } } - } - // Reset workspace state before after_each hook (if configured) - if ( - caseHooksEnabled && - workspacePath && - evalCase.workspace?.hooks?.after_each?.reset && - evalCase.workspace.hooks.after_each.reset !== 'none' - ) { - try { - if (repoManager && evalCase.workspace.repos?.length) { - await repoManager.reset( - evalCase.workspace.repos, - workspacePath, - evalCase.workspace.hooks.after_each.reset, - ); - } else { - await resetWorkspaceRoot( - workspacePath, - evalCase.workspace.hooks.after_each.reset, - baselineCommit, - ); + // Reset workspace state before after_each hook (if configured), but only + // after graders have inspected the agent-modified workspace. + if ( + caseHooksEnabled && + workspacePath && + evalCase.workspace?.hooks?.after_each?.reset && + evalCase.workspace.hooks.after_each.reset !== 'none' + ) { + try { + if (repoManager && evalCase.workspace.repos?.length) { + await repoManager.reset( + evalCase.workspace.repos, + workspacePath, + evalCase.workspace.hooks.after_each.reset, + ); + } else { + await resetWorkspaceRoot( + workspacePath, + evalCase.workspace.hooks.after_each.reset, + baselineCommit, + ); + } + } catch { + // Reset failures are non-fatal (like after_each) } - } catch { - // Reset failures are non-fatal (like after_each) } - } - // Execute after_each hook (runs after evaluation, before cleanup) - const caseAfterEachHook = evalCase.workspace?.hooks?.after_each; - if (workspacePath && caseHooksEnabled && hasHookCommand(caseAfterEachHook)) { - const afterEachHook = caseAfterEachHook; - const scriptContext: ScriptExecutionContext = { - workspacePath, - testId: evalCase.id, - evalRunId: evalRunId ?? '', - caseInput: evalCase.question, - caseMetadata: evalCase.metadata, - evalDir, - workspaceFileDir: evalCase.workspace?.workspaceFileDir, - }; - try { - afterEachOutput = await executeWorkspaceScript( - toScriptConfig(afterEachHook, 'after_each', `test '${evalCase.id}'`), - scriptContext, - 'warn', - ); - } catch { - // after_each failures are non-fatal + // Execute after_each hook (runs after grading, before cleanup) + const caseAfterEachHook = evalCase.workspace?.hooks?.after_each; + if (workspacePath && caseHooksEnabled && hasHookCommand(caseAfterEachHook)) { + const afterEachHook = caseAfterEachHook; + const scriptContext: ScriptExecutionContext = { + workspacePath, + testId: evalCase.id, + evalRunId: evalRunId ?? '', + caseInput: evalCase.question, + caseMetadata: evalCase.metadata, + evalDir, + workspaceFileDir: evalCase.workspace?.workspaceFileDir, + }; + try { + afterEachOutput = await executeWorkspaceScript( + toScriptConfig(afterEachHook, 'after_each', `test '${evalCase.id}'`), + scriptContext, + 'warn', + ); + } catch { + // after_each failures are non-fatal + } } - } + }; try { const result = await evaluateCandidate({ @@ -2133,6 +2136,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise {}); const evalRun = { durationMs: Date.now() - caseStartMs }; const errorResult = buildErrorResult( evalCase, diff --git a/packages/core/src/evaluation/providers/codex.ts b/packages/core/src/evaluation/providers/codex.ts index 10935243b..6da74a3c3 100644 --- a/packages/core/src/evaluation/providers/codex.ts +++ b/packages/core/src/evaluation/providers/codex.ts @@ -73,8 +73,12 @@ export class CodexProvider implements Provider { if (this.config.executable) { codexOptions.codexPathOverride = this.config.executable; } - if (this.config.model) { - codexOptions.config = { model: this.config.model }; + if (this.config.apiKey) { + codexOptions.apiKey = this.config.apiKey; + } + const codexConfig = this.buildCodexConfig(); + if (Object.keys(codexConfig).length > 0) { + codexOptions.config = codexConfig; } const codex = new sdk.Codex(codexOptions); @@ -92,6 +96,12 @@ export class CodexProvider implements Provider { if (this.config.modelReasoningEffort) { threadOptions.modelReasoningEffort = this.config.modelReasoningEffort; } + if (this.config.sandboxMode) { + threadOptions.sandboxMode = this.config.sandboxMode; + } + if (this.config.approvalPolicy) { + threadOptions.approvalPolicy = this.config.approvalPolicy; + } const thread = codex.startThread(threadOptions); @@ -172,6 +182,33 @@ export class CodexProvider implements Provider { } } + private buildCodexConfig(): Record { + const config: Record = {}; + if (this.config.model) { + config.model = this.config.model; + } + if (this.config.modelVerbosity) { + config.model_verbosity = this.config.modelVerbosity; + } + if (this.config.baseUrl) { + if (this.config.apiFormat) { + const providerName = 'agentv-openai'; + config.model_provider = providerName; + config.model_providers = { + [providerName]: { + name: 'OpenAI-compatible endpoint', + base_url: this.config.baseUrl, + env_key: 'CODEX_API_KEY', + wire_api: this.config.apiFormat, + }, + }; + } else { + config.openai_base_url = this.config.baseUrl; + } + } + return config; + } + private async runStreamedWithEvents( // biome-ignore lint/suspicious/noExplicitAny: SDK thread type is dynamically loaded thread: any, diff --git a/packages/core/src/evaluation/providers/copilot-sdk.ts b/packages/core/src/evaluation/providers/copilot-sdk.ts index 41bfb60bd..d4b1fa42e 100644 --- a/packages/core/src/evaluation/providers/copilot-sdk.ts +++ b/packages/core/src/evaluation/providers/copilot-sdk.ts @@ -52,6 +52,38 @@ async function loadCopilotSdk(): Promise { return copilotSdkModule; } +// biome-ignore lint/suspicious/noExplicitAny: SDK session type changes across versions +async function abortCopilotSession(session: any): Promise { + try { + if (typeof session?.abort === 'function') { + await session.abort(); + return; + } + await cleanupCopilotSession(session); + } catch { + // Best-effort cancellation; preserve the original provider error/abort path. + } +} + +// biome-ignore lint/suspicious/noExplicitAny: SDK session type changes across versions +async function cleanupCopilotSession(session: any): Promise { + try { + if (typeof session?.disconnect === 'function') { + await session.disconnect(); + return; + } + if (typeof session?.destroy === 'function') { + await session.destroy(); + return; + } + if (typeof session?.[Symbol.asyncDispose] === 'function') { + await session[Symbol.asyncDispose](); + } + } catch { + // Cleanup should not mask the provider result or the primary failure. + } +} + interface ToolCallInProgress { readonly tool: string; readonly input?: unknown; @@ -100,7 +132,7 @@ export class CopilotSdkProvider implements Provider { // Create a fresh session for this invocation // biome-ignore lint/suspicious/noExplicitAny: SDK session type is dynamically loaded const sessionOptions: any = { - onPermissionRequest: () => ({ kind: 'approved' }), + onPermissionRequest: sdk.approveAll ?? (() => ({ kind: 'approved' })), }; if (this.config.model) { @@ -139,6 +171,12 @@ export class CopilotSdkProvider implements Provider { if (customProvider.wireApi) { provider.wireApi = customProvider.wireApi; } + if (customProvider.modelId) { + provider.modelId = customProvider.modelId; + } + if (customProvider.wireModel) { + provider.wireModel = customProvider.wireModel; + } if (providerType === 'azure' && customProvider.apiVersion) { provider.azure = { apiVersion: customProvider.apiVersion }; } @@ -240,7 +278,7 @@ export class CopilotSdkProvider implements Provider { if (request.signal) { // Handle abort signal const abortHandler = () => { - session.destroy().catch(() => {}); + void abortCopilotSession(session); }; request.signal.addEventListener('abort', abortHandler, { once: true }); try { @@ -299,7 +337,7 @@ export class CopilotSdkProvider implements Provider { } finally { unsubscribe(); await logger?.close(); - await session.destroy().catch(() => {}); + await cleanupCopilotSession(session); } } @@ -309,21 +347,40 @@ export class CopilotSdkProvider implements Provider { // biome-ignore lint/suspicious/noExplicitAny: SDK constructor options are dynamic const clientOptions: any = {}; if (this.config.cliUrl) { - clientOptions.cliUrl = this.config.cliUrl; + if (sdk.RuntimeConnection?.forUri) { + clientOptions.connection = sdk.RuntimeConnection.forUri(this.config.cliUrl); + } else { + clientOptions.cliUrl = this.config.cliUrl; + } } - if (this.config.cliPath) { - clientOptions.cliPath = this.config.cliPath; + + if (!clientOptions.connection && (this.config.cliPath || this.config.args?.length)) { + if (sdk.RuntimeConnection?.forStdio) { + clientOptions.connection = sdk.RuntimeConnection.forStdio({ + ...(this.config.cliPath ? { path: this.config.cliPath } : {}), + ...(this.config.args?.length ? { args: this.config.args } : {}), + }); + } else if (this.config.cliPath) { + clientOptions.cliPath = this.config.cliPath; + } } else { // The SDK default getBundledCliPath() resolves to a JS entry that requires // node:sqlite (unavailable in Bun). Auto-resolve the platform-specific native // binary from @github/copilot-{platform}-{arch} when available. const nativePath = resolvePlatformCliPath(); - if (nativePath) { + if (nativePath && sdk.RuntimeConnection?.forStdio && !clientOptions.connection) { + clientOptions.connection = sdk.RuntimeConnection.forStdio({ + path: nativePath, + ...(this.config.args?.length ? { args: this.config.args } : {}), + }); + } else if (nativePath) { clientOptions.cliPath = nativePath; } } // Set the subprocess cwd so --plugin-dir ./relative resolves from the eval workspace. const resolvedCwd = evalCwd ?? process.cwd(); + clientOptions.workingDirectory = resolvedCwd; + // Backward compatibility for older @github/copilot-sdk releases. clientOptions.cwd = resolvedCwd; if (this.config.args && this.config.args.length > 0) { @@ -331,6 +388,8 @@ export class CopilotSdkProvider implements Provider { clientOptions.cliArgs = [...this.config.args]; } if (this.config.githubToken) { + clientOptions.gitHubToken = this.config.githubToken; + // Backward compatibility for older @github/copilot-sdk releases. clientOptions.githubToken = this.config.githubToken; } this.client = new sdk.CopilotClient(clientOptions); diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index 06d03116c..ca8720f23 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -425,6 +425,12 @@ export interface GeminiResolvedConfig { export interface CodexResolvedConfig { readonly model?: string; readonly modelReasoningEffort?: CodexModelReasoningEffort; + readonly modelVerbosity?: CodexModelVerbosity; + readonly baseUrl?: string; + readonly apiKey?: string; + readonly apiFormat?: ApiFormat; + readonly sandboxMode?: CodexSandboxMode; + readonly approvalPolicy?: CodexApprovalPolicy; readonly executable: string; readonly args?: readonly string[]; readonly cwd?: string; @@ -456,6 +462,8 @@ export interface CopilotCustomProviderConfig { readonly bearerToken?: string; readonly apiVersion?: string; readonly wireApi?: string; + readonly modelId?: string; + readonly wireModel?: string; } export interface CopilotSdkResolvedConfig { @@ -610,9 +618,15 @@ const DEPRECATED_TARGET_CAMEL_CASE_FIELDS = new Map([ ['retryBackoffFactor', 'retry_backoff_factor'], ['retryStatusCodes', 'retry_status_codes'], ['modelReasoningEffort', 'model_reasoning_effort'], + ['modelVerbosity', 'model_verbosity'], + ['sandboxMode', 'sandbox_mode'], + ['approvalPolicy', 'approval_policy'], ]); export type CodexModelReasoningEffort = 'minimal' | 'low' | 'medium' | 'high' | 'xhigh'; +export type CodexModelVerbosity = 'low' | 'medium' | 'high'; +export type CodexSandboxMode = 'read-only' | 'workspace-write' | 'danger-full-access'; +export type CodexApprovalPolicy = 'never' | 'on-request' | 'on-failure' | 'untrusted'; const CODEX_MODEL_REASONING_EFFORT_VALUES = new Set([ 'minimal', @@ -622,6 +636,25 @@ const CODEX_MODEL_REASONING_EFFORT_VALUES = new Set([ 'xhigh', ]); +const CODEX_MODEL_VERBOSITY_VALUES = new Set([ + 'low', + 'medium', + 'high', +]); + +const CODEX_SANDBOX_MODE_VALUES = new Set([ + 'read-only', + 'workspace-write', + 'danger-full-access', +]); + +const CODEX_APPROVAL_POLICY_VALUES = new Set([ + 'never', + 'on-request', + 'on-failure', + 'untrusted', +]); + const DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = new Map([ ['timeoutSeconds', 'timeout_seconds'], ]); @@ -1290,6 +1323,12 @@ function resolveCodexConfig( ): CodexResolvedConfig { const modelSource = target.model; const modelReasoningEffortSource = target.model_reasoning_effort; + const modelVerbositySource = target.model_verbosity; + const baseUrlSource = target.base_url ?? target.endpoint; + const apiKeySource = target.api_key; + const apiFormatSource = target.api_format; + const sandboxModeSource = target.sandbox_mode; + const approvalPolicySource = target.approval_policy; const executableSource = target.executable ?? target.command ?? target.binary; const argsSource = target.args ?? target.arguments; const cwdSource = target.cwd; @@ -1319,6 +1358,42 @@ function resolveCodexConfig( }, ), ); + const modelVerbosity = normalizeCodexModelVerbosity( + resolveOptionalString(modelVerbositySource, env, `${target.name} codex model verbosity`, { + allowLiteral: true, + optionalEnv: true, + }), + ); + + const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} codex base URL`, { + allowLiteral: true, + optionalEnv: true, + }); + + const apiKey = resolveOptionalString(apiKeySource, env, `${target.name} codex API key`, { + allowLiteral: false, + optionalEnv: true, + }); + + const apiFormat = resolveApiFormat( + { ...target, api_format: apiFormatSource }, + env, + target.name, + ); + + const sandboxMode = normalizeCodexSandboxMode( + resolveOptionalString(sandboxModeSource, env, `${target.name} codex sandbox mode`, { + allowLiteral: true, + optionalEnv: true, + }), + ); + + const approvalPolicy = normalizeCodexApprovalPolicy( + resolveOptionalString(approvalPolicySource, env, `${target.name} codex approval policy`, { + allowLiteral: true, + optionalEnv: true, + }), + ); const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, { @@ -1347,6 +1422,12 @@ function resolveCodexConfig( return { model, modelReasoningEffort, + modelVerbosity, + baseUrl, + apiKey, + apiFormat, + sandboxMode, + approvalPolicy, executable, args, cwd, @@ -1374,6 +1455,51 @@ function normalizeCodexModelReasoningEffort( ); } +function normalizeCodexModelVerbosity(value: string | undefined): CodexModelVerbosity | undefined { + if (value === undefined) { + return undefined; + } + + const normalized = value.trim().toLowerCase(); + if (CODEX_MODEL_VERBOSITY_VALUES.has(normalized as CodexModelVerbosity)) { + return normalized as CodexModelVerbosity; + } + + throw new Error( + `codex model_verbosity must be one of: ${[...CODEX_MODEL_VERBOSITY_VALUES].join(', ')}`, + ); +} + +function normalizeCodexSandboxMode(value: string | undefined): CodexSandboxMode | undefined { + if (value === undefined) { + return undefined; + } + + const normalized = value.trim().toLowerCase(); + if (CODEX_SANDBOX_MODE_VALUES.has(normalized as CodexSandboxMode)) { + return normalized as CodexSandboxMode; + } + + throw new Error( + `codex sandbox_mode must be one of: ${[...CODEX_SANDBOX_MODE_VALUES].join(', ')}`, + ); +} + +function normalizeCodexApprovalPolicy(value: string | undefined): CodexApprovalPolicy | undefined { + if (value === undefined) { + return undefined; + } + + const normalized = value.trim().toLowerCase(); + if (CODEX_APPROVAL_POLICY_VALUES.has(normalized as CodexApprovalPolicy)) { + return normalized as CodexApprovalPolicy; + } + + throw new Error( + `codex approval_policy must be one of: ${[...CODEX_APPROVAL_POLICY_VALUES].join(', ')}`, + ); +} + /** * Resolve the stream_log config field, falling back to log_format with a * deprecation warning. @@ -1574,6 +1700,19 @@ function resolveCopilotFlatProviderConfig( optionalEnv: true, }, ); + const modelId = resolveOptionalString(target.model_id, env, `${target.name} copilot model ID`, { + allowLiteral: true, + optionalEnv: true, + }); + const wireModel = resolveOptionalString( + target.wire_model, + env, + `${target.name} copilot wire model`, + { + allowLiteral: true, + optionalEnv: true, + }, + ); return { ...(type ? { type } : {}), @@ -1582,6 +1721,8 @@ function resolveCopilotFlatProviderConfig( ...(bearerToken ? { bearerToken } : {}), ...(apiVersion ? { apiVersion } : {}), ...(apiFormat ? { wireApi: apiFormat } : {}), + ...(modelId ? { modelId } : {}), + ...(wireModel ? { wireModel } : {}), }; } diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index 50478041b..d265d5a01 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -386,6 +386,8 @@ export interface TargetDefinition { readonly version?: string | unknown | undefined; readonly api_version?: string | unknown | undefined; readonly api_format?: string | unknown | undefined; + readonly model_id?: string | unknown | undefined; + readonly wire_model?: string | unknown | undefined; // Anthropic fields readonly variant?: string | unknown | undefined; readonly thinking_budget?: number | unknown | undefined; @@ -399,6 +401,9 @@ export interface TargetDefinition { readonly args?: unknown | undefined; readonly arguments?: unknown | undefined; readonly model_reasoning_effort?: string | unknown | undefined; + readonly model_verbosity?: string | unknown | undefined; + readonly sandbox_mode?: string | unknown | undefined; + readonly approval_policy?: string | unknown | undefined; readonly cwd?: string | unknown | undefined; readonly timeout_seconds?: number | unknown | undefined; readonly log_dir?: string | unknown | undefined; diff --git a/packages/core/src/evaluation/validation/targets-validator.ts b/packages/core/src/evaluation/validation/targets-validator.ts index ce5aa1002..4d30a3491 100644 --- a/packages/core/src/evaluation/validation/targets-validator.ts +++ b/packages/core/src/evaluation/validation/targets-validator.ts @@ -99,8 +99,15 @@ const GEMINI_SETTINGS = new Set([ const CODEX_SETTINGS = new Set([ ...COMMON_SETTINGS, + 'endpoint', + 'base_url', + 'api_key', + 'api_format', 'model', 'model_reasoning_effort', + 'model_verbosity', + 'sandbox_mode', + 'approval_policy', 'executable', 'command', 'binary', @@ -134,6 +141,8 @@ const COPILOT_SDK_SETTINGS = new Set([ 'bearer_token', 'api_version', 'api_format', + 'model_id', + 'wire_model', ]); const COPILOT_CLI_SETTINGS = new Set([ @@ -156,6 +165,8 @@ const COPILOT_CLI_SETTINGS = new Set([ 'bearer_token', 'api_version', 'api_format', + 'model_id', + 'wire_model', ]); const VSCODE_SETTINGS = new Set([ diff --git a/packages/core/test/evaluation/providers/codex-sdk.test.ts b/packages/core/test/evaluation/providers/codex-sdk.test.ts index e279cc3cc..b117a7886 100644 --- a/packages/core/test/evaluation/providers/codex-sdk.test.ts +++ b/packages/core/test/evaluation/providers/codex-sdk.test.ts @@ -132,6 +132,64 @@ describe('CodexProvider (SDK)', () => { expect(constructorArgs.config.model).toBe('o4-mini'); }); + it('passes OpenAI-compatible endpoint config and execution controls to Codex SDK', async () => { + const thread = createMockThread({ + events: [ + { + type: 'item.completed', + item: { id: 'msg-1', type: 'agent_message', text: 'response' }, + }, + { + type: 'turn.completed', + usage: { input_tokens: 10, output_tokens: 5, cached_input_tokens: 0 }, + }, + ], + }); + const codexInstance = createMockCodex(thread); + + const CodexMock = mock(function Codex() { + return codexInstance; + }); + mock.module('@openai/codex-sdk', () => ({ Codex: CodexMock })); + + const { CodexProvider } = await import('../../../src/evaluation/providers/codex.js'); + + const provider = new CodexProvider('test-target', { + executable: 'codex', + model: 'gpt-5.3-codex-spark', + modelReasoningEffort: 'medium', + modelVerbosity: 'medium', + baseUrl: 'http://127.0.0.1:10531/v1', + apiKey: 'dummy', + apiFormat: 'responses', + sandboxMode: 'danger-full-access', + approvalPolicy: 'never', + }); + + await provider.invoke({ question: 'Test' }); + + const constructorArgs = CodexMock.mock.calls[0][0]; + expect(constructorArgs.apiKey).toBe('dummy'); + expect(constructorArgs.config).toEqual({ + model: 'gpt-5.3-codex-spark', + model_verbosity: 'medium', + model_provider: 'agentv-openai', + model_providers: { + 'agentv-openai': { + name: 'OpenAI-compatible endpoint', + base_url: 'http://127.0.0.1:10531/v1', + env_key: 'CODEX_API_KEY', + wire_api: 'responses', + }, + }, + }); + + const threadOptions = codexInstance.startThread.mock.calls[0][0]; + expect(threadOptions.modelReasoningEffort).toBe('medium'); + expect(threadOptions.sandboxMode).toBe('danger-full-access'); + expect(threadOptions.approvalPolicy).toBe('never'); + }); + it('passes executable config to Codex constructor as codexPathOverride', async () => { const thread = createMockThread({ events: [ diff --git a/packages/core/test/evaluation/providers/copilot-sdk.test.ts b/packages/core/test/evaluation/providers/copilot-sdk.test.ts index 96ab0db74..c730df94f 100644 --- a/packages/core/test/evaluation/providers/copilot-sdk.test.ts +++ b/packages/core/test/evaluation/providers/copilot-sdk.test.ts @@ -18,7 +18,9 @@ type EventHandler = (event: any) => void; interface MockSession { on: ReturnType; sendAndWait: ReturnType; - destroy: ReturnType; + disconnect?: ReturnType; + destroy?: ReturnType; + abort?: ReturnType; } interface MockClient { @@ -30,6 +32,7 @@ interface MockClient { function createMockSession(options?: { events?: Array<{ type: string; data?: unknown }>; sendError?: Error; + legacyDestroyOnly?: boolean; }): MockSession { let eventHandler: EventHandler | null = null; @@ -51,9 +54,15 @@ function createMockSession(options?: { throw options.sendError; } }), - destroy: mock(async () => {}), + abort: mock(async () => {}), }; + if (options?.legacyDestroyOnly) { + session.destroy = mock(async () => {}); + } else { + session.disconnect = mock(async () => {}); + } + return session; } @@ -112,7 +121,7 @@ describe('CopilotSdkProvider', () => { const content = extractLastAssistantContent(response.output); expect(content).toBe('Hello from Copilot SDK'); expect(session.sendAndWait).toHaveBeenCalledTimes(1); - expect(session.destroy).toHaveBeenCalledTimes(1); + expect(session.disconnect).toHaveBeenCalledTimes(1); }); it('passes model config to createSession', async () => { @@ -209,6 +218,7 @@ describe('CopilotSdkProvider', () => { const constructorArgs = CopilotClientMock.mock.calls[0][0]; // cwd is set so the subprocess resolves relative paths itself — args are NOT pre-resolved expect(constructorArgs.cwd).toBe(path.resolve(fixturesRoot)); + expect(constructorArgs.workingDirectory).toBe(path.resolve(fixturesRoot)); expect(constructorArgs.cliArgs).toEqual([ '--plugin-dir', './plugins', @@ -333,11 +343,29 @@ describe('CopilotSdkProvider', () => { await provider.invoke({ question: 'First' }); await provider.invoke({ question: 'Second' }); - // Session should be destroyed after each invocation - expect(session.destroy).toHaveBeenCalledTimes(2); + // Session should be disconnected after each invocation + expect(session.disconnect).toHaveBeenCalledTimes(2); expect(client.createSession).toHaveBeenCalledTimes(2); }); + it('falls back to destroy for older SDK sessions', async () => { + const session = createMockSession({ + events: [{ type: 'assistant.message', data: { content: 'response' } }], + legacyDestroyOnly: true, + }); + const client = createMockClient(session); + const sdkMock = mockCopilotSdk(client); + + mock.module('@github/copilot-sdk', () => sdkMock); + const { CopilotSdkProvider } = await import('../../../src/evaluation/providers/copilot-sdk.js'); + + const provider = new CopilotSdkProvider('test-target', {}); + + await provider.invoke({ question: 'Test' }); + + expect(session.destroy).toHaveBeenCalledTimes(1); + }); + it('extracts token usage from assistant.usage events', async () => { const session = createMockSession({ events: [ @@ -447,6 +475,42 @@ describe('CopilotSdkProvider', () => { expect(sessionOptions.provider.azure).toEqual({ apiVersion: '2024-10-21' }); }); + it('passes custom provider model identity overrides to createSession', async () => { + const session = createMockSession({ + events: [{ type: 'assistant.message', data: { content: 'response' } }], + }); + const client = createMockClient(session); + const sdkMock = mockCopilotSdk(client); + + mock.module('@github/copilot-sdk', () => sdkMock); + const { CopilotSdkProvider } = await import('../../../src/evaluation/providers/copilot-sdk.js'); + + const provider = new CopilotSdkProvider('test-target', { + model: 'gpt-5', + customProvider: { + type: 'openai', + baseUrl: 'http://127.0.0.1:10531/v1', + apiKey: 'dummy', + wireApi: 'responses', + modelId: 'gpt-5', + wireModel: 'gpt-5.3-codex-spark', + }, + }); + + await provider.invoke({ question: 'Test' }); + + const sessionOptions = client.createSession.mock.calls[0][0]; + expect(sessionOptions.model).toBe('gpt-5'); + expect(sessionOptions.provider).toMatchObject({ + type: 'openai', + baseUrl: 'http://127.0.0.1:10531/v1', + apiKey: 'dummy', + wireApi: 'responses', + modelId: 'gpt-5', + wireModel: 'gpt-5.3-codex-spark', + }); + }); + it('normalizes bare azure resource name to full URL', async () => { const session = createMockSession({ events: [{ type: 'assistant.message', data: { content: 'response' } }], diff --git a/packages/core/test/evaluation/providers/targets.test.ts b/packages/core/test/evaluation/providers/targets.test.ts index 88c0d5af4..e2acc1cb2 100644 --- a/packages/core/test/evaluation/providers/targets.test.ts +++ b/packages/core/test/evaluation/providers/targets.test.ts @@ -649,6 +649,44 @@ describe('resolveTargetDefinition', () => { expect(target.config.modelReasoningEffort).toBe('low'); }); + it('resolves codex OpenAI-compatible endpoint settings', () => { + const target = resolveTargetDefinition( + { + name: 'codex-local-openai', + provider: 'codex', + model: '${{ CODEX_MODEL }}', + model_reasoning_effort: 'medium', + model_verbosity: 'medium', + base_url: '${{ OPENAI_BASE_URL }}', + api_key: '${{ OPENAI_API_KEY }}', + api_format: 'responses', + sandbox_mode: 'danger-full-access', + approval_policy: 'never', + }, + { + CODEX_MODEL: 'gpt-5.3-codex-spark', + OPENAI_BASE_URL: 'http://127.0.0.1:10531/v1', + OPENAI_API_KEY: 'dummy', + }, + ); + + expect(target.kind).toBe('codex'); + if (target.kind !== 'codex') { + throw new Error('expected codex target'); + } + + expect(target.config).toMatchObject({ + model: 'gpt-5.3-codex-spark', + modelReasoningEffort: 'medium', + modelVerbosity: 'medium', + baseUrl: 'http://127.0.0.1:10531/v1', + apiKey: 'dummy', + apiFormat: 'responses', + sandboxMode: 'danger-full-access', + approvalPolicy: 'never', + }); + }); + it('rejects unsupported codex model_reasoning_effort values', () => { expect(() => resolveTargetDefinition( @@ -852,6 +890,43 @@ describe('resolveTargetDefinition', () => { }); }); + it('resolves copilot-sdk provider model identity overrides', () => { + const env = { + OPENAI_ENDPOINT: 'https://api.openai.example/v1', + OPENAI_API_KEY: 'openai-secret', + WIRE_MODEL: 'gpt-5.3-codex-spark', + } satisfies Record; + + const target = resolveTargetDefinition( + { + name: 'copilot-sdk-openai-wire-model', + provider: 'copilot-sdk', + model: 'gpt-5', + subprovider: 'openai', + base_url: '${{ OPENAI_ENDPOINT }}', + api_key: '${{ OPENAI_API_KEY }}', + api_format: 'responses', + model_id: 'gpt-5', + wire_model: '${{ WIRE_MODEL }}', + }, + env, + ); + + expect(target.kind).toBe('copilot-sdk'); + if (target.kind !== 'copilot-sdk') { + throw new Error('expected copilot-sdk target'); + } + + expect(target.config.customProvider).toEqual({ + type: 'openai', + baseUrl: 'https://api.openai.example/v1', + apiKey: 'openai-secret', + wireApi: 'responses', + modelId: 'gpt-5', + wireModel: 'gpt-5.3-codex-spark', + }); + }); + it('resolves copilot-sdk args field', () => { const target = resolveTargetDefinition( { diff --git a/packages/core/test/evaluation/validation/targets-validator.test.ts b/packages/core/test/evaluation/validation/targets-validator.test.ts index fc96e146d..22866b9ff 100644 --- a/packages/core/test/evaluation/validation/targets-validator.test.ts +++ b/packages/core/test/evaluation/validation/targets-validator.test.ts @@ -129,10 +129,13 @@ describe('validateTargetsFile', () => { `targets: - name: copilot-sdk-custom-provider provider: copilot-sdk + model: gpt-5 subprovider: openai base_url: \${{ OPENAI_ENDPOINT }} api_key: \${{ OPENAI_API_KEY }} api_format: responses + model_id: gpt-5 + wire_model: \${{ OPENAI_MODEL }} - name: copilot-cli-custom-provider provider: copilot-cli subprovider: openai @@ -148,6 +151,30 @@ describe('validateTargetsFile', () => { expect(result.errors.filter((error) => error.severity === 'warning')).toEqual([]); }); + it('accepts OpenAI-compatible endpoint fields on codex targets', async () => { + const filePath = path.join(tempDir, 'codex-openai-provider.yaml'); + await writeFile( + filePath, + `targets: + - name: codex-local-openai + provider: codex + model: \${{ CODEX_MODEL }} + model_reasoning_effort: medium + model_verbosity: medium + base_url: \${{ OPENAI_ENDPOINT }} + api_key: \${{ OPENAI_API_KEY }} + api_format: responses + sandbox_mode: danger-full-access + approval_policy: never +`, + ); + + const result = await validateTargetsFile(filePath); + + expect(result.valid).toBe(true); + expect(result.errors.filter((error) => error.severity === 'warning')).toEqual([]); + }); + it('warns on removed copilot custom_provider and byok fields', async () => { const filePath = path.join(tempDir, 'copilot-removed-provider-fields.yaml'); await writeFile( From 5bd477baaf37e0f04e1f63ea2706ae1a6173ff00 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 22 Jun 2026 13:27:47 +0200 Subject: [PATCH 2/2] style(providers): format target configuration Entire-Checkpoint: 966e5798cc56 --- packages/core/src/evaluation/providers/targets.ts | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index ca8720f23..9996b01fe 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -636,11 +636,7 @@ const CODEX_MODEL_REASONING_EFFORT_VALUES = new Set([ 'xhigh', ]); -const CODEX_MODEL_VERBOSITY_VALUES = new Set([ - 'low', - 'medium', - 'high', -]); +const CODEX_MODEL_VERBOSITY_VALUES = new Set(['low', 'medium', 'high']); const CODEX_SANDBOX_MODE_VALUES = new Set([ 'read-only', @@ -1375,11 +1371,7 @@ function resolveCodexConfig( optionalEnv: true, }); - const apiFormat = resolveApiFormat( - { ...target, api_format: apiFormatSource }, - env, - target.name, - ); + const apiFormat = resolveApiFormat({ ...target, api_format: apiFormatSource }, env, target.name); const sandboxMode = normalizeCodexSandboxMode( resolveOptionalString(sandboxModeSource, env, `${target.name} codex sandbox mode`, {