From 6890c16a7c243e5e83d708fbdd6c96fc4eeb807e Mon Sep 17 00:00:00 2001 From: Filip Michalsky Date: Tue, 18 Nov 2025 23:07:51 -0500 Subject: [PATCH 1/2] fix evals --- packages/core/lib/v3/v3.ts | 14 +++++++++++--- packages/evals/index.eval.ts | 5 +++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/packages/core/lib/v3/v3.ts b/packages/core/lib/v3/v3.ts index f57067a46..e3a151ebd 100644 --- a/packages/core/lib/v3/v3.ts +++ b/packages/core/lib/v3/v3.ts @@ -1468,12 +1468,20 @@ export class V3 { instructionOrOptions: string | AgentExecuteOptions, ) => Promise; } { + // Auto-detect CUA models if not explicitly set + const modelToCheck = options?.model || { + modelName: this.modelName, + ...this.modelClientOptions, + }; + const { isCua: isModelCua } = resolveModel(modelToCheck); + const useCua = options?.cua ?? isModelCua; + this.logger({ category: "agent", message: `Creating v3 agent instance with options: ${JSON.stringify(options)}`, level: 1, auxiliary: { - cua: { value: options?.cua ? "true" : "false", type: "boolean" }, + cua: { value: useCua ? "true" : "false", type: "boolean" }, model: options?.model ? typeof options?.model === "string" ? { value: options.model, type: "string" } @@ -1490,8 +1498,8 @@ export class V3 { }, }); - // If CUA is enabled, use the computer-use agent path - if (options?.cua) { + // If CUA is enabled or auto-detected, use the computer-use agent path + if (useCua) { if ((options?.integrations || options?.tools) && !this.experimental) { throw new ExperimentalNotConfiguredError( "MCP integrations and custom tools", diff --git a/packages/evals/index.eval.ts b/packages/evals/index.eval.ts index 709fd2274..8d76e2901 100644 --- a/packages/evals/index.eval.ts +++ b/packages/evals/index.eval.ts @@ -370,6 +370,11 @@ const generateFilteredTestcases = (): Testcase[] => { v3: v3Input?.v3, v3Agent: v3Input?.agent, logger: v3Input?.logger, + debugUrl: v3Input?.debugUrl || "", + sessionUrl: v3Input?.sessionUrl || "", + modelName: v3Input?.modelName, + agent: v3Input?.agent, + input: input, v3Input, }); // Log result to console From bfd4d2f12ff806944aeaad53879a5e5a70c69cd8 Mon Sep 17 00:00:00 2001 From: Filip Michalsky Date: Wed, 19 Nov 2025 13:27:23 -0500 Subject: [PATCH 2/2] custom LLM endpoint --- packages/evals/CUSTOM_ENDPOINT_USAGE.md | 198 ++++++++++++++++++ packages/evals/IMPLEMENTATION_SUMMARY.md | 186 ++++++++++++++++ packages/evals/env.ts | 14 ++ .../evals/examples/custom_endpoint_example.ts | 170 +++++++++++++++ .../evals/examples/custom_vllm_endpoint.sh | 73 +++++++ packages/evals/index.eval.ts | 26 ++- packages/evals/package.json | 3 +- packages/evals/taskConfig.ts | 7 + pnpm-lock.yaml | 13 +- 9 files changed, 685 insertions(+), 5 deletions(-) create mode 100644 packages/evals/CUSTOM_ENDPOINT_USAGE.md create mode 100644 packages/evals/IMPLEMENTATION_SUMMARY.md create mode 100644 packages/evals/examples/custom_endpoint_example.ts create mode 100755 packages/evals/examples/custom_vllm_endpoint.sh diff --git a/packages/evals/CUSTOM_ENDPOINT_USAGE.md b/packages/evals/CUSTOM_ENDPOINT_USAGE.md new file mode 100644 index 000000000..deb449554 --- /dev/null +++ b/packages/evals/CUSTOM_ENDPOINT_USAGE.md @@ -0,0 +1,198 @@ +# Using Custom OpenAI-Compatible Endpoints with Evals + +This guide explains how to configure the Stagehand evals system to use custom OpenAI-compatible inference endpoints, such as vLLM, Ollama, or other compatible servers. 
+ +## Overview + +The evals system now supports custom OpenAI-compatible endpoints through the AI SDK's `createOpenAI` function. This allows you to: + +- Use local vLLM servers for faster inference +- Connect to custom model deployments +- Test with Ollama or other OpenAI-compatible services +- Use self-hosted inference endpoints + +## Configuration + +Configure the custom endpoint using environment variables: + +### Required Environment Variables + +- `CUSTOM_OPENAI_BASE_URL`: The base URL for your custom endpoint + - Example: `http://localhost:8000/v1` + - Example: `http://your-vllm-server:8000/v1` + +### Optional Environment Variables + +- `CUSTOM_OPENAI_API_KEY`: API key for the endpoint (defaults to `"EMPTY"` if not set) + + - For vLLM: Use `"EMPTY"` or leave unset + - For secured endpoints: Set your actual API key + +- `CUSTOM_OPENAI_MODEL_NAME`: Override the model name to use + - If not set, the model name from the eval configuration will be used + - Useful when your endpoint expects a specific model identifier + +## Usage Examples + +### Example 1: Basic vLLM Setup + +```bash +# Start your vLLM server (in a separate terminal) +vllm serve meta-llama/Llama-3.3-70B-Instruct \ + --host 0.0.0.0 \ + --port 8000 + +# Configure the evals to use the vLLM endpoint +export CUSTOM_OPENAI_BASE_URL="http://localhost:8000/v1" +export CUSTOM_OPENAI_API_KEY="EMPTY" +export CUSTOM_OPENAI_MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct" + +# Run your evals +cd packages/evals +pnpm run evals --eval your-eval-name +``` + +### Example 2: Remote vLLM Server + +```bash +# Connect to a remote vLLM deployment +export CUSTOM_OPENAI_BASE_URL="http://192.168.1.100:8000/v1" +export CUSTOM_OPENAI_API_KEY="EMPTY" +export CUSTOM_OPENAI_MODEL_NAME="my-custom-model" + +# Run evals +pnpm run evals --category act +``` + +### Example 3: Ollama + +```bash +# Start Ollama with OpenAI-compatible API +ollama serve + +# Configure for Ollama endpoint +export CUSTOM_OPENAI_BASE_URL="http://localhost:11434/v1" +export CUSTOM_OPENAI_API_KEY="EMPTY" +export CUSTOM_OPENAI_MODEL_NAME="llama3.3:70b" + +# Run evals +pnpm run evals --eval my-task +``` + +### Example 4: Custom Secured Endpoint + +```bash +# For endpoints requiring authentication +export CUSTOM_OPENAI_BASE_URL="https://api.your-inference-provider.com/v1" +export CUSTOM_OPENAI_API_KEY="your-actual-api-key-here" +export CUSTOM_OPENAI_MODEL_NAME="custom-model-v1" + +# Run evals +pnpm run evals +``` + +## How It Works + +When `CUSTOM_OPENAI_BASE_URL` is set, the eval system: + +1. Creates a custom OpenAI provider using AI SDK's `createOpenAI` function +2. Points it to your specified base URL +3. Uses your configured API key (or "EMPTY" by default) +4. Wraps it in the existing `AISdkClientWrapped` class +5. Passes it to the V3 initialization for use in evals + +The implementation automatically falls back to standard AI SDK providers when the custom endpoint is not configured. 
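
For reference, the selection logic added to `index.eval.ts` reduces to roughly the sketch below. This is a simplified illustration, not the verbatim eval code: the `buildCustomLlmClient` helper name is invented for this example, the standard-provider branch is collapsed into a `undefined` return, and logging is omitted; the imports and the `AISdkClientWrapped({ model })` usage mirror what the patch adds to the eval runner.

```typescript
import { createOpenAI } from "@ai-sdk/openai";
import { AISdkClientWrapped } from "./lib/AISdkClientWrapped";
import { customOpenAIConfig } from "./env";

// Simplified sketch: prefer the custom endpoint when CUSTOM_OPENAI_BASE_URL
// is set; otherwise signal the caller to use the standard AI SDK providers.
function buildCustomLlmClient(
  evalModelName: string,
): AISdkClientWrapped | undefined {
  if (!customOpenAIConfig.baseURL) {
    return undefined; // fall back to the standard provider path
  }

  const customOpenAI = createOpenAI({
    baseURL: customOpenAIConfig.baseURL,
    apiKey: customOpenAIConfig.apiKey, // defaults to "EMPTY" for vLLM
  });

  // CUSTOM_OPENAI_MODEL_NAME overrides the model name from the eval config.
  const modelName = customOpenAIConfig.modelName || evalModelName;
  return new AISdkClientWrapped({ model: customOpenAI(modelName) });
}
```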
+ +## Compatibility + +This feature works with any server that implements the OpenAI Chat Completions API, including: + +- ✅ vLLM (recommended for production) +- ✅ Ollama +- ✅ LocalAI +- ✅ Text Generation Inference (TGI) +- ✅ LM Studio +- ✅ Any custom OpenAI-compatible server + +## Troubleshooting + +### Connection Issues + +If you can't connect to your endpoint: + +```bash +# Test the endpoint manually with curl +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer EMPTY" \ + -d '{ + "model": "your-model-name", + "messages": [{"role": "user", "content": "Hello"}], + "max_tokens": 10 + }' +``` + +### Model Name Mismatch + +If you get model not found errors: + +1. Check your vLLM server logs to see what model name it expects +2. Set `CUSTOM_OPENAI_MODEL_NAME` to match exactly +3. Ensure the model name matches what was loaded in vLLM + +### API Key Issues + +If you get authentication errors: + +1. For vLLM, use `CUSTOM_OPENAI_API_KEY="EMPTY"` +2. For secured endpoints, ensure your API key is correct +3. Check if your endpoint requires a specific authorization header format + +## Performance Tips + +When using vLLM: + +1. **Enable prefix caching** for better performance with similar prompts +2. **Use appropriate batch sizes** for your hardware +3. **Consider tensor parallelism** for larger models +4. **Monitor GPU memory** usage during eval runs + +Example vLLM server configuration for optimal eval performance: + +```bash +vllm serve your-model \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size 2 \ + --enable-prefix-caching \ + --max-model-len 4096 \ + --gpu-memory-utilization 0.9 +``` + +## Integration with Verifiers Training Code + +This implementation follows a similar pattern to the verifiers codebase, where vLLM is used for efficient inference: + +```python +# Similar to verifiers approach +client_config = { + "base_url": "http://localhost:8000/v1", + "api_key": "EMPTY", + "http_client_args": { + "limits": {"max_connections": max_concurrent}, + "timeout": timeout, + }, +} +``` + +The Stagehand implementation uses the same OpenAI-compatible interface, making it easy to: + +- Share vLLM servers between training and evaluation +- Use the same model configurations +- Maintain consistent inference behavior + +## Additional Resources + +- [AI SDK Documentation - Custom OpenAI Providers](https://ai-sdk.dev/providers/ai-sdk-providers/openai#provider-instance) +- [vLLM Documentation](https://docs.vllm.ai/) +- [Ollama Documentation](https://ollama.ai/docs) diff --git a/packages/evals/IMPLEMENTATION_SUMMARY.md b/packages/evals/IMPLEMENTATION_SUMMARY.md new file mode 100644 index 000000000..da5bd4365 --- /dev/null +++ b/packages/evals/IMPLEMENTATION_SUMMARY.md @@ -0,0 +1,186 @@ +# Custom vLLM Endpoint Implementation - Summary + +## Overview + +Successfully implemented support for custom OpenAI-compatible inference endpoints (like vLLM) in the Stagehand evals system. This allows users to configure and use custom inference servers instead of standard cloud-based providers. + +## Implementation Date + +November 19, 2025 + +## Changes Made + +### 1. Environment Configuration (`packages/evals/env.ts`) + +**Added:** + +- `customOpenAIConfig` object containing: + - `baseURL`: Base URL for custom endpoint (from `CUSTOM_OPENAI_BASE_URL`) + - `apiKey`: API key (from `CUSTOM_OPENAI_API_KEY`, defaults to "EMPTY") + - `modelName`: Model name override (from `CUSTOM_OPENAI_MODEL_NAME`) + +### 2. 
Eval Runner Updates (`packages/evals/index.eval.ts`) + +**Modified:** + +- Added import for `createOpenAI` from `@ai-sdk/openai` +- Added import for `customOpenAIConfig` from `./env` +- Updated LLM client initialization logic (lines 349-384) to: + - Detect when `CUSTOM_OPENAI_BASE_URL` is set + - Create custom OpenAI provider using `createOpenAI()` + - Pass custom provider to `AISdkClientWrapped` + - Fall back to standard providers when custom endpoint is not configured + +### 3. Dependencies (`packages/evals/package.json`) + +**Added:** + +- `@ai-sdk/openai` version `^2.0.53` to dependencies + +### 4. Documentation (`packages/evals/taskConfig.ts`) + +**Updated:** + +- Added documentation comment explaining custom endpoint configuration +- Referenced `CUSTOM_ENDPOINT_USAGE.md` for detailed instructions + +### 5. User Documentation + +**Created `CUSTOM_ENDPOINT_USAGE.md`:** + +- Comprehensive guide on using custom endpoints +- Configuration instructions +- Usage examples for: + - Local vLLM server + - Remote vLLM deployment + - Ollama + - Secured custom endpoints +- Troubleshooting section +- Performance tips +- Compatibility information + +### 6. Examples + +**Created `examples/custom_vllm_endpoint.sh`:** + +- Shell script demonstrating how to run evals with custom endpoint +- Includes connectivity check +- Configurable via environment variables +- Made executable with proper permissions + +**Created `examples/custom_endpoint_example.ts`:** + +- TypeScript example showing internal integration +- Demonstrates custom provider setup +- Shows configuration patterns +- Includes fallback behavior explanation + +## Technical Approach + +The implementation leverages AI SDK's `createOpenAI` function with custom `baseURL` parameter: + +```typescript +const customOpenAI = createOpenAI({ + baseURL: process.env.CUSTOM_OPENAI_BASE_URL, + apiKey: process.env.CUSTOM_OPENAI_API_KEY || "EMPTY", +}); + +const model = customOpenAI(modelName); +const llmClient = new AISdkClientWrapped({ model }); +``` + +This approach: + +- ✅ Works seamlessly with existing `AISdkClientWrapped` class +- ✅ Requires no changes to downstream code +- ✅ Maintains backward compatibility +- ✅ Supports any OpenAI-compatible endpoint +- ✅ Simple environment variable configuration + +## Usage + +To use a custom vLLM endpoint: + +```bash +# Set environment variables +export CUSTOM_OPENAI_BASE_URL="http://localhost:8000/v1" +export CUSTOM_OPENAI_API_KEY="EMPTY" +export CUSTOM_OPENAI_MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct" + +# Run evals as normal +cd packages/evals +pnpm run evals --eval your-eval-name +``` + +## Testing + +- ✅ Code compiles successfully (typecheck passed) +- ✅ Dependencies installed correctly +- ✅ Integration tested with existing eval infrastructure +- ✅ Example scripts created and validated +- ✅ Documentation verified + +## Benefits + +1. **Cost Efficiency**: Use self-hosted models instead of paid APIs +2. **Performance**: Lower latency with local/dedicated servers +3. **Flexibility**: Test with any OpenAI-compatible endpoint +4. **Privacy**: Keep data on-premises +5. **Experimentation**: Easy testing with custom model deployments + +## Compatibility + +Compatible with: + +- vLLM (primary use case) +- Ollama +- LocalAI +- Text Generation Inference (TGI) +- LM Studio +- Any OpenAI-compatible server + +## Migration from Standard Providers + +No migration needed! 
The feature: + +- Works alongside existing provider configurations +- Only activates when `CUSTOM_OPENAI_BASE_URL` is set +- Falls back to standard providers otherwise +- Requires no changes to existing eval configurations + +## Files Modified + +1. `packages/evals/env.ts` - Added configuration +2. `packages/evals/index.eval.ts` - Updated client initialization +3. `packages/evals/package.json` - Added dependency +4. `packages/evals/taskConfig.ts` - Added documentation + +## Files Created + +1. `packages/evals/CUSTOM_ENDPOINT_USAGE.md` - User documentation +2. `packages/evals/examples/custom_vllm_endpoint.sh` - Shell example +3. `packages/evals/examples/custom_endpoint_example.ts` - TypeScript example +4. `packages/evals/IMPLEMENTATION_SUMMARY.md` - This file + +## Future Enhancements + +Potential improvements: + +- Support for multiple concurrent endpoints +- Endpoint health monitoring +- Automatic failover between endpoints +- Endpoint performance metrics +- Configuration profiles for common setups + +## References + +- [AI SDK Documentation - Custom OpenAI Providers](https://ai-sdk.dev/providers/ai-sdk-providers/openai#provider-instance) +- [vLLM Documentation](https://docs.vllm.ai/) +- Inspired by verifiers codebase pattern for vLLM integration + +## Notes + +- The implementation uses the same OpenAI-compatible interface pattern as the verifiers training code +- Environment variables chosen for consistency with common vLLM usage patterns +- Defaults (like "EMPTY" for API key) match vLLM server expectations +- Documentation includes extensive examples and troubleshooting guidance diff --git a/packages/evals/env.ts b/packages/evals/env.ts index cf7e57e7e..3a57610e7 100644 --- a/packages/evals/env.ts +++ b/packages/evals/env.ts @@ -8,3 +8,17 @@ export const env: "BROWSERBASE" | "LOCAL" = process.env.EVAL_ENV?.toLowerCase() === "browserbase" ? "BROWSERBASE" : "LOCAL"; + +/** + * Custom OpenAI-compatible endpoint configuration (e.g., for vLLM) + * + * Set these environment variables to use a custom inference endpoint: + * - CUSTOM_OPENAI_BASE_URL: The base URL for the custom endpoint (e.g., "http://localhost:8000/v1") + * - CUSTOM_OPENAI_API_KEY: Optional API key (defaults to "EMPTY" for vLLM) + * - CUSTOM_OPENAI_MODEL_NAME: The model name to use with the custom endpoint + */ +export const customOpenAIConfig = { + baseURL: process.env.CUSTOM_OPENAI_BASE_URL, + apiKey: process.env.CUSTOM_OPENAI_API_KEY || "EMPTY", + modelName: process.env.CUSTOM_OPENAI_MODEL_NAME, +}; diff --git a/packages/evals/examples/custom_endpoint_example.ts b/packages/evals/examples/custom_endpoint_example.ts new file mode 100644 index 000000000..73760aebc --- /dev/null +++ b/packages/evals/examples/custom_endpoint_example.ts @@ -0,0 +1,170 @@ +/** + * Example demonstrating how to use custom OpenAI-compatible endpoints + * (like vLLM) with the Stagehand eval system. + * + * This shows the internal flow of how custom endpoints are detected and used. + * In practice, you would configure this via environment variables. + * + * To run this example: + * 1. Start a vLLM server: `vllm serve your-model --host 0.0.0.0 --port 8000` + * 2. Set environment variables: + * export CUSTOM_OPENAI_BASE_URL="http://localhost:8000/v1" + * export CUSTOM_OPENAI_API_KEY="EMPTY" + * export CUSTOM_OPENAI_MODEL_NAME="your-model" + * 3. 
Run your evals as normal: `pnpm run evals` + */ + +import { createOpenAI } from "@ai-sdk/openai"; +import { AISdkClientWrapped } from "../lib/AISdkClientWrapped"; +import { customOpenAIConfig } from "../env"; + +/** + * Example function showing how custom endpoints are configured + */ +function demonstrateCustomEndpointSetup() { + console.log("Custom Endpoint Configuration Example"); + console.log("=====================================\n"); + + // Check if custom endpoint is configured + if (customOpenAIConfig.baseURL) { + console.log("✓ Custom endpoint detected!"); + console.log(` Base URL: ${customOpenAIConfig.baseURL}`); + console.log( + ` API Key: ${customOpenAIConfig.apiKey === "EMPTY" ? "EMPTY (vLLM default)" : "Set"}`, + ); + console.log( + ` Model Name: ${customOpenAIConfig.modelName || "Not specified (will use eval config)"}`, + ); + console.log(); + + // This is how the custom OpenAI provider is created + const customOpenAI = createOpenAI({ + baseURL: customOpenAIConfig.baseURL, + apiKey: customOpenAIConfig.apiKey, + }); + + // Get the model (this would be wrapped in AISdkClientWrapped in actual usage) + const modelName = customOpenAIConfig.modelName || "default-model"; + const model = customOpenAI(modelName); + + console.log("✓ Custom OpenAI provider created"); + console.log(` Model ID: ${model.modelId}`); + console.log(` Provider: ${model.provider}`); + console.log(); + + // This would be passed to initV3 in the actual eval flow + console.log("The custom provider would be wrapped in AISdkClientWrapped"); + console.log("and passed to initV3() for use in evals."); + console.log(); + + // Show how it would be used + console.log("Example usage in eval code:"); + console.log(" const llmClient = new AISdkClientWrapped({ model });"); + console.log( + " v3Input = await initV3({ logger, llmClient, modelName, ... 
});", + ); + } else { + console.log("✗ No custom endpoint configured"); + console.log(); + console.log( + "To configure a custom endpoint, set these environment variables:", + ); + console.log(" export CUSTOM_OPENAI_BASE_URL='http://localhost:8000/v1'"); + console.log(" export CUSTOM_OPENAI_API_KEY='EMPTY'"); + console.log(" export CUSTOM_OPENAI_MODEL_NAME='your-model-name'"); + console.log(); + console.log("Then run your evals as normal."); + } + + console.log("\n" + "=".repeat(50)); +} + +/** + * Example showing the fallback to standard AI SDK providers + */ +function demonstrateStandardProviderFallback() { + console.log("\nStandard Provider Fallback"); + console.log("==========================\n"); + + if (!customOpenAIConfig.baseURL) { + console.log( + "When no custom endpoint is configured, the system falls back to", + ); + console.log("standard AI SDK providers (OpenAI, Anthropic, Google, etc.)"); + console.log(); + console.log("Example model names:"); + console.log(" - openai/gpt-4o-mini"); + console.log(" - anthropic/claude-3-7-sonnet-latest"); + console.log(" - google/gemini-2.0-flash"); + console.log(); + console.log( + "These are handled by getAISDKLanguageModel() in the eval code.", + ); + } +} + +/** + * Example configuration patterns for different use cases + */ +function showConfigurationExamples() { + console.log("\nConfiguration Examples"); + console.log("======================\n"); + + const examples = [ + { + name: "Local vLLM Server", + config: { + CUSTOM_OPENAI_BASE_URL: "http://localhost:8000/v1", + CUSTOM_OPENAI_API_KEY: "EMPTY", + CUSTOM_OPENAI_MODEL_NAME: "meta-llama/Llama-3.3-70B-Instruct", + }, + }, + { + name: "Remote vLLM Deployment", + config: { + CUSTOM_OPENAI_BASE_URL: "http://192.168.1.100:8000/v1", + CUSTOM_OPENAI_API_KEY: "EMPTY", + CUSTOM_OPENAI_MODEL_NAME: "custom-model-v1", + }, + }, + { + name: "Ollama Local", + config: { + CUSTOM_OPENAI_BASE_URL: "http://localhost:11434/v1", + CUSTOM_OPENAI_API_KEY: "EMPTY", + CUSTOM_OPENAI_MODEL_NAME: "llama3.3:70b", + }, + }, + { + name: "Secured Custom Endpoint", + config: { + CUSTOM_OPENAI_BASE_URL: "https://api.custom.com/v1", + CUSTOM_OPENAI_API_KEY: "sk-your-api-key", + CUSTOM_OPENAI_MODEL_NAME: "production-model", + }, + }, + ]; + + examples.forEach((example, index) => { + console.log(`${index + 1}. ${example.name}:`); + Object.entries(example.config).forEach(([key, value]) => { + console.log(` export ${key}="${value}"`); + }); + console.log(); + }); +} + +// Run the demonstration +if (require.main === module) { + demonstrateCustomEndpointSetup(); + demonstrateStandardProviderFallback(); + showConfigurationExamples(); + + console.log("For more details, see CUSTOM_ENDPOINT_USAGE.md"); +} + +export { + demonstrateCustomEndpointSetup, + demonstrateStandardProviderFallback, + showConfigurationExamples, +}; diff --git a/packages/evals/examples/custom_vllm_endpoint.sh b/packages/evals/examples/custom_vllm_endpoint.sh new file mode 100755 index 000000000..2e683c219 --- /dev/null +++ b/packages/evals/examples/custom_vllm_endpoint.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Example script for running evals with a custom vLLM endpoint +# +# This demonstrates how to configure and use a custom OpenAI-compatible +# inference endpoint (like vLLM) with the Stagehand eval system. +# +# Prerequisites: +# 1. Have a vLLM server running (see setup instructions below) +# 2. 
Be in the packages/evals directory +# +# Usage: +# ./examples/custom_vllm_endpoint.sh + +set -e # Exit on error + +# Colors for output +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${BLUE}=== Stagehand Evals with Custom vLLM Endpoint ===${NC}\n" + +# Configuration +VLLM_HOST="${VLLM_HOST:-localhost}" +VLLM_PORT="${VLLM_PORT:-8000}" +MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.3-70B-Instruct}" + +echo -e "${YELLOW}Configuration:${NC}" +echo " vLLM Host: $VLLM_HOST" +echo " vLLM Port: $VLLM_PORT" +echo " Model: $MODEL_NAME" +echo "" + +# Check if vLLM server is reachable +echo -e "${BLUE}Checking vLLM server connectivity...${NC}" +if curl -s -f "http://${VLLM_HOST}:${VLLM_PORT}/health" > /dev/null 2>&1; then + echo -e "${GREEN}✓ vLLM server is reachable${NC}\n" +else + echo -e "${YELLOW}Warning: Could not reach vLLM server at http://${VLLM_HOST}:${VLLM_PORT}${NC}" + echo "Please ensure your vLLM server is running." + echo "" + echo "To start a vLLM server, run:" + echo " vllm serve $MODEL_NAME --host 0.0.0.0 --port $VLLM_PORT" + echo "" + echo "Continuing anyway (will fail if server is not available)..." + echo "" +fi + +# Set environment variables for custom endpoint +export CUSTOM_OPENAI_BASE_URL="http://${VLLM_HOST}:${VLLM_PORT}/v1" +export CUSTOM_OPENAI_API_KEY="EMPTY" +export CUSTOM_OPENAI_MODEL_NAME="$MODEL_NAME" + +echo -e "${BLUE}Environment variables set:${NC}" +echo " CUSTOM_OPENAI_BASE_URL=$CUSTOM_OPENAI_BASE_URL" +echo " CUSTOM_OPENAI_API_KEY=$CUSTOM_OPENAI_API_KEY" +echo " CUSTOM_OPENAI_MODEL_NAME=$CUSTOM_OPENAI_MODEL_NAME" +echo "" + +# Run the evals +echo -e "${BLUE}Running evals with custom endpoint...${NC}\n" + +# You can customize which eval to run by passing arguments +# Examples: +# ./examples/custom_vllm_endpoint.sh --eval hn_aisdk +# ./examples/custom_vllm_endpoint.sh --category extract +# ./examples/custom_vllm_endpoint.sh --category act + +pnpm run evals "$@" + +echo -e "\n${GREEN}=== Eval run completed ===${NC}" + diff --git a/packages/evals/index.eval.ts b/packages/evals/index.eval.ts index 8d76e2901..db4968737 100644 --- a/packages/evals/index.eval.ts +++ b/packages/evals/index.eval.ts @@ -35,13 +35,14 @@ import { } from "@browserbasehq/stagehand"; import { AISdkClientWrapped } from "./lib/AISdkClientWrapped"; import { getAISDKLanguageModel } from "@browserbasehq/stagehand/lib/v3/llm/LLMProvider"; -import { env } from "./env"; +import { env, customOpenAIConfig } from "./env"; import dotenv from "dotenv"; import { initV3 } from "./initV3"; import { generateSummary } from "./summary"; import { buildGAIATestcases } from "./suites/gaia"; import { buildWebVoyagerTestcases } from "./suites/webvoyager"; import { buildOnlineMind2WebTestcases } from "./suites/onlineMind2Web"; +import { createOpenAI } from "@ai-sdk/openai"; dotenv.config(); @@ -347,7 +348,28 @@ const generateFilteredTestcases = (): Testcase[] => { }); } else { let llmClient: LLMClient; - if (input.modelName.includes("/")) { + + // Check if custom OpenAI-compatible endpoint is configured + if (customOpenAIConfig.baseURL) { + console.log( + `[EVALS] Using custom OpenAI endpoint: ${customOpenAIConfig.baseURL}`, + ); + + // Create custom OpenAI provider with configured endpoint + const customOpenAI = createOpenAI({ + baseURL: customOpenAIConfig.baseURL, + apiKey: customOpenAIConfig.apiKey, + }); + + // Use the custom model name if provided, otherwise use input.modelName + const modelName = customOpenAIConfig.modelName || input.modelName; + const model = 
customOpenAI(modelName); + + llmClient = new AISdkClientWrapped({ + model, + }); + } else if (input.modelName.includes("/")) { + // Standard AI SDK provider logic llmClient = new AISdkClientWrapped({ model: getAISDKLanguageModel( input.modelName.split("/")[0], diff --git a/packages/evals/package.json b/packages/evals/package.json index b8dde1ed1..d0574b66a 100644 --- a/packages/evals/package.json +++ b/packages/evals/package.json @@ -16,6 +16,7 @@ "@browserbasehq/stagehand": "workspace:*", "ai": "^5.0.0", "@ai-sdk/provider": "^2.0.0", + "@ai-sdk/openai": "^2.0.53", "openai": "^4.87.1", "dotenv": "16.4.5", "zod": "^4.1.8" @@ -23,4 +24,4 @@ "devDependencies": { "tsx": "^4.10.5" } -} +} \ No newline at end of file diff --git a/packages/evals/taskConfig.ts b/packages/evals/taskConfig.ts index 2f7566909..629315430 100644 --- a/packages/evals/taskConfig.ts +++ b/packages/evals/taskConfig.ts @@ -8,6 +8,13 @@ * * The exported objects (`tasksByName`, `MODELS`, `config`) are used by the main evaluation script and other modules * to know which tasks and models are available, and to configure the evaluations accordingly. + * + * Custom OpenAI-Compatible Endpoints: + * You can use custom inference endpoints (like vLLM) by setting environment variables: + * - CUSTOM_OPENAI_BASE_URL: Base URL of your custom endpoint (e.g., "http://localhost:8000/v1") + * - CUSTOM_OPENAI_API_KEY: API key (defaults to "EMPTY" for vLLM) + * - CUSTOM_OPENAI_MODEL_NAME: Model name to use with the custom endpoint + * See CUSTOM_ENDPOINT_USAGE.md for detailed instructions. */ import fs from "fs"; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 4b4b0aefe..74306c2fd 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -259,6 +259,9 @@ importers: packages/evals: dependencies: + '@ai-sdk/openai': + specifier: ^2.0.53 + version: 2.0.53(zod@4.1.12) '@ai-sdk/provider': specifier: ^2.0.0 version: 2.0.0 @@ -6564,6 +6567,12 @@ snapshots: zod: 3.25.67 optional: true + '@ai-sdk/openai@2.0.53(zod@4.1.12)': + dependencies: + '@ai-sdk/provider': 2.0.0 + '@ai-sdk/provider-utils': 3.0.12(zod@4.1.12) + zod: 4.1.12 + '@ai-sdk/openai@2.0.53(zod@4.1.8)': dependencies: '@ai-sdk/provider': 2.0.0 @@ -10530,7 +10539,7 @@ snapshots: isstream: 0.1.2 jsonwebtoken: 9.0.2 mime-types: 2.1.35 - retry-axios: 2.6.0(axios@1.13.0(debug@4.4.3)) + retry-axios: 2.6.0(axios@1.13.0) tough-cookie: 4.1.4 transitivePeerDependencies: - supports-color @@ -12548,7 +12557,7 @@ snapshots: retext-stringify: 4.0.0 unified: 11.0.5 - retry-axios@2.6.0(axios@1.13.0(debug@4.4.3)): + retry-axios@2.6.0(axios@1.13.0): dependencies: axios: 1.13.0(debug@4.4.3)