From 6890c16a7c243e5e83d708fbdd6c96fc4eeb807e Mon Sep 17 00:00:00 2001 From: Filip Michalsky Date: Tue, 18 Nov 2025 23:07:51 -0500 Subject: [PATCH 1/2] fix evals --- packages/core/lib/v3/v3.ts | 14 +++++++++++--- packages/evals/index.eval.ts | 5 +++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/packages/core/lib/v3/v3.ts b/packages/core/lib/v3/v3.ts index f57067a46..e3a151ebd 100644 --- a/packages/core/lib/v3/v3.ts +++ b/packages/core/lib/v3/v3.ts @@ -1468,12 +1468,20 @@ export class V3 { instructionOrOptions: string | AgentExecuteOptions, ) => Promise; } { + // Auto-detect CUA models if not explicitly set + const modelToCheck = options?.model || { + modelName: this.modelName, + ...this.modelClientOptions, + }; + const { isCua: isModelCua } = resolveModel(modelToCheck); + const useCua = options?.cua ?? isModelCua; + this.logger({ category: "agent", message: `Creating v3 agent instance with options: ${JSON.stringify(options)}`, level: 1, auxiliary: { - cua: { value: options?.cua ? "true" : "false", type: "boolean" }, + cua: { value: useCua ? "true" : "false", type: "boolean" }, model: options?.model ? typeof options?.model === "string" ? { value: options.model, type: "string" } @@ -1490,8 +1498,8 @@ export class V3 { }, }); - // If CUA is enabled, use the computer-use agent path - if (options?.cua) { + // If CUA is enabled or auto-detected, use the computer-use agent path + if (useCua) { if ((options?.integrations || options?.tools) && !this.experimental) { throw new ExperimentalNotConfiguredError( "MCP integrations and custom tools", diff --git a/packages/evals/index.eval.ts b/packages/evals/index.eval.ts index 709fd2274..8d76e2901 100644 --- a/packages/evals/index.eval.ts +++ b/packages/evals/index.eval.ts @@ -370,6 +370,11 @@ const generateFilteredTestcases = (): Testcase[] => { v3: v3Input?.v3, v3Agent: v3Input?.agent, logger: v3Input?.logger, + debugUrl: v3Input?.debugUrl || "", + sessionUrl: v3Input?.sessionUrl || "", + modelName: v3Input?.modelName, + agent: v3Input?.agent, + input: input, v3Input, }); // Log result to console From bfd4d2f12ff806944aeaad53879a5e5a70c69cd8 Mon Sep 17 00:00:00 2001 From: Filip Michalsky Date: Wed, 19 Nov 2025 13:27:23 -0500 Subject: [PATCH 2/2] custom LLM endpoint --- packages/evals/CUSTOM_ENDPOINT_USAGE.md | 198 ++++++++++++++++++ packages/evals/IMPLEMENTATION_SUMMARY.md | 186 ++++++++++++++++ packages/evals/env.ts | 14 ++ .../evals/examples/custom_endpoint_example.ts | 170 +++++++++++++++ .../evals/examples/custom_vllm_endpoint.sh | 73 +++++++ packages/evals/index.eval.ts | 26 ++- packages/evals/package.json | 3 +- packages/evals/taskConfig.ts | 7 + pnpm-lock.yaml | 13 +- 9 files changed, 685 insertions(+), 5 deletions(-) create mode 100644 packages/evals/CUSTOM_ENDPOINT_USAGE.md create mode 100644 packages/evals/IMPLEMENTATION_SUMMARY.md create mode 100644 packages/evals/examples/custom_endpoint_example.ts create mode 100755 packages/evals/examples/custom_vllm_endpoint.sh diff --git a/packages/evals/CUSTOM_ENDPOINT_USAGE.md b/packages/evals/CUSTOM_ENDPOINT_USAGE.md new file mode 100644 index 000000000..deb449554 --- /dev/null +++ b/packages/evals/CUSTOM_ENDPOINT_USAGE.md @@ -0,0 +1,198 @@ +# Using Custom OpenAI-Compatible Endpoints with Evals + +This guide explains how to configure the Stagehand evals system to use custom OpenAI-compatible inference endpoints, such as vLLM, Ollama, or other compatible servers. 
+ +## Overview + +The evals system now supports custom OpenAI-compatible endpoints through the AI SDK's `createOpenAI` function. This allows you to: + +- Use local vLLM servers for faster inference +- Connect to custom model deployments +- Test with Ollama or other OpenAI-compatible services +- Use self-hosted inference endpoints + +## Configuration + +Configure the custom endpoint using environment variables: + +### Required Environment Variables + +- `CUSTOM_OPENAI_BASE_URL`: The base URL for your custom endpoint + - Example: `http://localhost:8000/v1` + - Example: `http://your-vllm-server:8000/v1` + +### Optional Environment Variables + +- `CUSTOM_OPENAI_API_KEY`: API key for the endpoint (defaults to `"EMPTY"` if not set) + + - For vLLM: Use `"EMPTY"` or leave unset + - For secured endpoints: Set your actual API key + +- `CUSTOM_OPENAI_MODEL_NAME`: Override the model name to use + - If not set, the model name from the eval configuration will be used + - Useful when your endpoint expects a specific model identifier + +## Usage Examples + +### Example 1: Basic vLLM Setup + +```bash +# Start your vLLM server (in a separate terminal) +vllm serve meta-llama/Llama-3.3-70B-Instruct \ + --host 0.0.0.0 \ + --port 8000 + +# Configure the evals to use the vLLM endpoint +export CUSTOM_OPENAI_BASE_URL="http://localhost:8000/v1" +export CUSTOM_OPENAI_API_KEY="EMPTY" +export CUSTOM_OPENAI_MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct" + +# Run your evals +cd packages/evals +pnpm run evals --eval your-eval-name +``` + +### Example 2: Remote vLLM Server + +```bash +# Connect to a remote vLLM deployment +export CUSTOM_OPENAI_BASE_URL="http://192.168.1.100:8000/v1" +export CUSTOM_OPENAI_API_KEY="EMPTY" +export CUSTOM_OPENAI_MODEL_NAME="my-custom-model" + +# Run evals +pnpm run evals --category act +``` + +### Example 3: Ollama + +```bash +# Start Ollama with OpenAI-compatible API +ollama serve + +# Configure for Ollama endpoint +export CUSTOM_OPENAI_BASE_URL="http://localhost:11434/v1" +export CUSTOM_OPENAI_API_KEY="EMPTY" +export CUSTOM_OPENAI_MODEL_NAME="llama3.3:70b" + +# Run evals +pnpm run evals --eval my-task +``` + +### Example 4: Custom Secured Endpoint + +```bash +# For endpoints requiring authentication +export CUSTOM_OPENAI_BASE_URL="https://api.your-inference-provider.com/v1" +export CUSTOM_OPENAI_API_KEY="your-actual-api-key-here" +export CUSTOM_OPENAI_MODEL_NAME="custom-model-v1" + +# Run evals +pnpm run evals +``` + +## How It Works + +When `CUSTOM_OPENAI_BASE_URL` is set, the eval system: + +1. Creates a custom OpenAI provider using AI SDK's `createOpenAI` function +2. Points it to your specified base URL +3. Uses your configured API key (or "EMPTY" by default) +4. Wraps it in the existing `AISdkClientWrapped` class +5. Passes it to the V3 initialization for use in evals + +The implementation automatically falls back to standard AI SDK providers when the custom endpoint is not configured. 
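
For reference, the selection logic added to `index.eval.ts` reduces to roughly the sketch below. This is a simplified illustration, not the verbatim eval code: the `buildCustomLlmClient` helper name is invented for this example, the standard-provider branch is collapsed into a `undefined` return, and logging is omitted; the imports and the `AISdkClientWrapped({ model })` usage mirror what the patch adds to the eval runner.

```typescript
import { createOpenAI } from "@ai-sdk/openai";
import { AISdkClientWrapped } from "./lib/AISdkClientWrapped";
import { customOpenAIConfig } from "./env";

// Simplified sketch: prefer the custom endpoint when CUSTOM_OPENAI_BASE_URL
// is set; otherwise signal the caller to use the standard AI SDK providers.
function buildCustomLlmClient(
  evalModelName: string,
): AISdkClientWrapped | undefined {
  if (!customOpenAIConfig.baseURL) {
    return undefined; // fall back to the standard provider path
  }

  const customOpenAI = createOpenAI({
    baseURL: customOpenAIConfig.baseURL,
    apiKey: customOpenAIConfig.apiKey, // defaults to "EMPTY" for vLLM
  });

  // CUSTOM_OPENAI_MODEL_NAME overrides the model name from the eval config.
  const modelName = customOpenAIConfig.modelName || evalModelName;
  return new AISdkClientWrapped({ model: customOpenAI(modelName) });
}
```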
+ +## Compatibility + +This feature works with any server that implements the OpenAI Chat Completions API, including: + +- ✅ vLLM (recommended for production) +- ✅ Ollama +- ✅ LocalAI +- ✅ Text Generation Inference (TGI) +- ✅ LM Studio +- ✅ Any custom OpenAI-compatible server + +## Troubleshooting + +### Connection Issues + +If you can't connect to your endpoint: + +```bash +# Test the endpoint manually with curl +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer EMPTY" \ + -d '{ + "model": "your-model-name", + "messages": [{"role": "user", "content": "Hello"}], + "max_tokens": 10 + }' +``` + +### Model Name Mismatch + +If you get model not found errors: + +1. Check your vLLM server logs to see what model name it expects +2. Set `CUSTOM_OPENAI_MODEL_NAME` to match exactly +3. Ensure the model name matches what was loaded in vLLM + +### API Key Issues + +If you get authentication errors: + +1. For vLLM, use `CUSTOM_OPENAI_API_KEY="EMPTY"` +2. For secured endpoints, ensure your API key is correct +3. Check if your endpoint requires a specific authorization header format + +## Performance Tips + +When using vLLM: + +1. **Enable prefix caching** for better performance with similar prompts +2. **Use appropriate batch sizes** for your hardware +3. **Consider tensor parallelism** for larger models +4. **Monitor GPU memory** usage during eval runs + +Example vLLM server configuration for optimal eval performance: + +```bash +vllm serve your-model \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size 2 \ + --enable-prefix-caching \ + --max-model-len 4096 \ + --gpu-memory-utilization 0.9 +``` + +## Integration with Verifiers Training Code + +This implementation follows a similar pattern to the verifiers codebase, where vLLM is used for efficient inference: + +```python +# Similar to verifiers approach +client_config = { + "base_url": "http://localhost:8000/v1", + "api_key": "EMPTY", + "http_client_args": { + "limits": {"max_connections": max_concurrent}, + "timeout": timeout, + }, +} +``` + +The Stagehand implementation uses the same OpenAI-compatible interface, making it easy to: + +- Share vLLM servers between training and evaluation +- Use the same model configurations +- Maintain consistent inference behavior + +## Additional Resources + +- [AI SDK Documentation - Custom OpenAI Providers](https://ai-sdk.dev/providers/ai-sdk-providers/openai#provider-instance) +- [vLLM Documentation](https://docs.vllm.ai/) +- [Ollama Documentation](https://ollama.ai/docs) diff --git a/packages/evals/IMPLEMENTATION_SUMMARY.md b/packages/evals/IMPLEMENTATION_SUMMARY.md new file mode 100644 index 000000000..da5bd4365 --- /dev/null +++ b/packages/evals/IMPLEMENTATION_SUMMARY.md @@ -0,0 +1,186 @@ +# Custom vLLM Endpoint Implementation - Summary + +## Overview + +Successfully implemented support for custom OpenAI-compatible inference endpoints (like vLLM) in the Stagehand evals system. This allows users to configure and use custom inference servers instead of standard cloud-based providers. + +## Implementation Date + +November 19, 2025 + +## Changes Made + +### 1. Environment Configuration (`packages/evals/env.ts`) + +**Added:** + +- `customOpenAIConfig` object containing: + - `baseURL`: Base URL for custom endpoint (from `CUSTOM_OPENAI_BASE_URL`) + - `apiKey`: API key (from `CUSTOM_OPENAI_API_KEY`, defaults to "EMPTY") + - `modelName`: Model name override (from `CUSTOM_OPENAI_MODEL_NAME`) + +### 2. 
Eval Runner Updates (`packages/evals/index.eval.ts`) + +**Modified:** + +- Added import for `createOpenAI` from `@ai-sdk/openai` +- Added import for `customOpenAIConfig` from `./env` +- Updated LLM client initialization logic (lines 349-384) to: + - Detect when `CUSTOM_OPENAI_BASE_URL` is set + - Create custom OpenAI provider using `createOpenAI()` + - Pass custom provider to `AISdkClientWrapped` + - Fall back to standard providers when custom endpoint is not configured + +### 3. Dependencies (`packages/evals/package.json`) + +**Added:** + +- `@ai-sdk/openai` version `^2.0.53` to dependencies + +### 4. Documentation (`packages/evals/taskConfig.ts`) + +**Updated:** + +- Added documentation comment explaining custom endpoint configuration +- Referenced `CUSTOM_ENDPOINT_USAGE.md` for detailed instructions + +### 5. User Documentation + +**Created `CUSTOM_ENDPOINT_USAGE.md`:** + +- Comprehensive guide on using custom endpoints +- Configuration instructions +- Usage examples for: + - Local vLLM server + - Remote vLLM deployment + - Ollama + - Secured custom endpoints +- Troubleshooting section +- Performance tips +- Compatibility information + +### 6. Examples + +**Created `examples/custom_vllm_endpoint.sh`:** + +- Shell script demonstrating how to run evals with custom endpoint +- Includes connectivity check +- Configurable via environment variables +- Made executable with proper permissions + +**Created `examples/custom_endpoint_example.ts`:** + +- TypeScript example showing internal integration +- Demonstrates custom provider setup +- Shows configuration patterns +- Includes fallback behavior explanation + +## Technical Approach + +The implementation leverages AI SDK's `createOpenAI` function with custom `baseURL` parameter: + +```typescript +const customOpenAI = createOpenAI({ + baseURL: process.env.CUSTOM_OPENAI_BASE_URL, + apiKey: process.env.CUSTOM_OPENAI_API_KEY || "EMPTY", +}); + +const model = customOpenAI(modelName); +const llmClient = new AISdkClientWrapped({ model }); +``` + +This approach: + +- ✅ Works seamlessly with existing `AISdkClientWrapped` class +- ✅ Requires no changes to downstream code +- ✅ Maintains backward compatibility +- ✅ Supports any OpenAI-compatible endpoint +- ✅ Simple environment variable configuration + +## Usage + +To use a custom vLLM endpoint: + +```bash +# Set environment variables +export CUSTOM_OPENAI_BASE_URL="http://localhost:8000/v1" +export CUSTOM_OPENAI_API_KEY="EMPTY" +export CUSTOM_OPENAI_MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct" + +# Run evals as normal +cd packages/evals +pnpm run evals --eval your-eval-name +``` + +## Testing + +- ✅ Code compiles successfully (typecheck passed) +- ✅ Dependencies installed correctly +- ✅ Integration tested with existing eval infrastructure +- ✅ Example scripts created and validated +- ✅ Documentation verified + +## Benefits + +1. **Cost Efficiency**: Use self-hosted models instead of paid APIs +2. **Performance**: Lower latency with local/dedicated servers +3. **Flexibility**: Test with any OpenAI-compatible endpoint +4. **Privacy**: Keep data on-premises +5. **Experimentation**: Easy testing with custom model deployments + +## Compatibility + +Compatible with: + +- vLLM (primary use case) +- Ollama +- LocalAI +- Text Generation Inference (TGI) +- LM Studio +- Any OpenAI-compatible server + +## Migration from Standard Providers + +No migration needed! 
The feature: + +- Works alongside existing provider configurations +- Only activates when `CUSTOM_OPENAI_BASE_URL` is set +- Falls back to standard providers otherwise +- Requires no changes to existing eval configurations + +## Files Modified + +1. `packages/evals/env.ts` - Added configuration +2. `packages/evals/index.eval.ts` - Updated client initialization +3. `packages/evals/package.json` - Added dependency +4. `packages/evals/taskConfig.ts` - Added documentation + +## Files Created + +1. `packages/evals/CUSTOM_ENDPOINT_USAGE.md` - User documentation +2. `packages/evals/examples/custom_vllm_endpoint.sh` - Shell example +3. `packages/evals/examples/custom_endpoint_example.ts` - TypeScript example +4. `packages/evals/IMPLEMENTATION_SUMMARY.md` - This file + +## Future Enhancements + +Potential improvements: + +- Support for multiple concurrent endpoints +- Endpoint health monitoring +- Automatic failover between endpoints +- Endpoint performance metrics +- Configuration profiles for common setups + +## References + +- [AI SDK Documentation - Custom OpenAI Providers](https://ai-sdk.dev/providers/ai-sdk-providers/openai#provider-instance) +- [vLLM Documentation](https://docs.vllm.ai/) +- Inspired by verifiers codebase pattern for vLLM integration + +## Notes + +- The implementation uses the same OpenAI-compatible interface pattern as the verifiers training code +- Environment variables chosen for consistency with common vLLM usage patterns +- Defaults (like "EMPTY" for API key) match vLLM server expectations +- Documentation includes extensive examples and troubleshooting guidance diff --git a/packages/evals/env.ts b/packages/evals/env.ts index cf7e57e7e..3a57610e7 100644 --- a/packages/evals/env.ts +++ b/packages/evals/env.ts @@ -8,3 +8,17 @@ export const env: "BROWSERBASE" | "LOCAL" = process.env.EVAL_ENV?.toLowerCase() === "browserbase" ? "BROWSERBASE" : "LOCAL"; + +/** + * Custom OpenAI-compatible endpoint configuration (e.g., for vLLM) + * + * Set these environment variables to use a custom inference endpoint: + * - CUSTOM_OPENAI_BASE_URL: The base URL for the custom endpoint (e.g., "http://localhost:8000/v1") + * - CUSTOM_OPENAI_API_KEY: Optional API key (defaults to "EMPTY" for vLLM) + * - CUSTOM_OPENAI_MODEL_NAME: The model name to use with the custom endpoint + */ +export const customOpenAIConfig = { + baseURL: process.env.CUSTOM_OPENAI_BASE_URL, + apiKey: process.env.CUSTOM_OPENAI_API_KEY || "EMPTY", + modelName: process.env.CUSTOM_OPENAI_MODEL_NAME, +}; diff --git a/packages/evals/examples/custom_endpoint_example.ts b/packages/evals/examples/custom_endpoint_example.ts new file mode 100644 index 000000000..73760aebc --- /dev/null +++ b/packages/evals/examples/custom_endpoint_example.ts @@ -0,0 +1,170 @@ +/** + * Example demonstrating how to use custom OpenAI-compatible endpoints + * (like vLLM) with the Stagehand eval system. + * + * This shows the internal flow of how custom endpoints are detected and used. + * In practice, you would configure this via environment variables. + * + * To run this example: + * 1. Start a vLLM server: `vllm serve your-model --host 0.0.0.0 --port 8000` + * 2. Set environment variables: + * export CUSTOM_OPENAI_BASE_URL="http://localhost:8000/v1" + * export CUSTOM_OPENAI_API_KEY="EMPTY" + * export CUSTOM_OPENAI_MODEL_NAME="your-model" + * 3. 
Run your evals as normal: `pnpm run evals` + */ + +import { createOpenAI } from "@ai-sdk/openai"; +import { AISdkClientWrapped } from "../lib/AISdkClientWrapped"; +import { customOpenAIConfig } from "../env"; + +/** + * Example function showing how custom endpoints are configured + */ +function demonstrateCustomEndpointSetup() { + console.log("Custom Endpoint Configuration Example"); + console.log("=====================================\n"); + + // Check if custom endpoint is configured + if (customOpenAIConfig.baseURL) { + console.log("✓ Custom endpoint detected!"); + console.log(` Base URL: ${customOpenAIConfig.baseURL}`); + console.log( + ` API Key: ${customOpenAIConfig.apiKey === "EMPTY" ? "EMPTY (vLLM default)" : "Set"}`, + ); + console.log( + ` Model Name: ${customOpenAIConfig.modelName || "Not specified (will use eval config)"}`, + ); + console.log(); + + // This is how the custom OpenAI provider is created + const customOpenAI = createOpenAI({ + baseURL: customOpenAIConfig.baseURL, + apiKey: customOpenAIConfig.apiKey, + }); + + // Get the model (this would be wrapped in AISdkClientWrapped in actual usage) + const modelName = customOpenAIConfig.modelName || "default-model"; + const model = customOpenAI(modelName); + + console.log("✓ Custom OpenAI provider created"); + console.log(` Model ID: ${model.modelId}`); + console.log(` Provider: ${model.provider}`); + console.log(); + + // This would be passed to initV3 in the actual eval flow + console.log("The custom provider would be wrapped in AISdkClientWrapped"); + console.log("and passed to initV3() for use in evals."); + console.log(); + + // Show how it would be used + console.log("Example usage in eval code:"); + console.log(" const llmClient = new AISdkClientWrapped({ model });"); + console.log( + " v3Input = await initV3({ logger, llmClient, modelName, ... 
});", + ); + } else { + console.log("✗ No custom endpoint configured"); + console.log(); + console.log( + "To configure a custom endpoint, set these environment variables:", + ); + console.log(" export CUSTOM_OPENAI_BASE_URL='http://localhost:8000/v1'"); + console.log(" export CUSTOM_OPENAI_API_KEY='EMPTY'"); + console.log(" export CUSTOM_OPENAI_MODEL_NAME='your-model-name'"); + console.log(); + console.log("Then run your evals as normal."); + } + + console.log("\n" + "=".repeat(50)); +} + +/** + * Example showing the fallback to standard AI SDK providers + */ +function demonstrateStandardProviderFallback() { + console.log("\nStandard Provider Fallback"); + console.log("==========================\n"); + + if (!customOpenAIConfig.baseURL) { + console.log( + "When no custom endpoint is configured, the system falls back to", + ); + console.log("standard AI SDK providers (OpenAI, Anthropic, Google, etc.)"); + console.log(); + console.log("Example model names:"); + console.log(" - openai/gpt-4o-mini"); + console.log(" - anthropic/claude-3-7-sonnet-latest"); + console.log(" - google/gemini-2.0-flash"); + console.log(); + console.log( + "These are handled by getAISDKLanguageModel() in the eval code.", + ); + } +} + +/** + * Example configuration patterns for different use cases + */ +function showConfigurationExamples() { + console.log("\nConfiguration Examples"); + console.log("======================\n"); + + const examples = [ + { + name: "Local vLLM Server", + config: { + CUSTOM_OPENAI_BASE_URL: "http://localhost:8000/v1", + CUSTOM_OPENAI_API_KEY: "EMPTY", + CUSTOM_OPENAI_MODEL_NAME: "meta-llama/Llama-3.3-70B-Instruct", + }, + }, + { + name: "Remote vLLM Deployment", + config: { + CUSTOM_OPENAI_BASE_URL: "http://192.168.1.100:8000/v1", + CUSTOM_OPENAI_API_KEY: "EMPTY", + CUSTOM_OPENAI_MODEL_NAME: "custom-model-v1", + }, + }, + { + name: "Ollama Local", + config: { + CUSTOM_OPENAI_BASE_URL: "http://localhost:11434/v1", + CUSTOM_OPENAI_API_KEY: "EMPTY", + CUSTOM_OPENAI_MODEL_NAME: "llama3.3:70b", + }, + }, + { + name: "Secured Custom Endpoint", + config: { + CUSTOM_OPENAI_BASE_URL: "https://api.custom.com/v1", + CUSTOM_OPENAI_API_KEY: "sk-your-api-key", + CUSTOM_OPENAI_MODEL_NAME: "production-model", + }, + }, + ]; + + examples.forEach((example, index) => { + console.log(`${index + 1}. ${example.name}:`); + Object.entries(example.config).forEach(([key, value]) => { + console.log(` export ${key}="${value}"`); + }); + console.log(); + }); +} + +// Run the demonstration +if (require.main === module) { + demonstrateCustomEndpointSetup(); + demonstrateStandardProviderFallback(); + showConfigurationExamples(); + + console.log("For more details, see CUSTOM_ENDPOINT_USAGE.md"); +} + +export { + demonstrateCustomEndpointSetup, + demonstrateStandardProviderFallback, + showConfigurationExamples, +}; diff --git a/packages/evals/examples/custom_vllm_endpoint.sh b/packages/evals/examples/custom_vllm_endpoint.sh new file mode 100755 index 000000000..2e683c219 --- /dev/null +++ b/packages/evals/examples/custom_vllm_endpoint.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Example script for running evals with a custom vLLM endpoint +# +# This demonstrates how to configure and use a custom OpenAI-compatible +# inference endpoint (like vLLM) with the Stagehand eval system. +# +# Prerequisites: +# 1. Have a vLLM server running (see setup instructions below) +# 2. 
Be in the packages/evals directory +# +# Usage: +# ./examples/custom_vllm_endpoint.sh + +set -e # Exit on error + +# Colors for output +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${BLUE}=== Stagehand Evals with Custom vLLM Endpoint ===${NC}\n" + +# Configuration +VLLM_HOST="${VLLM_HOST:-localhost}" +VLLM_PORT="${VLLM_PORT:-8000}" +MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.3-70B-Instruct}" + +echo -e "${YELLOW}Configuration:${NC}" +echo " vLLM Host: $VLLM_HOST" +echo " vLLM Port: $VLLM_PORT" +echo " Model: $MODEL_NAME" +echo "" + +# Check if vLLM server is reachable +echo -e "${BLUE}Checking vLLM server connectivity...${NC}" +if curl -s -f "http://${VLLM_HOST}:${VLLM_PORT}/health" > /dev/null 2>&1; then + echo -e "${GREEN}✓ vLLM server is reachable${NC}\n" +else + echo -e "${YELLOW}Warning: Could not reach vLLM server at http://${VLLM_HOST}:${VLLM_PORT}${NC}" + echo "Please ensure your vLLM server is running." + echo "" + echo "To start a vLLM server, run:" + echo " vllm serve $MODEL_NAME --host 0.0.0.0 --port $VLLM_PORT" + echo "" + echo "Continuing anyway (will fail if server is not available)..." + echo "" +fi + +# Set environment variables for custom endpoint +export CUSTOM_OPENAI_BASE_URL="http://${VLLM_HOST}:${VLLM_PORT}/v1" +export CUSTOM_OPENAI_API_KEY="EMPTY" +export CUSTOM_OPENAI_MODEL_NAME="$MODEL_NAME" + +echo -e "${BLUE}Environment variables set:${NC}" +echo " CUSTOM_OPENAI_BASE_URL=$CUSTOM_OPENAI_BASE_URL" +echo " CUSTOM_OPENAI_API_KEY=$CUSTOM_OPENAI_API_KEY" +echo " CUSTOM_OPENAI_MODEL_NAME=$CUSTOM_OPENAI_MODEL_NAME" +echo "" + +# Run the evals +echo -e "${BLUE}Running evals with custom endpoint...${NC}\n" + +# You can customize which eval to run by passing arguments +# Examples: +# ./examples/custom_vllm_endpoint.sh --eval hn_aisdk +# ./examples/custom_vllm_endpoint.sh --category extract +# ./examples/custom_vllm_endpoint.sh --category act + +pnpm run evals "$@" + +echo -e "\n${GREEN}=== Eval run completed ===${NC}" + diff --git a/packages/evals/index.eval.ts b/packages/evals/index.eval.ts index 8d76e2901..db4968737 100644 --- a/packages/evals/index.eval.ts +++ b/packages/evals/index.eval.ts @@ -35,13 +35,14 @@ import { } from "@browserbasehq/stagehand"; import { AISdkClientWrapped } from "./lib/AISdkClientWrapped"; import { getAISDKLanguageModel } from "@browserbasehq/stagehand/lib/v3/llm/LLMProvider"; -import { env } from "./env"; +import { env, customOpenAIConfig } from "./env"; import dotenv from "dotenv"; import { initV3 } from "./initV3"; import { generateSummary } from "./summary"; import { buildGAIATestcases } from "./suites/gaia"; import { buildWebVoyagerTestcases } from "./suites/webvoyager"; import { buildOnlineMind2WebTestcases } from "./suites/onlineMind2Web"; +import { createOpenAI } from "@ai-sdk/openai"; dotenv.config(); @@ -347,7 +348,28 @@ const generateFilteredTestcases = (): Testcase[] => { }); } else { let llmClient: LLMClient; - if (input.modelName.includes("/")) { + + // Check if custom OpenAI-compatible endpoint is configured + if (customOpenAIConfig.baseURL) { + console.log( + `[EVALS] Using custom OpenAI endpoint: ${customOpenAIConfig.baseURL}`, + ); + + // Create custom OpenAI provider with configured endpoint + const customOpenAI = createOpenAI({ + baseURL: customOpenAIConfig.baseURL, + apiKey: customOpenAIConfig.apiKey, + }); + + // Use the custom model name if provided, otherwise use input.modelName + const modelName = customOpenAIConfig.modelName || input.modelName; + const model = 
customOpenAI(modelName); + + llmClient = new AISdkClientWrapped({ + model, + }); + } else if (input.modelName.includes("/")) { + // Standard AI SDK provider logic llmClient = new AISdkClientWrapped({ model: getAISDKLanguageModel( input.modelName.split("/")[0], diff --git a/packages/evals/package.json b/packages/evals/package.json index b8dde1ed1..d0574b66a 100644 --- a/packages/evals/package.json +++ b/packages/evals/package.json @@ -16,6 +16,7 @@ "@browserbasehq/stagehand": "workspace:*", "ai": "^5.0.0", "@ai-sdk/provider": "^2.0.0", + "@ai-sdk/openai": "^2.0.53", "openai": "^4.87.1", "dotenv": "16.4.5", "zod": "^4.1.8" @@ -23,4 +24,4 @@ "devDependencies": { "tsx": "^4.10.5" } -} +} \ No newline at end of file diff --git a/packages/evals/taskConfig.ts b/packages/evals/taskConfig.ts index 2f7566909..629315430 100644 --- a/packages/evals/taskConfig.ts +++ b/packages/evals/taskConfig.ts @@ -8,6 +8,13 @@ * * The exported objects (`tasksByName`, `MODELS`, `config`) are used by the main evaluation script and other modules * to know which tasks and models are available, and to configure the evaluations accordingly. + * + * Custom OpenAI-Compatible Endpoints: + * You can use custom inference endpoints (like vLLM) by setting environment variables: + * - CUSTOM_OPENAI_BASE_URL: Base URL of your custom endpoint (e.g., "http://localhost:8000/v1") + * - CUSTOM_OPENAI_API_KEY: API key (defaults to "EMPTY" for vLLM) + * - CUSTOM_OPENAI_MODEL_NAME: Model name to use with the custom endpoint + * See CUSTOM_ENDPOINT_USAGE.md for detailed instructions. */ import fs from "fs"; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 4b4b0aefe..74306c2fd 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -259,6 +259,9 @@ importers: packages/evals: dependencies: + '@ai-sdk/openai': + specifier: ^2.0.53 + version: 2.0.53(zod@4.1.12) '@ai-sdk/provider': specifier: ^2.0.0 version: 2.0.0 @@ -6564,6 +6567,12 @@ snapshots: zod: 3.25.67 optional: true + '@ai-sdk/openai@2.0.53(zod@4.1.12)': + dependencies: + '@ai-sdk/provider': 2.0.0 + '@ai-sdk/provider-utils': 3.0.12(zod@4.1.12) + zod: 4.1.12 + '@ai-sdk/openai@2.0.53(zod@4.1.8)': dependencies: '@ai-sdk/provider': 2.0.0 @@ -10530,7 +10539,7 @@ snapshots: isstream: 0.1.2 jsonwebtoken: 9.0.2 mime-types: 2.1.35 - retry-axios: 2.6.0(axios@1.13.0(debug@4.4.3)) + retry-axios: 2.6.0(axios@1.13.0) tough-cookie: 4.1.4 transitivePeerDependencies: - supports-color @@ -12548,7 +12557,7 @@ snapshots: retext-stringify: 4.0.0 unified: 11.0.5 - retry-axios@2.6.0(axios@1.13.0(debug@4.4.3)): + retry-axios@2.6.0(axios@1.13.0): dependencies: axios: 1.13.0(debug@4.4.3)