hud-evals · jdchawla29 · Jun 18, 2026
diff --git a/docs/v6/cookbooks/coding-agent.mdx b/docs/v6/cookbooks/coding-agent.mdx
@@ -60,16 +60,16 @@ Point a coding agent at the environment. `claude` opens the `ssh` capability, ed
 hud eval env.py claude
 ```
 
-For Claude Code (the `claude` CLI driving the shell over SSH), use the `ClaudeSDKAgent` in code:
+To drive the shell over SSH with the `claude` CLI from code, use `ClaudeCLIAgent`:
 
 ```python run.py
 import asyncio
-from hud.agents import ClaudeSDKAgent
-from hud.agents.types import ClaudeSDKConfig
+from hud.agents import ClaudeCLIAgent
+from hud.agents.types import ClaudeCLIConfig
 from env import fix_add
 
 async def main():
-    agent = ClaudeSDKAgent(ClaudeSDKConfig(model="claude-sonnet-4-5"))
+    agent = ClaudeCLIAgent(ClaudeCLIConfig(model="claude-sonnet-4-5"))
     job = await fix_add().run(agent)
     print("reward:", job.reward)
 

diff --git a/docs/v6/reference/agents.mdx b/docs/v6/reference/agents.mdx
@@ -13,7 +13,19 @@ async def __call__(self, run: Run) -> None
 It fills `run.trace` in place; the answer it produces is `run.trace.content`, graded when the run exits. Agents are **stateless per run**, so one instance can drive many concurrent rollouts.
 
 ```python
-from hud.agents import create_agent, ClaudeAgent, OpenAIAgent, GeminiAgent, OpenAIChatAgent
+from hud.agents import (
+    create_agent,
+    ClaudeAgent,
+    OpenAIAgent,
+    GeminiAgent,
+    OpenAIChatAgent,
+    CodexAgent,
+    OpenCodeAgent,
+    AiderAgent,
+    GrokBuildAgent,
+    MiniSweAgent,
+    Terminus2Agent,
+)
 ```
 
 ## `create_agent`
@@ -22,25 +34,31 @@ from hud.agents import create_agent, ClaudeAgent, OpenAIAgent, GeminiAgent, Open
 create_agent(model: str, **kwargs) -> Agent
 ```
 
-Builds an agent routed through the HUD gateway for any model id the gateway knows (`claude-...`, `gpt-...`, `gemini-...`, `grok-...`). Extra `kwargs` pass through to the provider config.
+Builds an agent for any model id the HUD gateway knows (`claude-...`, `gpt-...`, `gemini-...`, `grok-...`) or for a built-in agent type such as `cli`, `codex`, `opencode`, `aider`, `grok_build`, `mini_swe_agent`, or `terminus_2`. Extra `kwargs` pass through to the agent config.
 
 ```python
 agent = create_agent("claude-sonnet-4-5")
 ```
 
 For direct provider access with your own API key, construct a provider agent instead.
 
-## Provider agents
+## Built-in agents
 
-Each provider agent takes an optional config from `hud.agents.types`:
+Each built-in agent takes an optional config from `hud.agents.types`:
 
 | Agent | Config | Default model |
 |-------|--------|---------------|
 | `ClaudeAgent` | `ClaudeConfig` | `claude-sonnet-4-6` |
 | `OpenAIAgent` | `OpenAIConfig` | `gpt-5.4` |
 | `GeminiAgent` | `GeminiConfig` | `gemini-3-pro-preview` |
 | `OpenAIChatAgent` | `OpenAIChatConfig` | `gpt-5-mini` |
-| `ClaudeSDKAgent` | `ClaudeSDKConfig` | `claude-sonnet-4-5` |
+| `ClaudeCLIAgent` | `ClaudeCLIConfig` | `claude-sonnet-4-5` |
+| `CodexAgent` | `CodexConfig` | `gpt-5.4` |
+| `OpenCodeAgent` | `OpenCodeConfig` | `openai/gpt-5.4` |
+| `AiderAgent` | `AiderConfig` | `openai/gpt-5.4` |
+| `GrokBuildAgent` | `GrokBuildConfig` | `grok-build-0.1` |
+| `MiniSweAgent` | `MiniSweAgentConfig` | `openai/gpt-5.4` |
+| `Terminus2Agent` | `Terminus2Config` | `openai/gpt-5.4` |
 
 ```python
 from hud.agents import ClaudeAgent
@@ -50,7 +68,23 @@ agent = ClaudeAgent(ClaudeConfig(model="claude-sonnet-4-5", max_tokens=16384))
 ```
 
 - **`OpenAIChatAgent`** speaks OpenAI Chat Completions — point `base_url` at any compatible server (vLLM, local models).
-- **`ClaudeSDKAgent`** runs the `claude` CLI (Claude Code) over an `ssh` capability.
+- **`ClaudeCLIAgent`** runs the `claude` CLI over an `ssh` capability and subclasses `CLIAgent`.
+- **`CodexAgent`**, **`OpenCodeAgent`**, **`AiderAgent`**, **`GrokBuildAgent`**, **`MiniSweAgent`**, and **`Terminus2Agent`** run popular coding CLIs over an `ssh` capability. For CLI-backed agents, install the underlying CLI binary in the workspace image. `Terminus2Agent` installs Harbor into the run with `uv --with harbor==0.6.6`.
+
+For another CLI, use the generic adapter:
+
+```python
+from hud.agents import CLIAgent
+from hud.agents.types import CLIConfig
+
+agent = CLIAgent(CLIConfig(
+    command="my-agent",
+    args=["--message-file", "{prompt_file}"],
+    model="my-model",
+))
+```
+
+`args` supports `{prompt}`, `{prompt_file}`, `{model}`, and `{mcp_config}` placeholders. The prompt is always written to `.hud_prompt.txt` before the CLI runs.
 
 ## How an agent uses capabilities
 
@@ -70,7 +104,7 @@ When the same knob (e.g. `model`, `max_steps`) is set in more than one place, th
 - `hud eval … --max-steps 30 --model …` overrides the config defaults for that run.
 - Unset everywhere → the config's built-in default (`max_steps=10`).
 
-## Bring your own harness
+## Bring your own agent
 
 Subclass `Agent` and implement `__call__`. Write the answer to `run.trace.content`:
 

diff --git a/docs/v6/run/models.mdx b/docs/v6/run/models.mdx
@@ -4,7 +4,7 @@ description: "Evaluate a task with Claude, OpenAI, Gemini, or any OpenAI-compati
 icon: "robot"
 ---
 
-An **evaluation** produces one **trace**: an agent works the task against the environment and gets graded. Because the environment only exposes **capabilities** (never a fixed agent), any model or harness plugs in — you choose the agent at run time, not at authoring time.
+An **evaluation** produces one **trace**: an agent works the task against the environment and gets graded. Because the environment only exposes **capabilities** (never a fixed agent), any model or agent runtime plugs in — you choose the agent at run time, not at authoring time.
 
 ## Prerequisites
 
@@ -13,7 +13,7 @@ An **evaluation** produces one **trace**: an agent works the task against the en
 
 ## The fastest path: `hud eval`
 
-Pass a task source and an agent name. The agent names are `claude`, `openai`, `gemini`, and `openai_compatible`:
+Pass a task source and an agent name. The provider agent names are `claude`, `openai`, `gemini`, and `openai_compatible`; CLI-backed agents include `cli`, `codex`, `opencode`, `aider`, `grok_build`, `mini_swe_agent`, and `terminus_2`:
 
 ```bash
 hud eval tasks.py claude --group 3
@@ -68,7 +68,18 @@ from hud.agents.types import ClaudeConfig
 agent = ClaudeAgent(ClaudeConfig(model="claude-sonnet-4-5"))
 ```
 
-The provider agents are `ClaudeAgent`, `OpenAIAgent`, `GeminiAgent`, and `OpenAIChatAgent`, each with a matching config in `hud.agents.types` (`ClaudeConfig`, `OpenAIConfig`, `GeminiConfig`, `OpenAIChatConfig`). `ClaudeSDKAgent` runs the `claude` CLI (Claude Code) over an `ssh` capability.
+The provider agents are `ClaudeAgent`, `OpenAIAgent`, `GeminiAgent`, and `OpenAIChatAgent`, each with a matching config in `hud.agents.types` (`ClaudeConfig`, `OpenAIConfig`, `GeminiConfig`, `OpenAIChatConfig`). `ClaudeCLIAgent` runs the `claude` CLI over an `ssh` capability and subclasses `CLIAgent`.
+
+For coding CLIs, use `CodexAgent`, `OpenCodeAgent`, `AiderAgent`, `GrokBuildAgent`, `MiniSweAgent`, `Terminus2Agent`, or the generic `CLIAgent`:
+
+```python
+from hud.agents import OpenCodeAgent
+from hud.agents.types import OpenCodeConfig
+
+agent = OpenCodeAgent(OpenCodeConfig(model="openai/gpt-5.4"))
+```
+
+These adapters run inside the environment workspace over `ssh`, so install the underlying CLI binary in the workspace image when the agent is CLI-backed. `Terminus2Agent` installs Harbor into the run with `uv --with harbor==0.6.6`.
 
 ## Your own vLLM / OpenAI-compatible endpoint
 
@@ -87,11 +98,11 @@ agent = OpenAIChatAgent(OpenAIChatConfig(
 
 From the CLI, the equivalent is `hud eval tasks.py openai_compatible --model my-model` with the `base_url` set in your eval config.
 
-## Bring your own harness
+## Bring your own agent
 
-A harness is just *attach to a capability + define a tool spec*, so wrapping another agent framework is a thin adapter — no protocol work. Subclass `Agent` and implement `__call__`:
+An agent adapter is just *attach to a capability + define a tool spec*, so wrapping another agent framework takes only a thin adapter — no protocol work. Subclass `Agent` and implement `__call__`:
 
-```python harness.py
+```python agent.py
 from hud.agents.base import Agent
 from hud import Run
 
@@ -116,6 +127,6 @@ class EchoAgent(Agent):
   Every agent class, config, and the `Run` contract.
 </Card>
 <Card title="Capabilities" icon="plug" href="/v6/reference/capabilities">
-  What a harness can attach to.
+  What an agent can attach to.
 </Card>
 </CardGroup>
diff --git a/hud/agents/__init__.py b/hud/agents/__init__.py
@@ -13,7 +13,24 @@
 if TYPE_CHECKING:
     from typing import TypeAlias
 
-    from hud.agents.claude import ClaudeAgent, ClaudeSDKAgent, ClaudeSDKConfig
+    from hud.agents.base import Agent
+    from hud.agents.claude import ClaudeAgent, ClaudeCLIAgent, ClaudeCLIConfig
+    from hud.agents.cli import (
+        AiderAgent,
+        AiderConfig,
+        CLIAgent,
+        CLIConfig,
+        CodexAgent,
+        CodexConfig,
+        GrokBuildAgent,
+        GrokBuildConfig,
+        MiniSweAgent,
+        MiniSweAgentConfig,
+        OpenCodeAgent,
+        OpenCodeConfig,
+        Terminus2Agent,
+        Terminus2Config,
+    )
     from hud.agents.gemini import GeminiAgent
     from hud.agents.openai import OpenAIAgent
     from hud.agents.openai_compatible import OpenAIChatAgent
@@ -22,14 +39,22 @@
     GatewayAgent: TypeAlias = ClaudeAgent | GeminiAgent | OpenAIAgent | OpenAIChatAgent
 
 
-def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
-    """Create an agent routed through the HUD gateway.
+_GATEWAY_AGENT_TYPES = {
+    AgentType.CLAUDE,
+    AgentType.OPENAI,
+    AgentType.GEMINI,
+    AgentType.OPENAI_COMPATIBLE,
+}
+
+
+def create_agent(model: str, **kwargs: Any) -> Agent:
+    """Create an agent for a gateway model id or a built-in agent type.
 
     For direct API access with provider API keys, instantiate the agent classes directly.
     """
     agent_type = next((candidate for candidate in AgentType if candidate.value == model), None)
     if agent_type is not None:
-        model_id = model
+        model_id = model if agent_type in _GATEWAY_AGENT_TYPES else None
         provider_name = agent_type.gateway_provider
     else:
         try:
@@ -86,31 +111,61 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
             )
             raise ValueError(f"Model {model!r} not found in {source}.{hint}")
 
-    kwargs.setdefault("model", model_id)
-    kwargs.setdefault("model_client", build_gateway_client(provider_name))
+    if model_id is not None:
+        kwargs.setdefault("model", model_id)
+    if agent_type in _GATEWAY_AGENT_TYPES:
+        kwargs.setdefault("model_client", build_gateway_client(provider_name))
     # cls/config_cls are matched unions; the pairing is correct by construction.
     config = agent_type.config_cls(**kwargs)
     return agent_type.cls(cast("Any", config))
 
 
 _LAZY_EXPORTS = {
+    "AiderAgent": ("hud.agents.cli", "AiderAgent"),
+    "AiderConfig": ("hud.agents.cli", "AiderConfig"),
     "ClaudeAgent": ("hud.agents.claude", "ClaudeAgent"),
-    "ClaudeSDKAgent": ("hud.agents.claude", "ClaudeSDKAgent"),
-    "ClaudeSDKConfig": ("hud.agents.claude", "ClaudeSDKConfig"),
+    "ClaudeCLIAgent": ("hud.agents.claude", "ClaudeCLIAgent"),
+    "ClaudeCLIConfig": ("hud.agents.claude", "ClaudeCLIConfig"),
+    "CLIAgent": ("hud.agents.cli", "CLIAgent"),
+    "CLIConfig": ("hud.agents.cli", "CLIConfig"),
+    "CodexAgent": ("hud.agents.cli", "CodexAgent"),
+    "CodexConfig": ("hud.agents.cli", "CodexConfig"),
     "GeminiAgent": ("hud.agents.gemini", "GeminiAgent"),
+    "GrokBuildAgent": ("hud.agents.cli", "GrokBuildAgent"),
+    "GrokBuildConfig": ("hud.agents.cli", "GrokBuildConfig"),
     "MCPAgent": ("hud.agents.tool_agent", "ToolAgent"),
+    "MiniSweAgent": ("hud.agents.cli", "MiniSweAgent"),
+    "MiniSweAgentConfig": ("hud.agents.cli", "MiniSweAgentConfig"),
     "OpenAIAgent": ("hud.agents.openai", "OpenAIAgent"),
     "OpenAIChatAgent": ("hud.agents.openai_compatible", "OpenAIChatAgent"),
+    "OpenCodeAgent": ("hud.agents.cli", "OpenCodeAgent"),
+    "OpenCodeConfig": ("hud.agents.cli", "OpenCodeConfig"),
+    "Terminus2Agent": ("hud.agents.cli", "Terminus2Agent"),
+    "Terminus2Config": ("hud.agents.cli", "Terminus2Config"),
 }
 
 __all__ = [
+    "AiderAgent",
+    "AiderConfig",
+    "CLIAgent",
+    "CLIConfig",
     "ClaudeAgent",
-    "ClaudeSDKAgent",
-    "ClaudeSDKConfig",
+    "ClaudeCLIAgent",
+    "ClaudeCLIConfig",
+    "CodexAgent",
+    "CodexConfig",
     "GeminiAgent",
+    "GrokBuildAgent",
+    "GrokBuildConfig",
     "MCPAgent",
+    "MiniSweAgent",
+    "MiniSweAgentConfig",
     "OpenAIAgent",
     "OpenAIChatAgent",
+    "OpenCodeAgent",
+    "OpenCodeConfig",
+    "Terminus2Agent",
+    "Terminus2Config",
     "create_agent",
 ]
 

diff --git a/hud/agents/claude/__init__.py b/hud/agents/claude/__init__.py
@@ -7,15 +7,15 @@
     AsyncAnthropicBedrock,
     ClaudeAgent,
 )
-from .sdk import ClaudeSDKAgent, ClaudeSDKConfig
+from .cli import ClaudeCLIAgent, ClaudeCLIConfig
 from .tools import ClaudeToolSearchTool, ClaudeWebFetchTool, ClaudeWebSearchTool
 
 __all__ = [
     "AsyncAnthropic",
     "AsyncAnthropicBedrock",
     "ClaudeAgent",
-    "ClaudeSDKAgent",
-    "ClaudeSDKConfig",
+    "ClaudeCLIAgent",
+    "ClaudeCLIConfig",
     "ClaudeToolSearchTool",
     "ClaudeWebFetchTool",
     "ClaudeWebSearchTool",

diff --git a/hud/agents/claude/cli/__init__.py b/hud/agents/claude/cli/__init__.py
@@ -0,0 +1,5 @@
+"""Claude CLI agent."""
+
+from .agent import ClaudeCLIAgent, ClaudeCLIConfig
+
+__all__ = ["ClaudeCLIAgent", "ClaudeCLIConfig"]