From 6c28be98173ebf02000f6a33e5b8943058c8ce03 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Wed, 17 Jun 2026 18:33:57 -0700 Subject: [PATCH] add cli agents --- docs/v6/cookbooks/coding-agent.mdx | 8 +- docs/v6/reference/agents.mdx | 48 +- docs/v6/run/models.mdx | 25 +- hud/agents/__init__.py | 75 ++- hud/agents/claude/__init__.py | 6 +- hud/agents/claude/cli/__init__.py | 5 + hud/agents/claude/{sdk => cli}/agent.py | 189 +++----- .../claude/{sdk => cli}/computer_mcp.py | 0 hud/agents/claude/sdk/__init__.py | 5 - hud/agents/cli/__init__.py | 35 ++ hud/agents/cli/agent.py | 428 ++++++++++++++++++ ..._sdk_agent.py => test_claude_cli_agent.py} | 17 +- hud/agents/tests/test_cli_agent.py | 263 +++++++++++ hud/agents/types.py | 151 +++++- hud/cli/eval.py | 32 +- hud/cli/tests/test_eval_config.py | 41 +- hud/settings.py | 6 + hud/types.py | 125 ++++- 18 files changed, 1267 insertions(+), 192 deletions(-) create mode 100644 hud/agents/claude/cli/__init__.py rename hud/agents/claude/{sdk => cli}/agent.py (59%) rename hud/agents/claude/{sdk => cli}/computer_mcp.py (100%) delete mode 100644 hud/agents/claude/sdk/__init__.py create mode 100644 hud/agents/cli/__init__.py create mode 100644 hud/agents/cli/agent.py rename hud/agents/tests/{test_claude_sdk_agent.py => test_claude_cli_agent.py} (91%) create mode 100644 hud/agents/tests/test_cli_agent.py diff --git a/docs/v6/cookbooks/coding-agent.mdx b/docs/v6/cookbooks/coding-agent.mdx index 75941d6d7..145090f65 100644 --- a/docs/v6/cookbooks/coding-agent.mdx +++ b/docs/v6/cookbooks/coding-agent.mdx @@ -60,16 +60,16 @@ Point a coding agent at the environment. 
`claude` opens the `ssh` capability, ed hud eval env.py claude ``` -For Claude Code (the `claude` CLI driving the shell over SSH), use the `ClaudeSDKAgent` in code: +For the `claude` CLI driving the shell over SSH, use the `ClaudeCLIAgent` in code: ```python run.py import asyncio -from hud.agents import ClaudeSDKAgent -from hud.agents.types import ClaudeSDKConfig +from hud.agents import ClaudeCLIAgent +from hud.agents.types import ClaudeCLIConfig from env import fix_add async def main(): - agent = ClaudeSDKAgent(ClaudeSDKConfig(model="claude-sonnet-4-5")) + agent = ClaudeCLIAgent(ClaudeCLIConfig(model="claude-sonnet-4-5")) job = await fix_add().run(agent) print("reward:", job.reward) diff --git a/docs/v6/reference/agents.mdx b/docs/v6/reference/agents.mdx index 8b0e5fe24..582b593b4 100644 --- a/docs/v6/reference/agents.mdx +++ b/docs/v6/reference/agents.mdx @@ -13,7 +13,19 @@ async def __call__(self, run: Run) -> None It fills `run.trace` in place; the answer it produces is `run.trace.content`, graded when the run exits. Agents are **stateless per run**, so one instance can drive many concurrent rollouts. ```python -from hud.agents import create_agent, ClaudeAgent, OpenAIAgent, GeminiAgent, OpenAIChatAgent +from hud.agents import ( + create_agent, + ClaudeAgent, + OpenAIAgent, + GeminiAgent, + OpenAIChatAgent, + CodexAgent, + OpenCodeAgent, + AiderAgent, + GrokBuildAgent, + MiniSweAgent, + Terminus2Agent, +) ``` ## `create_agent` @@ -22,7 +34,7 @@ from hud.agents import create_agent, ClaudeAgent, OpenAIAgent, GeminiAgent, Open create_agent(model: str, **kwargs) -> Agent ``` -Builds an agent routed through the HUD gateway for any model id the gateway knows (`claude-...`, `gpt-...`, `gemini-...`, `grok-...`). Extra `kwargs` pass through to the provider config. 
+Builds an agent for any model id the HUD gateway knows (`claude-...`, `gpt-...`, `gemini-...`, `grok-...`) or for a built-in agent type such as `cli`, `codex`, `opencode`, `aider`, `grok_build`, `mini_swe_agent`, or `terminus_2`. Extra `kwargs` pass through to the agent config. ```python agent = create_agent("claude-sonnet-4-5") @@ -30,9 +42,9 @@ agent = create_agent("claude-sonnet-4-5") For direct provider access with your own API key, construct a provider agent instead. -## Provider agents +## Built-in agents -Each provider agent takes an optional config from `hud.agents.types`: +Each built-in agent takes an optional config from `hud.agents.types`: | Agent | Config | Default model | |-------|--------|---------------| @@ -40,7 +52,13 @@ Each provider agent takes an optional config from `hud.agents.types`: | `OpenAIAgent` | `OpenAIConfig` | `gpt-5.4` | | `GeminiAgent` | `GeminiConfig` | `gemini-3-pro-preview` | | `OpenAIChatAgent` | `OpenAIChatConfig` | `gpt-5-mini` | -| `ClaudeSDKAgent` | `ClaudeSDKConfig` | `claude-sonnet-4-5` | +| `ClaudeCLIAgent` | `ClaudeCLIConfig` | `claude-sonnet-4-5` | +| `CodexAgent` | `CodexConfig` | `gpt-5.4` | +| `OpenCodeAgent` | `OpenCodeConfig` | `openai/gpt-5.4` | +| `AiderAgent` | `AiderConfig` | `openai/gpt-5.4` | +| `GrokBuildAgent` | `GrokBuildConfig` | `grok-build-0.1` | +| `MiniSweAgent` | `MiniSweAgentConfig` | `openai/gpt-5.4` | +| `Terminus2Agent` | `Terminus2Config` | `openai/gpt-5.4` | ```python from hud.agents import ClaudeAgent @@ -50,7 +68,23 @@ agent = ClaudeAgent(ClaudeConfig(model="claude-sonnet-4-5", max_tokens=16384)) ``` - **`OpenAIChatAgent`** speaks OpenAI Chat Completions — point `base_url` at any compatible server (vLLM, local models). -- **`ClaudeSDKAgent`** runs the `claude` CLI (Claude Code) over an `ssh` capability. +- **`ClaudeCLIAgent`** runs the `claude` CLI over an `ssh` capability and subclasses `CLIAgent`. 
+- **`CodexAgent`**, **`OpenCodeAgent`**, **`AiderAgent`**, **`GrokBuildAgent`**, **`MiniSweAgent`**, and **`Terminus2Agent`** run popular coding CLIs over an `ssh` capability. Install the CLI in the workspace image for CLI-backed agents. `Terminus2Agent` installs Harbor into the run with `uv --with harbor==0.6.6`. + +For another CLI, use the generic adapter: + +```python +from hud.agents import CLIAgent +from hud.agents.types import CLIConfig + +agent = CLIAgent(CLIConfig( + command="my-agent", + args=["--message-file", "{prompt_file}"], + model="my-model", +)) +``` + +`args` supports `{prompt}`, `{prompt_file}`, `{model}`, `{max_steps}`, and `{mcp_config}` placeholders; each argument is expanded with Python's `str.format`, so literal braces must be doubled (`{{`, `}}`). The prompt is always written to `.hud_prompt.txt` before the CLI runs. ## How an agent uses capabilities @@ -70,7 +104,7 @@ When the same knob (e.g. `model`, `max_steps`) is set in more than one place, th - `hud eval … --max-steps 30 --model …` overrides the config defaults for that run. - Unset everywhere → the config's built-in default (`max_steps=10`). -## Bring your own harness +## Bring your own agent Subclass `Agent` and implement `__call__`. Write the answer to `run.trace.content`: diff --git a/docs/v6/run/models.mdx b/docs/v6/run/models.mdx index bbc704d1a..4add393cc 100644 --- a/docs/v6/run/models.mdx +++ b/docs/v6/run/models.mdx @@ -4,7 +4,7 @@ description: "Evaluate a task with Claude, OpenAI, Gemini, or any OpenAI-compati icon: "robot" --- -An **evaluation** produces one **trace**: an agent works the task against the environment and gets graded. Because the environment only exposes **capabilities** (never a fixed agent), any model or harness plugs in — you choose the agent at run time, not at authoring time. +An **evaluation** produces one **trace**: an agent works the task against the environment and gets graded. Because the environment only exposes **capabilities** (never a fixed agent), any model or agent runtime plugs in — you choose the agent at run time, not at authoring time.
## Prerequisites @@ -13,7 +13,7 @@ An **evaluation** produces one **trace**: an agent works the task against the en ## The fastest path: `hud eval` -Pass a task source and an agent name. The agent names are `claude`, `openai`, `gemini`, and `openai_compatible`: +Pass a task source and an agent name. The provider agent names are `claude`, `openai`, `gemini`, and `openai_compatible`; CLI-backed agents include `cli`, `codex`, `opencode`, `aider`, `grok_build`, `mini_swe_agent`, and `terminus_2`: ```bash hud eval tasks.py claude --group 3 @@ -68,7 +68,18 @@ from hud.agents.types import ClaudeConfig agent = ClaudeAgent(ClaudeConfig(model="claude-sonnet-4-5")) ``` -The provider agents are `ClaudeAgent`, `OpenAIAgent`, `GeminiAgent`, and `OpenAIChatAgent`, each with a matching config in `hud.agents.types` (`ClaudeConfig`, `OpenAIConfig`, `GeminiConfig`, `OpenAIChatConfig`). `ClaudeSDKAgent` runs the `claude` CLI (Claude Code) over an `ssh` capability. +The provider agents are `ClaudeAgent`, `OpenAIAgent`, `GeminiAgent`, and `OpenAIChatAgent`, each with a matching config in `hud.agents.types` (`ClaudeConfig`, `OpenAIConfig`, `GeminiConfig`, `OpenAIChatConfig`). `ClaudeCLIAgent` runs the `claude` CLI over an `ssh` capability and subclasses `CLIAgent`. + +For coding CLIs, use `CodexAgent`, `OpenCodeAgent`, `AiderAgent`, `GrokBuildAgent`, `MiniSweAgent`, `Terminus2Agent`, or the generic `CLIAgent`: + +```python +from hud.agents import OpenCodeAgent +from hud.agents.types import OpenCodeConfig + +agent = OpenCodeAgent(OpenCodeConfig(model="openai/gpt-5.4")) +``` + +These adapters run inside the environment workspace over `ssh`, so install the underlying CLI binary in the workspace image when the agent is CLI-backed. `Terminus2Agent` installs Harbor into the run with `uv --with harbor==0.6.6`. 
## Your own vLLM / OpenAI-compatible endpoint @@ -87,11 +98,11 @@ agent = OpenAIChatAgent(OpenAIChatConfig( From the CLI, the equivalent is `hud eval tasks.py openai_compatible --model my-model` with the `base_url` set in your eval config. -## Bring your own harness +## Bring your own agent -A harness is just *attach to a capability + define a tool spec*, so wrapping another agent framework is a thin adapter — no protocol work. Subclass `Agent` and implement `__call__`: +An agent adapter is just *attach to a capability + define a tool spec*, so wrapping another agent framework is thin — no protocol work. Subclass `Agent` and implement `__call__`: -```python harness.py +```python agent.py from hud.agents.base import Agent from hud import Run @@ -116,6 +127,6 @@ class EchoAgent(Agent): Every agent class, config, and the `Run` contract. - What a harness can attach to. + What an agent can attach to. diff --git a/hud/agents/__init__.py b/hud/agents/__init__.py index 4aa66b274..d61f85679 100644 --- a/hud/agents/__init__.py +++ b/hud/agents/__init__.py @@ -13,7 +13,24 @@ if TYPE_CHECKING: from typing import TypeAlias - from hud.agents.claude import ClaudeAgent, ClaudeSDKAgent, ClaudeSDKConfig + from hud.agents.base import Agent + from hud.agents.claude import ClaudeAgent, ClaudeCLIAgent, ClaudeCLIConfig + from hud.agents.cli import ( + AiderAgent, + AiderConfig, + CLIAgent, + CLIConfig, + CodexAgent, + CodexConfig, + GrokBuildAgent, + GrokBuildConfig, + MiniSweAgent, + MiniSweAgentConfig, + OpenCodeAgent, + OpenCodeConfig, + Terminus2Agent, + Terminus2Config, + ) from hud.agents.gemini import GeminiAgent from hud.agents.openai import OpenAIAgent from hud.agents.openai_compatible import OpenAIChatAgent @@ -22,14 +39,22 @@ GatewayAgent: TypeAlias = ClaudeAgent | GeminiAgent | OpenAIAgent | OpenAIChatAgent -def create_agent(model: str, **kwargs: Any) -> GatewayAgent: - """Create an agent routed through the HUD gateway. 
+_GATEWAY_AGENT_TYPES = { + AgentType.CLAUDE, + AgentType.OPENAI, + AgentType.GEMINI, + AgentType.OPENAI_COMPATIBLE, +} + + +def create_agent(model: str, **kwargs: Any) -> Agent: + """Create an agent for a gateway model id or a built-in agent type. For direct API access with provider API keys, instantiate the agent classes directly. """ agent_type = next((candidate for candidate in AgentType if candidate.value == model), None) if agent_type is not None: - model_id = model + model_id = model if agent_type in _GATEWAY_AGENT_TYPES else None provider_name = agent_type.gateway_provider else: try: @@ -86,31 +111,61 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent: ) raise ValueError(f"Model {model!r} not found in {source}.{hint}") - kwargs.setdefault("model", model_id) - kwargs.setdefault("model_client", build_gateway_client(provider_name)) + if model_id is not None: + kwargs.setdefault("model", model_id) + if agent_type in _GATEWAY_AGENT_TYPES: + kwargs.setdefault("model_client", build_gateway_client(provider_name)) # cls/config_cls are matched unions; the pairing is correct by construction. 
config = agent_type.config_cls(**kwargs) return agent_type.cls(cast("Any", config)) _LAZY_EXPORTS = { + "AiderAgent": ("hud.agents.cli", "AiderAgent"), + "AiderConfig": ("hud.agents.cli", "AiderConfig"), "ClaudeAgent": ("hud.agents.claude", "ClaudeAgent"), - "ClaudeSDKAgent": ("hud.agents.claude", "ClaudeSDKAgent"), - "ClaudeSDKConfig": ("hud.agents.claude", "ClaudeSDKConfig"), + "ClaudeCLIAgent": ("hud.agents.claude", "ClaudeCLIAgent"), + "ClaudeCLIConfig": ("hud.agents.claude", "ClaudeCLIConfig"), + "CLIAgent": ("hud.agents.cli", "CLIAgent"), + "CLIConfig": ("hud.agents.cli", "CLIConfig"), + "CodexAgent": ("hud.agents.cli", "CodexAgent"), + "CodexConfig": ("hud.agents.cli", "CodexConfig"), "GeminiAgent": ("hud.agents.gemini", "GeminiAgent"), + "GrokBuildAgent": ("hud.agents.cli", "GrokBuildAgent"), + "GrokBuildConfig": ("hud.agents.cli", "GrokBuildConfig"), "MCPAgent": ("hud.agents.tool_agent", "ToolAgent"), + "MiniSweAgent": ("hud.agents.cli", "MiniSweAgent"), + "MiniSweAgentConfig": ("hud.agents.cli", "MiniSweAgentConfig"), "OpenAIAgent": ("hud.agents.openai", "OpenAIAgent"), "OpenAIChatAgent": ("hud.agents.openai_compatible", "OpenAIChatAgent"), + "OpenCodeAgent": ("hud.agents.cli", "OpenCodeAgent"), + "OpenCodeConfig": ("hud.agents.cli", "OpenCodeConfig"), + "Terminus2Agent": ("hud.agents.cli", "Terminus2Agent"), + "Terminus2Config": ("hud.agents.cli", "Terminus2Config"), } __all__ = [ + "AiderAgent", + "AiderConfig", + "CLIAgent", + "CLIConfig", "ClaudeAgent", - "ClaudeSDKAgent", - "ClaudeSDKConfig", + "ClaudeCLIAgent", + "ClaudeCLIConfig", + "CodexAgent", + "CodexConfig", "GeminiAgent", + "GrokBuildAgent", + "GrokBuildConfig", "MCPAgent", + "MiniSweAgent", + "MiniSweAgentConfig", "OpenAIAgent", "OpenAIChatAgent", + "OpenCodeAgent", + "OpenCodeConfig", + "Terminus2Agent", + "Terminus2Config", "create_agent", ] diff --git a/hud/agents/claude/__init__.py b/hud/agents/claude/__init__.py index f5c727565..26e7aae08 100644 --- a/hud/agents/claude/__init__.py +++ 
b/hud/agents/claude/__init__.py @@ -7,15 +7,15 @@ AsyncAnthropicBedrock, ClaudeAgent, ) -from .sdk import ClaudeSDKAgent, ClaudeSDKConfig +from .cli import ClaudeCLIAgent, ClaudeCLIConfig from .tools import ClaudeToolSearchTool, ClaudeWebFetchTool, ClaudeWebSearchTool __all__ = [ "AsyncAnthropic", "AsyncAnthropicBedrock", "ClaudeAgent", - "ClaudeSDKAgent", - "ClaudeSDKConfig", + "ClaudeCLIAgent", + "ClaudeCLIConfig", "ClaudeToolSearchTool", "ClaudeWebFetchTool", "ClaudeWebSearchTool", diff --git a/hud/agents/claude/cli/__init__.py b/hud/agents/claude/cli/__init__.py new file mode 100644 index 000000000..f3fd8d36b --- /dev/null +++ b/hud/agents/claude/cli/__init__.py @@ -0,0 +1,5 @@ +"""Claude CLI agent.""" + +from .agent import ClaudeCLIAgent, ClaudeCLIConfig + +__all__ = ["ClaudeCLIAgent", "ClaudeCLIConfig"] diff --git a/hud/agents/claude/sdk/agent.py b/hud/agents/claude/cli/agent.py similarity index 59% rename from hud/agents/claude/sdk/agent.py rename to hud/agents/claude/cli/agent.py index 7ae98eb5b..486b99c99 100644 --- a/hud/agents/claude/sdk/agent.py +++ b/hud/agents/claude/cli/agent.py @@ -1,11 +1,11 @@ -"""ClaudeSDKAgent — runs ``claude`` CLI over SSH inside the env workspace. +"""ClaudeCLIAgent — runs the claude CLI over SSH inside the env workspace. SSH-execs the ``claude`` CLI on the remote workspace so all built-in tools (Bash, Read, Write, Edit, Glob, Grep) operate on the env's filesystem. MCP capabilities from the manifest are written as MCP server config so the CLI can call env-hosted MCP tools too. -Inspired by harbor-framework/harbor's ClaudeCode agent. +Inspired by harbor-framework/harbor's claude CLI adapter. 
""" from __future__ import annotations @@ -13,16 +13,21 @@ import json import logging import shlex -from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast -from hud.agents.base import Agent -from hud.agents.types import AgentStep, ClaudeSDKConfig, Usage +from hud.agents.cli import CLIAgent +from hud.agents.cli.agent import ( + RemoteInvocation, +) +from hud.agents.cli.agent import ( + build_remote_invocation as build_cli_remote_invocation, +) +from hud.agents.types import AgentStep, ClaudeCLIConfig, Usage from hud.settings import settings from hud.types import Step if TYPE_CHECKING: - from hud.capabilities import RFBClient, SSHClient + from hud.capabilities import Capability, RFBClient from hud.eval.run import Run logger = logging.getLogger(__name__) @@ -36,19 +41,6 @@ ) -@dataclass(slots=True) -class RemoteInvocation: - """How to run an assembled CLI command on the remote workspace shell. - - ``command`` is what gets exec'd over SSH. When ``script_name`` is set, that - file must be written (with ``script_body``) before exec'ing ``command``. - """ - - command: str - script_name: str | None = None - script_body: str | None = None - - def build_remote_invocation(shell: str, run_cmd: str) -> RemoteInvocation: """Build the remote exec command for ``run_cmd`` under the given login shell. @@ -58,16 +50,13 @@ def build_remote_invocation(shell: str, run_cmd: str) -> RemoteInvocation: to run under a PowerShell default shell, so ``cmd /c`` is required for both. POSIX shells take the command inline, prefixed with a one-shot install check. 
""" - if shell in WINDOWS_SHELLS: - return RemoteInvocation( - command="cmd /c .hud_run.bat", - script_name=".hud_run.bat", - script_body=f"@echo off\r\n{run_cmd}\r\n", - ) + invocation = build_cli_remote_invocation(shell, run_cmd) + if invocation.script_name is not None: + return invocation return RemoteInvocation(command=f"{_POSIX_INSTALL_CHECK} && {run_cmd}") -class ClaudeSDKAgent(Agent): +class ClaudeCLIAgent(CLIAgent): """Runs ``claude`` CLI over SSH inside the env workspace. Stateless w.r.t. the env: driven by ``await agent(run)``. SSH and RFB are @@ -76,36 +65,14 @@ class ClaudeSDKAgent(Agent): MCP config (the CLI connects to them itself). """ - config: ClaudeSDKConfig - - def __init__(self, config: ClaudeSDKConfig | None = None) -> None: - self.config = config or ClaudeSDKConfig() - self._ssh: SSHClient | None = None - self._mcp_servers: dict[str, dict[str, Any]] = {} - self._shell = "bash" - - async def __call__(self, run: Run) -> None: - self._mcp_servers = {} - manifest = run.client.manifest - bindings = manifest.bindings if manifest is not None else [] - families = {c.protocol.split("/", 1)[0] for c in bindings} - - if "ssh" not in families: - raise RuntimeError("ClaudeSDKAgent requires an SSH capability") - self._ssh = cast("SSHClient", await run.client.open("ssh")) - self._shell = self._ssh.capability.params.get("shell", "bash") + def __init__(self, config: ClaudeCLIConfig | None = None) -> None: + super().__init__(config or ClaudeCLIConfig()) + async def _collect_mcp_servers(self, run: Run, bindings: list[Capability]) -> None: + await super()._collect_mcp_servers(run, bindings) for cap in bindings: - family = cap.protocol.split("/", 1)[0] - if family == "mcp": - token = cap.params.get("auth_token") - transport = "http" if cap.url.startswith("http") else "sse" - server_config: dict[str, Any] = {"type": transport, "url": cap.url} - if token: - server_config["headers"] = {"Authorization": f"Bearer {token}"} - self._mcp_servers[cap.name] = server_config 
- elif family == "rfb": - from hud.agents.claude.sdk.computer_mcp import serve_computer_mcp + if cap.protocol.split("/", 1)[0] == "rfb": + from hud.agents.claude.cli.computer_mcp import serve_computer_mcp rfb = cast("RFBClient", await run.client.open("rfb")) port = await serve_computer_mcp(rfb) @@ -114,67 +81,8 @@ async def __call__(self, run: Run) -> None: "url": f"http://127.0.0.1:{port}/mcp", } - await self._exec( - run, - prompt=run.prompt_text, - max_steps=self.config.max_steps, - system_prompt=self.config.system_prompt, - ) - - async def _exec( - self, - run: Run, - *, - prompt: str, - max_steps: int = -1, - system_prompt: str | None = None, - ) -> None: - assert self._ssh is not None - - mcp_config_path = await self._write_mcp_config() - - # Write prompt to file via SFTP — avoids all shell quoting issues. - async with ( - self._ssh.conn.start_sftp_client() as sftp, - sftp.open(".hud_prompt.txt", "wb") as f, - ): - await f.write(prompt.encode("utf-8")) - - run_cmd = self._build_cli_command( - prompt=prompt, - max_steps=max_steps, - system_prompt=system_prompt, - mcp_config_path=mcp_config_path, - ) - - invocation = build_remote_invocation(self._shell, run_cmd) - if invocation.script_name is not None: - assert invocation.script_body is not None - # cmd.exe mangles inline quotes, so the command rides a batch file. 
- async with ( - self._ssh.conn.start_sftp_client() as sftp, - sftp.open(invocation.script_name, "wb") as f, - ): - await f.write(invocation.script_body.encode("utf-8")) - - full_cmd = invocation.command - logger.info("SSH exec claude CLI (%d chars)", len(full_cmd)) - logger.info("Full command: %s", full_cmd) - - completed = await self._ssh.conn.run(full_cmd, check=False) - stdout = completed.stdout if isinstance(completed.stdout, str) else "" - stderr = completed.stderr if isinstance(completed.stderr, str) else "" - - logger.info("exit=%s stdout=%d stderr=%d", completed.exit_status, len(stdout), len(stderr)) - - if completed.exit_status != 0 and not stdout.strip(): - error = stderr or f"claude CLI exited with status {completed.exit_status}" - run.trace.status = "error" - run.trace.extra.update({"exit_status": completed.exit_status, "stderr": stderr}) - run.record(Step(source="system", error=error)) - return - - self._parse_stream_json(run, stdout, stderr) + def _build_remote_invocation(self, run_cmd: str) -> RemoteInvocation: + return build_remote_invocation(self._shell, run_cmd) def _build_env_vars(self) -> dict[str, str]: env: dict[str, str] = {} @@ -201,27 +109,13 @@ def _build_env_vars(self) -> dict[str, str]: return env - async def _write_mcp_config(self) -> str | None: - """Write MCP config via SFTP and return the file path, or None.""" - if not self._mcp_servers or self._ssh is None: - return None - mcp_json = json.dumps({"mcpServers": self._mcp_servers}, indent=2) - # Write into the workspace root (SFTP is chrooted there). - sftp_path = ".hud_mcp_config.json" - async with self._ssh.conn.start_sftp_client() as sftp, sftp.open(sftp_path, "wb") as f: - await f.write(mcp_json.encode("utf-8")) - # Return the absolute path the CLI will see (cwd = workspace root). 
- logger.info("Wrote MCP config via SFTP") - return sftp_path - def _build_cli_command( self, *, prompt: str, - max_steps: int, - system_prompt: str | None, mcp_config_path: str | None = None, ) -> str: + config = cast("ClaudeCLIConfig", self.config) env_vars = self._build_env_vars() is_win = self._shell in WINDOWS_SHELLS self._win_redirect = False @@ -232,13 +126,13 @@ def _build_cli_command( "--verbose", "--output-format=stream-json", "--print", - f"--permission-mode={self.config.permission_mode}", + f"--permission-mode={config.permission_mode}", ] - if max_steps > 0: - base_args.append(f"--max-turns={max_steps}") - if system_prompt: - base_args.extend(["--system-prompt", system_prompt]) - for tool in self.config.allowed_tools: + if config.max_steps > 0: + base_args.append(f"--max-turns={config.max_steps}") + if config.system_prompt: + base_args.extend(["--system-prompt", config.system_prompt]) + for tool in config.allowed_tools: base_args.extend(["--allowedTools", tool]) if mcp_config_path: base_args.extend(["--mcp-config", mcp_config_path]) @@ -253,7 +147,7 @@ def _build_cli_command( # prompt via stdin from .hud_prompt.txt. claude --print reads stdin as # the initial message when no -- argument is provided. 
set_parts = [f"set {k}={v}" for k, v in env_vars.items()] - cmd_args = ["cmd", "/c", "claude"] + base_args[1:] # noqa: RUF005 + cmd_args = ["cmd", "/c", "claude"] + base_args[1:] # noqa: RUF005 py_args_repr = "[" + ",".join(f"'{a}'" for a in cmd_args) + "]" python_launcher = ( 'python -c "' @@ -270,6 +164,25 @@ def _build_cli_command( env_prefix = " ".join(f"{k}={shlex.quote(v)}" for k, v in env_vars.items()) return f'export PATH="$HOME/.local/bin:$PATH"; {env_prefix} {cli_cmd}' + async def _record_cli_result( + self, + run: Run, + *, + stdout: str, + stderr: str, + exit_status: int, + ) -> None: + logger.info("exit=%s stdout=%d stderr=%d", exit_status, len(stdout), len(stderr)) + + if exit_status != 0 and not stdout.strip(): + error = stderr or f"claude CLI exited with status {exit_status}" + run.trace.status = "error" + run.trace.extra.update({"exit_status": exit_status, "stderr": stderr}) + run.record(Step(source="system", error=error)) + return + + self._parse_stream_json(run, stdout, stderr) + def _parse_stream_json(self, run: Run, stdout: str, stderr: str) -> None: messages: list[dict[str, Any]] = [] content_parts: list[str] = [] @@ -332,4 +245,4 @@ def _parse_stream_json(self, run: Run, stdout: str, stderr: str) -> None: ) -__all__ = ["ClaudeSDKAgent", "ClaudeSDKConfig", "RemoteInvocation", "build_remote_invocation"] +__all__ = ["ClaudeCLIAgent", "ClaudeCLIConfig", "RemoteInvocation", "build_remote_invocation"] diff --git a/hud/agents/claude/sdk/computer_mcp.py b/hud/agents/claude/cli/computer_mcp.py similarity index 100% rename from hud/agents/claude/sdk/computer_mcp.py rename to hud/agents/claude/cli/computer_mcp.py diff --git a/hud/agents/claude/sdk/__init__.py b/hud/agents/claude/sdk/__init__.py deleted file mode 100644 index 57fd2773c..000000000 --- a/hud/agents/claude/sdk/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Claude Agent SDK agent.""" - -from .agent import ClaudeSDKAgent, ClaudeSDKConfig - -__all__ = ["ClaudeSDKAgent", "ClaudeSDKConfig"] diff 
--git a/hud/agents/cli/__init__.py b/hud/agents/cli/__init__.py new file mode 100644 index 000000000..ece677ee6 --- /dev/null +++ b/hud/agents/cli/__init__.py @@ -0,0 +1,35 @@ +"""CLI agents.""" + +from .agent import ( + AiderAgent, + AiderConfig, + CLIAgent, + CLIConfig, + CodexAgent, + CodexConfig, + GrokBuildAgent, + GrokBuildConfig, + MiniSweAgent, + MiniSweAgentConfig, + OpenCodeAgent, + OpenCodeConfig, + Terminus2Agent, + Terminus2Config, +) + +__all__ = [ + "AiderAgent", + "AiderConfig", + "CLIAgent", + "CLIConfig", + "CodexAgent", + "CodexConfig", + "GrokBuildAgent", + "GrokBuildConfig", + "MiniSweAgent", + "MiniSweAgentConfig", + "OpenCodeAgent", + "OpenCodeConfig", + "Terminus2Agent", + "Terminus2Config", +] diff --git a/hud/agents/cli/agent.py b/hud/agents/cli/agent.py new file mode 100644 index 000000000..2a94ec294 --- /dev/null +++ b/hud/agents/cli/agent.py @@ -0,0 +1,428 @@ +"""CLI agents that run non-interactive coding-agent CLIs over SSH.""" + +from __future__ import annotations + +import json +import logging +import shlex +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, cast + +from asyncssh.sftp import SFTPError + +from hud.agents.base import Agent +from hud.agents.types import ( + AgentStep, + AiderConfig, + CLIConfig, + CodexConfig, + GrokBuildConfig, + MiniSweAgentConfig, + OpenCodeConfig, + Terminus2Config, +) +from hud.settings import settings +from hud.types import Step + +if TYPE_CHECKING: + from hud.capabilities import Capability, SSHClient + from hud.eval.run import Run + +logger = logging.getLogger(__name__) + +WINDOWS_SHELLS = ("cmd", "powershell") +PROMPT_PATH = ".hud_prompt.txt" +MCP_CONFIG_PATH = ".hud_mcp_config.json" +TERMINUS2_SCRIPT_PATH = ".hud_terminus2.py" +TERMINUS2_RESULT_PATH = ".hud_terminus2_logs/trajectory.json" +TERMINUS2_SCRIPT = r""" +from __future__ import annotations + +import asyncio +import logging +import os +import shutil +import subprocess +from pathlib import Path + +from 
harbor.agents.terminus_2 import Terminus2 +from harbor.environments.base import BaseEnvironment, ExecResult +from harbor.models.agent.context import AgentContext +from harbor.models.environment_type import EnvironmentType +from harbor.models.trial.paths import TrialPaths + + +class LocalEnvironment(BaseEnvironment): + def __init__(self, workdir: Path, logs_dir: Path) -> None: + self.workdir = workdir + self.trial_paths = TrialPaths(trial_dir=logs_dir) + self.trial_paths.mkdir() + self.default_user = None + self.session_id = "hud" + self.logger = logging.getLogger(__name__) + + def type(self) -> EnvironmentType: + return EnvironmentType.DOCKER + + @property + def is_mounted(self) -> bool: + return True + + @property + def supports_gpus(self) -> bool: + return False + + @property + def can_disable_internet(self) -> bool: + return False + + def _validate_definition(self) -> None: + return None + + async def start(self, force_build: bool) -> None: + return None + + async def stop(self, delete: bool) -> None: + return None + + async def prepare_logs_for_host(self) -> None: + return None + + async def upload_file(self, source_path, target_path) -> None: + shutil.copy(source_path, target_path) + + async def upload_dir(self, source_dir, target_dir) -> None: + shutil.copytree(source_dir, target_dir, dirs_exist_ok=True) + + async def download_file(self, source_path, target_path) -> None: + shutil.copy(source_path, target_path) + + async def download_dir(self, source_dir, target_dir) -> None: + shutil.copytree(source_dir, target_dir, dirs_exist_ok=True) + + async def exec( + self, + command: str, + cwd: str | None = None, + env: dict | None = None, + timeout_sec: int | None = None, + user: str | int | None = None, + ) -> ExecResult: + _ = user + try: + result = subprocess.run( + command, + shell=True, + cwd=cwd or str(self.workdir), + env={**os.environ, **(env or {})}, + capture_output=True, + text=True, + timeout=timeout_sec, + ) + except subprocess.TimeoutExpired: + return 
ExecResult(stdout="", stderr="Command timed out", return_code=124) + return ExecResult( + stdout=result.stdout, + stderr=result.stderr, + return_code=result.returncode, + ) + + +async def main() -> None: + workdir = Path(os.environ.get("AGENT_WORKDIR") or os.getcwd()) + logs_dir = Path(".hud_terminus2_logs") + logs_dir.mkdir(parents=True, exist_ok=True) + instruction = Path(".hud_prompt.txt").read_text() + subprocess.run( + ["tmux", "kill-session", "-t", "terminus-2"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=False, + ) + api_base = ( + os.environ.get("OPENAI_BASE_URL") + or os.environ.get("OPENAI_API_BASE") + or "https://api.openai.com/v1" + ) + agent = Terminus2( + logs_dir=logs_dir, + model_name=os.environ.get("HUD_TERMINUS2_MODEL", "openai/gpt-5.4"), + api_base=api_base, + max_turns=int(os.environ.get("HUD_TERMINUS2_MAX_TURNS", "10")), + ) + env = LocalEnvironment(workdir=workdir, logs_dir=logs_dir) + await agent.setup(env) + await agent.run(instruction, env, AgentContext()) + + +asyncio.run(main()) +""" + + +@dataclass(slots=True) +class RemoteInvocation: + command: str + script_name: str | None = None + script_body: str | None = None + + +def build_remote_invocation(shell: str, run_cmd: str) -> RemoteInvocation: + if shell in WINDOWS_SHELLS: + return RemoteInvocation( + command="cmd /c .hud_run.bat", + script_name=".hud_run.bat", + script_body=f"@echo off\r\n{run_cmd}\r\n", + ) + return RemoteInvocation(command=run_cmd) + + +class CLIAgent(Agent): + """Runs a configured non-interactive CLI inside the env workspace.""" + + config: CLIConfig + + def __init__(self, config: CLIConfig | None = None) -> None: + self.config = config or CLIConfig() + self._ssh: SSHClient | None = None + self._shell = "bash" + self._mcp_servers: dict[str, dict[str, Any]] = {} + + async def __call__(self, run: Run) -> None: + self._mcp_servers = {} + manifest = run.client.manifest + bindings = manifest.bindings if manifest is not None else [] + + await 
self._open_ssh(run, bindings) + await self._collect_mcp_servers(run, bindings) + await self._exec(run, prompt=run.prompt_text) + + async def _open_ssh(self, run: Run, bindings: list[Capability]) -> None: + if "ssh" not in {c.protocol.split("/", 1)[0] for c in bindings}: + raise RuntimeError(f"{self.config.model_name} requires an SSH capability") + + self._ssh = cast("SSHClient", await run.client.open("ssh")) + self._shell = self._ssh.capability.params.get("shell", "bash") + + async def _collect_mcp_servers(self, run: Run, bindings: list[Capability]) -> None: + _ = run + for cap in bindings: + if cap.protocol.split("/", 1)[0] != "mcp": + continue + token = cap.params.get("auth_token") + transport = "http" if cap.url.startswith("http") else "sse" + server_config: dict[str, Any] = {"type": transport, "url": cap.url} + if token: + server_config["headers"] = {"Authorization": f"Bearer {token}"} + self._mcp_servers[cap.name] = server_config + + async def _exec(self, run: Run, *, prompt: str) -> None: + if not self.config.command: + raise ValueError("CLIConfig.command is required") + assert self._ssh is not None + + await self._write_file(PROMPT_PATH, prompt.encode("utf-8")) + mcp_config_path = await self._write_mcp_config() + run_cmd = self._build_cli_command(prompt=prompt, mcp_config_path=mcp_config_path) + invocation = self._build_remote_invocation(run_cmd) + + if invocation.script_name is not None: + assert invocation.script_body is not None + await self._write_file(invocation.script_name, invocation.script_body.encode("utf-8")) + + logger.info("SSH exec %s CLI (%d chars)", self.config.model_name, len(invocation.command)) + completed = await self._ssh.conn.run(invocation.command, check=False) + stdout = completed.stdout if isinstance(completed.stdout, str) else "" + stderr = completed.stderr if isinstance(completed.stderr, str) else "" + await self._record_cli_result( + run, + stdout=stdout, + stderr=stderr, + exit_status=completed.exit_status if completed.exit_status 
is not None else 1, + ) + + def _build_remote_invocation(self, run_cmd: str) -> RemoteInvocation: + return build_remote_invocation(self._shell, run_cmd) + + async def _record_cli_result( + self, + run: Run, + *, + stdout: str, + stderr: str, + exit_status: int, + ) -> None: + result_content = await self._read_result_file() + content = (result_content if result_content is not None else stdout).strip() + + trace = run.trace + trace.status = "error" if exit_status != 0 else "completed" + trace.content = content + trace.extra.update( + { + "agent": self.config.model_name, + "command": self.config.command, + "args": self.config.args, + "exit_status": exit_status, + } + ) + if stderr: + trace.extra["stderr"] = stderr + + error = None + if exit_status != 0: + error = ( + stderr or content or f"{self.config.model_name} exited with status {exit_status}" + ) + run.record(Step(source="system", error=error)) + + run.record( + AgentStep( + content=content, + done=True, + model=self.config.model, + error=error, + extra={"agent": self.config.model_name, "exit_status": exit_status}, + ) + ) + + async def _write_file(self, path: str, data: bytes) -> None: + assert self._ssh is not None + async with self._ssh.conn.start_sftp_client() as sftp, sftp.open(path, "wb") as f: + await f.write(data) + + async def _read_result_file(self) -> str | None: + if self.config.result_file is None: + return None + assert self._ssh is not None + try: + async with ( + self._ssh.conn.start_sftp_client() as sftp, + sftp.open(self.config.result_file, "rb") as f, + ): + data = cast("bytes | str", await f.read()) + except (OSError, SFTPError): + return None + if isinstance(data, str): + return data + return data.decode("utf-8", errors="replace") + + async def _write_mcp_config(self) -> str | None: + if not self.config.mcp_config or not self._mcp_servers: + return None + await self._write_file( + MCP_CONFIG_PATH, + json.dumps({"mcpServers": self._mcp_servers}, indent=2).encode("utf-8"), + ) + return 
MCP_CONFIG_PATH + + def _build_env_vars(self) -> dict[str, str]: + env: dict[str, str] = {} + + if self.config.use_hud_gateway and settings.api_key: + env["OPENAI_API_KEY"] = settings.api_key + env["OPENAI_API_BASE"] = settings.hud_gateway_url + env["OPENAI_BASE_URL"] = settings.hud_gateway_url + if settings.openai_api_key: + env.setdefault("OPENAI_API_KEY", settings.openai_api_key) + env.setdefault("CODEX_API_KEY", settings.openai_api_key) + if settings.anthropic_api_key: + env["ANTHROPIC_API_KEY"] = settings.anthropic_api_key + if settings.gemini_api_key: + env["GEMINI_API_KEY"] = settings.gemini_api_key + if settings.xai_api_key: + env["XAI_API_KEY"] = settings.xai_api_key + + env.update(self.config.extra_env) + return env + + def _build_cli_command(self, *, prompt: str, mcp_config_path: str | None) -> str: + args = [self.config.command, *self.config.args] + expanded = [ + arg.format( + prompt=prompt, + prompt_file=PROMPT_PATH, + model=self.config.model, + max_steps=self.config.max_steps, + mcp_config=mcp_config_path or "", + ) + for arg in args + ] + + cmd = " ".join(shlex.quote(arg) for arg in expanded) + if self.config.stdin: + cmd = f"{cmd} < {shlex.quote(PROMPT_PATH)}" + + env_vars = self._build_env_vars() + env_prefix = " ".join(f"{key}={shlex.quote(value)}" for key, value in env_vars.items()) + command = f"{env_prefix} {cmd}" if env_prefix else cmd + if self.config.install_check: + return f"{self.config.install_check} && {command}" + return command + + +class OpenCodeAgent(CLIAgent): + def __init__(self, config: OpenCodeConfig | None = None) -> None: + super().__init__(config or OpenCodeConfig()) + + +class AiderAgent(CLIAgent): + def __init__(self, config: AiderConfig | None = None) -> None: + super().__init__(config or AiderConfig()) + + +class CodexAgent(CLIAgent): + def __init__(self, config: CodexConfig | None = None) -> None: + super().__init__(config or CodexConfig()) + + +class GrokBuildAgent(CLIAgent): + def __init__(self, config: GrokBuildConfig 
| None = None) -> None: + super().__init__(config or GrokBuildConfig()) + + def _build_env_vars(self) -> dict[str, str]: + env = super()._build_env_vars() + if env.get("XAI_API_KEY"): + env.setdefault("HOME", "/tmp/hud-grok-home") # noqa: S108 + return env + + +class MiniSweAgent(CLIAgent): + def __init__(self, config: MiniSweAgentConfig | None = None) -> None: + super().__init__(config or MiniSweAgentConfig()) + + +class Terminus2Agent(CLIAgent): + def __init__(self, config: Terminus2Config | None = None) -> None: + super().__init__(config or Terminus2Config()) + + async def _exec(self, run: Run, *, prompt: str) -> None: + await self._write_file(TERMINUS2_SCRIPT_PATH, TERMINUS2_SCRIPT.encode("utf-8")) + await super()._exec(run, prompt=prompt) + + def _build_env_vars(self) -> dict[str, str]: + env = super()._build_env_vars() + config = cast("Terminus2Config", self.config) + env["HUD_TERMINUS2_MODEL"] = config.model + env["HUD_TERMINUS2_MAX_TURNS"] = str(config.max_steps) + return env + + +__all__ = [ + "TERMINUS2_RESULT_PATH", + "AiderAgent", + "AiderConfig", + "CLIAgent", + "CLIConfig", + "CodexAgent", + "CodexConfig", + "GrokBuildAgent", + "GrokBuildConfig", + "MiniSweAgent", + "MiniSweAgentConfig", + "OpenCodeAgent", + "OpenCodeConfig", + "RemoteInvocation", + "Terminus2Agent", + "Terminus2Config", + "build_remote_invocation", +] diff --git a/hud/agents/tests/test_claude_sdk_agent.py b/hud/agents/tests/test_claude_cli_agent.py similarity index 91% rename from hud/agents/tests/test_claude_sdk_agent.py rename to hud/agents/tests/test_claude_cli_agent.py index cd010c01f..81f117b37 100644 --- a/hud/agents/tests/test_claude_sdk_agent.py +++ b/hud/agents/tests/test_claude_cli_agent.py @@ -1,4 +1,4 @@ -"""ClaudeSDKAgent remote-command construction over the workspace SSH. +"""ClaudeCLIAgent remote-command construction over the workspace SSH. The agent runs the ``claude`` CLI on the remote workspace. 
These cover how the command is assembled per login shell — especially the Windows path, where the @@ -15,7 +15,7 @@ import pytest -from hud.agents.claude.sdk.agent import ClaudeSDKAgent, build_remote_invocation +from hud.agents.claude.cli.agent import ClaudeCLIAgent, build_remote_invocation # ─── build_remote_invocation (pure) ─────────────────────────────────── @@ -98,8 +98,8 @@ def _fake_run() -> Any: ) -def _agent_with_conn(shell: str, conn: _FakeConn) -> ClaudeSDKAgent: - agent = ClaudeSDKAgent() +def _agent_with_conn(shell: str, conn: _FakeConn) -> ClaudeCLIAgent: + agent = ClaudeCLIAgent() agent._ssh = cast("Any", SimpleNamespace(conn=conn)) agent._shell = shell return agent @@ -109,9 +109,10 @@ async def test_exec_on_windows_writes_batch_and_execs_via_cmd() -> None: sink: dict[str, bytes] = {} conn = _FakeConn(sink, SimpleNamespace(stdout=_STREAM_JSON, stderr="", exit_status=0)) agent = _agent_with_conn("cmd", conn) + agent.config.max_steps = 5 run = _fake_run() - await agent._exec(run, prompt="build it", max_steps=5) + await agent._exec(run, prompt="build it") assert conn.ran == ["cmd /c .hud_run.bat"] assert sink[".hud_run.bat"].startswith(b"@echo off\r\n") @@ -124,9 +125,10 @@ async def test_exec_on_bash_runs_inline_without_batch() -> None: sink: dict[str, bytes] = {} conn = _FakeConn(sink, SimpleNamespace(stdout=_STREAM_JSON, stderr="", exit_status=0)) agent = _agent_with_conn("bash", conn) + agent.config.max_steps = 5 run = _fake_run() - await agent._exec(run, prompt="build it", max_steps=5) + await agent._exec(run, prompt="build it") assert ".hud_run.bat" not in sink assert len(conn.ran) == 1 @@ -139,9 +141,10 @@ async def test_exec_nonzero_exit_with_no_stdout_records_system_error() -> None: sink: dict[str, bytes] = {} conn = _FakeConn(sink, SimpleNamespace(stdout="", stderr="boom", exit_status=1)) agent = _agent_with_conn("cmd", conn) + agent.config.max_steps = 1 run = _fake_run() - await agent._exec(run, prompt="x", max_steps=1) + await 
agent._exec(run, prompt="x") assert run.trace.status == "error" assert run.trace.extra["exit_status"] == 1 diff --git a/hud/agents/tests/test_cli_agent.py b/hud/agents/tests/test_cli_agent.py new file mode 100644 index 000000000..bc5b9463e --- /dev/null +++ b/hud/agents/tests/test_cli_agent.py @@ -0,0 +1,263 @@ +"""CLIAgent remote-command construction over workspace SSH.""" +# pyright: reportPrivateUsage=false + +from __future__ import annotations + +from types import SimpleNamespace +from typing import TYPE_CHECKING, Any, cast + +from hud.agents import create_agent +from hud.agents.cli import CodexAgent, GrokBuildAgent, OpenCodeAgent, Terminus2Agent +from hud.agents.cli.agent import ( + TERMINUS2_RESULT_PATH, + TERMINUS2_SCRIPT_PATH, + CLIAgent, + build_remote_invocation, +) +from hud.agents.types import ( + AiderConfig, + CLIConfig, + CodexConfig, + GrokBuildConfig, + MiniSweAgentConfig, + OpenCodeConfig, + Terminus2Config, +) +from hud.settings import settings + +if TYPE_CHECKING: + import pytest + + +class _FakeFile: + def __init__(self, name: str, mode: str, sink: dict[str, bytes]) -> None: + self._name = name + self._mode = mode + self._sink = sink + + async def __aenter__(self) -> _FakeFile: + return self + + async def __aexit__(self, *exc: Any) -> None: + return None + + async def write(self, data: bytes) -> None: + self._sink[self._name] = self._sink.get(self._name, b"") + data + + async def read(self) -> bytes: + if "r" not in self._mode: + raise OSError("not opened for reading") + return self._sink[self._name] + + +class _FakeSFTP: + def __init__(self, sink: dict[str, bytes]) -> None: + self._sink = sink + + async def __aenter__(self) -> _FakeSFTP: + return self + + async def __aexit__(self, *exc: Any) -> None: + return None + + def open(self, name: str, mode: str) -> _FakeFile: + if "r" in mode and name not in self._sink: + raise OSError(name) + return _FakeFile(name, mode, self._sink) + + +class _FakeConn: + def __init__(self, sink: dict[str, bytes], 
result: Any) -> None: + self._sink = sink + self._result = result + self.ran: list[str] = [] + + def start_sftp_client(self) -> _FakeSFTP: + return _FakeSFTP(self._sink) + + async def run(self, cmd: str, *, check: bool = True) -> Any: + self.ran.append(cmd) + return self._result + + +def _fake_run() -> Any: + trace = SimpleNamespace(status="", content="", extra={}) + steps: list[Any] = [] + return SimpleNamespace(trace=trace, record=steps.append, steps=steps) + + +def _agent_with_conn(config: CLIConfig, shell: str, conn: _FakeConn) -> CLIAgent: + agent = CLIAgent(config) + agent._ssh = cast("Any", SimpleNamespace(conn=conn)) + agent._shell = shell + return agent + + +def test_windows_shell_runs_batch_file_via_cmd() -> None: + inv = build_remote_invocation("powershell", "mini --task x") + + assert inv.command == "cmd /c .hud_run.bat" + assert inv.script_name == ".hud_run.bat" + assert inv.script_body == "@echo off\r\nmini --task x\r\n" + + +def test_presets_build_documented_non_interactive_commands() -> None: + opencode = CLIAgent(OpenCodeConfig())._build_cli_command( + prompt="fix the repo", + mcp_config_path=None, + ) + codex = CLIAgent(CodexConfig())._build_cli_command( + prompt="fix the repo", + mcp_config_path=None, + ) + aider = CLIAgent(AiderConfig())._build_cli_command( + prompt="fix the repo", + mcp_config_path=None, + ) + grok = CLIAgent(GrokBuildConfig())._build_cli_command( + prompt="fix the repo", + mcp_config_path=None, + ) + mini = CLIAgent(MiniSweAgentConfig())._build_cli_command( + prompt="fix the repo", + mcp_config_path=None, + ) + terminus = CLIAgent(Terminus2Config())._build_cli_command( + prompt="fix the repo", + mcp_config_path=None, + ) + + assert "opencode run" in opencode + assert "--dangerously-skip-permissions" in opencode + assert "'fix the repo'" in opencode + assert "codex exec" in codex + assert "--sandbox workspace-write" in codex + assert "--skip-git-repo-check" in codex + assert "'fix the repo'" in codex + assert "aider --model" in 
aider + assert "--message-file .hud_prompt.txt" in aider + assert "grok -p 'fix the repo'" in grok + assert "-m grok-build-0.1" in grok + assert "--always-approve" in grok + assert "--max-turns" not in grok + assert "--output-format" not in grok + assert "mini --model" in mini + assert "--task 'fix the repo'" in mini + assert "--yolo" in mini + assert "uv --no-config run --no-project --quiet" in terminus + assert "--with harbor==0.6.6" in terminus + assert "python .hud_terminus2.py" in terminus + + +def test_create_agent_constructs_opencode_with_default_model() -> None: + agent = create_agent("opencode") + + assert isinstance(agent, OpenCodeAgent) + assert agent.config.model == "openai/gpt-5.4" + + +def test_create_agent_constructs_codex_with_default_model() -> None: + agent = create_agent("codex") + + assert isinstance(agent, CodexAgent) + assert agent.config.model == "gpt-5.4" + + +def test_create_agent_constructs_grok_build_with_default_model() -> None: + agent = create_agent("grok_build") + + assert isinstance(agent, GrokBuildAgent) + assert agent.config.model == "grok-build-0.1" + + +def test_grok_build_uses_isolated_home_with_xai_api_key( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(settings, "xai_api_key", "xai-test") + + env = GrokBuildAgent()._build_env_vars() + + assert env["XAI_API_KEY"] == "xai-test" + assert env["HOME"] == "/tmp/hud-grok-home" + + +def test_create_agent_constructs_terminus_2_with_default_model() -> None: + agent = create_agent("terminus_2") + + assert isinstance(agent, Terminus2Agent) + assert agent.config.model == "openai/gpt-5.4" + assert agent.config.result_file == TERMINUS2_RESULT_PATH + + +async def test_exec_writes_prompt_runs_command_and_records_stdout() -> None: + sink: dict[str, bytes] = {} + conn = _FakeConn(sink, SimpleNamespace(stdout="done\n", stderr="", exit_status=0)) + config = CLIConfig( + model="m", + model_name="TestCLI", + command="agent", + args=["--message-file", "{prompt_file}"], + 
use_hud_gateway=False, + ) + agent = _agent_with_conn(config, "bash", conn) + run = _fake_run() + + await agent._exec(run, prompt="build it") + + assert sink[".hud_prompt.txt"] == b"build it" + assert len(conn.ran) == 1 + assert conn.ran[0].endswith("agent --message-file .hud_prompt.txt") + assert run.trace.status == "completed" + assert run.trace.content == "done" + assert run.steps[0].content == "done" + + +async def test_terminus_2_writes_runner_script_before_command() -> None: + sink: dict[str, bytes] = {TERMINUS2_RESULT_PATH: b"terminus done\n"} + conn = _FakeConn(sink, SimpleNamespace(stdout="", stderr="", exit_status=0)) + agent = Terminus2Agent(Terminus2Config(use_hud_gateway=False)) + agent._ssh = cast("Any", SimpleNamespace(conn=conn)) + agent._shell = "bash" + run = _fake_run() + + await agent._exec(run, prompt="solve it") + + assert TERMINUS2_SCRIPT_PATH in sink + assert b"from harbor.agents.terminus_2 import Terminus2" in sink[TERMINUS2_SCRIPT_PATH] + assert run.trace.content == "terminus done" + + +async def test_exec_uses_result_file_when_configured() -> None: + sink: dict[str, bytes] = {"answer.txt": b"file answer\n"} + conn = _FakeConn(sink, SimpleNamespace(stdout="stdout answer\n", stderr="", exit_status=0)) + config = CLIConfig( + model="m", + command="agent", + result_file="answer.txt", + use_hud_gateway=False, + ) + agent = _agent_with_conn(config, "bash", conn) + run = _fake_run() + + await agent._exec(run, prompt="x") + + assert run.trace.content == "file answer" + + +async def test_exec_nonzero_records_system_error() -> None: + sink: dict[str, bytes] = {} + conn = _FakeConn(sink, SimpleNamespace(stdout="", stderr="boom", exit_status=2)) + config = CLIConfig( + model="m", + model_name="TestCLI", + command="agent", + use_hud_gateway=False, + ) + agent = _agent_with_conn(config, "bash", conn) + run = _fake_run() + + await agent._exec(run, prompt="x") + + assert run.trace.status == "error" + assert run.steps[0].error == "boom" + assert 
run.steps[1].error == "boom" diff --git a/hud/agents/types.py b/hud/agents/types.py index 3b5466ff1..ccb3df0c1 100644 --- a/hud/agents/types.py +++ b/hud/agents/types.py @@ -126,20 +126,44 @@ class OpenAIChatConfig(AgentConfig): completion_kwargs: dict[str, Any] = Field(default_factory=dict) +# CLI agents (CLI over SSH) # ----------------------------------------------------------------------------- -# Claude Code (CLI over SSH) + + +class CLIConfig(AgentConfig): + """Configuration for a non-interactive coding-agent CLI run over SSH. + + ``args`` accepts ``{prompt}``, ``{prompt_file}``, ``{model}``, ``{max_steps}``, and + ``{mcp_config}`` placeholders. The prompt is always written to + ``.hud_prompt.txt`` first so CLIs can use file-based input. + """ + + model_name: str = "CLI" + command: str = "" + args: list[str] = Field(default_factory=list[str]) + extra_env: dict[str, str] = Field(default_factory=dict) + use_hud_gateway: bool = True + mcp_config: bool = False + stdin: bool = False + result_file: str | None = None + install_check: str | None = None + + +# ----------------------------------------------------------------------------- +# Claude CLI (over SSH) # ----------------------------------------------------------------------------- -class ClaudeSDKConfig(AgentConfig): - """Configuration for ClaudeSDKAgent (runs the ``claude`` CLI over SSH). +class ClaudeCLIConfig(CLIConfig): + """Configuration for ClaudeCLIAgent (runs the ``claude`` CLI over SSH). ``system_prompt`` is inherited from ``AgentConfig``. ``max_steps`` maps to the CLI's ``--max-turns``; values <= 0 leave the turn budget to the CLI (unlimited). 
""" - model_name: str = "Claude Code" + model_name: str = "Claude CLI" model: str = Field(default="claude-sonnet-4-5", validation_alias=_model_alias) + command: str = "claude" permission_mode: str = "bypassPermissions" max_steps: int = -1 allowed_tools: list[str] = Field( @@ -156,6 +180,125 @@ class ClaudeSDKConfig(AgentConfig): ) +class OpenCodeConfig(CLIConfig): + """Configuration for OpenCode's non-interactive ``opencode run`` mode.""" + + model_name: str = "OpenCode" + model: str = Field(default="openai/gpt-5.4", validation_alias=_model_alias) + command: str = "opencode" + args: list[str] = Field( + default_factory=lambda: [ + "run", + "--format", + "json", + "--model", + "{model}", + "--dangerously-skip-permissions", + "{prompt}", + ], + ) + + +class CodexConfig(CLIConfig): + """Configuration for Codex CLI's non-interactive ``codex exec`` mode.""" + + model_name: str = "Codex CLI" + model: str = Field(default="gpt-5.4", validation_alias=_model_alias) + command: str = "codex" + use_hud_gateway: bool = False + args: list[str] = Field( + default_factory=lambda: [ + "exec", + "--model", + "{model}", + "--sandbox", + "workspace-write", + "--skip-git-repo-check", + "{prompt}", + ], + ) + + +class GrokBuildConfig(CLIConfig): + """Configuration for Grok Build's headless ``grok -p`` mode.""" + + model_name: str = "Grok Build" + model: str = Field(default="grok-build-0.1", validation_alias=_model_alias) + command: str = "grok" + use_hud_gateway: bool = False + install_check: str | None = "mkdir -p /tmp/hud-grok-home" + args: list[str] = Field( + default_factory=lambda: [ + "-p", + "{prompt}", + "-m", + "{model}", + "--always-approve", + ], + ) + + +class AiderConfig(CLIConfig): + """Configuration for Aider's one-shot ``--message-file`` mode.""" + + model_name: str = "Aider" + model: str = Field(default="openai/gpt-5.4", validation_alias=_model_alias) + command: str = "aider" + args: list[str] = Field( + default_factory=lambda: [ + "--model", + "{model}", + 
"--yes-always", + "--no-auto-commits", + "--message-file", + "{prompt_file}", + ], + ) + + +class Terminus2Config(CLIConfig): + """Configuration for Harbor Terminus-2 run via ``uv --with harbor``.""" + + model_name: str = "Terminus-2" + model: str = Field(default="openai/gpt-5.4", validation_alias=_model_alias) + command: str = "uv" + use_hud_gateway: bool = False + args: list[str] = Field( + default_factory=lambda: [ + "--no-config", + "run", + "--no-project", + "--quiet", + "--python", + "3.12", + "--with", + "harbor==0.6.6", + "python", + ".hud_terminus2.py", + ], + ) + max_steps: int = 10 + result_file: str | None = ".hud_terminus2_logs/trajectory.json" + + +class MiniSweAgentConfig(CLIConfig): + """Configuration for mini-SWE-agent's local ``mini`` CLI.""" + + model_name: str = "mini-SWE-agent" + model: str = Field(default="openai/gpt-5.4", validation_alias=_model_alias) + command: str = "mini" + args: list[str] = Field( + default_factory=lambda: [ + "--model", + "{model}", + "--task", + "{prompt}", + "--yolo", + "--exit-immediately", + ], + ) + + # ----------------------------------------------------------------------------- # Browser Use # ----------------------------------------------------------------------------- diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 33c6f941e..9058d0e03 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -117,6 +117,12 @@ class AgentPreset: AgentPreset("Claude Sonnet 4.6", AgentType.CLAUDE, "claude-sonnet-4-6"), AgentPreset("GPT-5.4", AgentType.OPENAI, "gpt-5.4"), AgentPreset("Gemini 3.1 Pro (Preview)", AgentType.GEMINI, "gemini-3-1-pro"), + AgentPreset("Codex CLI", AgentType.CODEX, "gpt-5.4"), + AgentPreset("OpenCode", AgentType.OPENCODE, "openai/gpt-5.4"), + AgentPreset("Aider", AgentType.AIDER, "openai/gpt-5.4"), + AgentPreset("Grok Build", AgentType.GROK_BUILD, "grok-build-0.1"), + AgentPreset("mini-SWE-agent", AgentType.MINI_SWE_AGENT, "openai/gpt-5.4"), + AgentPreset("Terminus-2", AgentType.TERMINUS_2, 
"openai/gpt-5.4"), AgentPreset( "Grok 4-1 Fast (xAI)", AgentType.OPENAI_COMPATIBLE, @@ -170,6 +176,30 @@ class AgentPreset: [openai_compatible] # base_url = "http://localhost:8000/v1" # model = "my-model" + +[codex] +# model = "gpt-5.4" + +[opencode] +# model = "openai/gpt-5.4" + +[aider] +# model = "openai/gpt-5.4" + +[grok_build] +# model = "grok-build-0.1" + +[mini_swe_agent] +# model = "openai/gpt-5.4" + +[terminus_2] +# model = "openai/gpt-5.4" +# max_steps = 10 + +[cli] +# command = "my-agent" +# args = ["--message-file", "{prompt_file}"] +# model = "my-model" """ # Agent type -> (settings attr, env var name) @@ -736,7 +766,7 @@ def eval_command( source: str | None = typer.Argument(None, help="Taskset slug or task JSON file"), agent: str | None = typer.Argument( None, - help="Model name (e.g. claude-sonnet-4-6) or agent type (claude, openai, gemini, openai_compatible)", # noqa: E501 + help="Model name (e.g. claude-sonnet-4-6) or agent type (claude, openai, gemini, openai_compatible, codex, opencode, aider, grok_build, mini_swe_agent, terminus_2, cli)", # noqa: E501 ), all: bool = typer.Option(False, "--all", help="Run all problems instead of just 1"), full: bool = typer.Option( diff --git a/hud/cli/tests/test_eval_config.py b/hud/cli/tests/test_eval_config.py index 9a130fee4..ce732f97b 100644 --- a/hud/cli/tests/test_eval_config.py +++ b/hud/cli/tests/test_eval_config.py @@ -32,6 +32,24 @@ def test_parse_agent_type_accepts_known_value() -> None: assert cfg.agent_type.value == "openai" +@pytest.mark.parametrize( + "agent", + [ + "codex", + "opencode", + "aider", + "grok_build", + "mini_swe_agent", + "terminus_2", + "cli", + ], +) +def test_parse_agent_type_accepts_cli_agents(agent: str) -> None: + cfg = EvalConfig(agent_type=agent) + assert cfg.agent_type is not None + assert cfg.agent_type.value == agent + + def test_parse_agent_type_rejects_unknown() -> None: with pytest.raises(ValueError, match="Invalid agent"): EvalConfig(agent_type="not-an-agent") @@ 
-96,13 +114,18 @@ def test_load_missing_writes_template(tmp_path: Path) -> None: def test_load_parses_sections(tmp_path: Path) -> None: path = tmp_path / ".hud_eval.toml" path.write_text( - '[eval]\nagent = "openai"\nmax_steps = 5\n\n[openai]\nmodel = "gpt-4o"\n', + '[eval]\nagent = "openai"\nmax_steps = 5\n\n[openai]\nmodel = "gpt-4o"\n\n' + '[codex]\nmodel = "gpt-5"\n\n[opencode]\nmodel = "openai/gpt-5"\n\n' + '[grok_build]\nmodel = "grok-build-0.1"\n', encoding="utf-8", ) cfg = EvalConfig.load(str(path)) assert cfg.agent_type is not None and cfg.agent_type.value == "openai" assert cfg.max_steps == 5 assert cfg.agent_config["openai"]["model"] == "gpt-4o" + assert cfg.agent_config["codex"]["model"] == "gpt-5" + assert cfg.agent_config["opencode"]["model"] == "openai/gpt-5" + assert cfg.agent_config["grok_build"]["model"] == "grok-build-0.1" def test_load_resolves_env_var_placeholders( @@ -125,6 +148,12 @@ def test_merge_cli_overrides_fields() -> None: assert merged.max_steps == 7 +def test_merge_cli_accepts_cli_agent() -> None: + merged = EvalConfig().merge_cli(agent="codex", model="gpt-5") + assert merged.agent_type is not None and merged.agent_type.value == "codex" + assert merged.model == "gpt-5" + + def test_merge_cli_namespaced_config() -> None: merged = EvalConfig().merge_cli(config=["claude.max_tokens=100"]) assert merged.agent_config["claude"]["max_tokens"] == 100 @@ -150,3 +179,13 @@ def test_eval_max_steps_lands_in_agent_config() -> None: ) agent = eval_mod._build_agent(cfg) assert agent.config.max_steps == 17 + + +def test_build_agent_constructs_cli_agent() -> None: + from hud.agents.cli import CodexAgent + + cfg = EvalConfig(agent_type="codex", model="gpt-5") + agent = eval_mod._build_agent(cfg) + + assert isinstance(agent, CodexAgent) + assert agent.config.model == "gpt-5" diff --git a/hud/settings.py b/hud/settings.py index bf9552576..07e1e2b21 100644 --- a/hud/settings.py +++ b/hud/settings.py @@ -122,6 +122,12 @@ def settings_customise_sources( 
validation_alias="OPENROUTER_API_KEY", ) + xai_api_key: str | None = Field( + default=None, + description="API key for xAI models and Grok Build", + validation_alias="XAI_API_KEY", + ) + wandb_api_key: str | None = Field( default=None, description="API key for Weights & Biases", diff --git a/hud/types.py b/hud/types.py index b378a113c..f2e511cd6 100644 --- a/hud/types.py +++ b/hud/types.py @@ -45,14 +45,57 @@ from collections.abc import Callable from hud.agents.claude import ClaudeAgent + from hud.agents.cli import ( + AiderAgent, + CLIAgent, + CodexAgent, + GrokBuildAgent, + MiniSweAgent, + OpenCodeAgent, + Terminus2Agent, + ) from hud.agents.gemini import GeminiAgent from hud.agents.openai import OpenAIAgent from hud.agents.openai_compatible import OpenAIChatAgent - from hud.agents.types import ClaudeConfig, GeminiConfig, OpenAIChatConfig, OpenAIConfig - - AgentClass: TypeAlias = type[ClaudeAgent | GeminiAgent | OpenAIAgent | OpenAIChatAgent] + from hud.agents.types import ( + AiderConfig, + ClaudeConfig, + CLIConfig, + CodexConfig, + GeminiConfig, + GrokBuildConfig, + MiniSweAgentConfig, + OpenAIChatConfig, + OpenAIConfig, + OpenCodeConfig, + Terminus2Config, + ) + + AgentClass: TypeAlias = type[ + AiderAgent + | ClaudeAgent + | CLIAgent + | CodexAgent + | GeminiAgent + | GrokBuildAgent + | MiniSweAgent + | OpenAIAgent + | OpenAIChatAgent + | OpenCodeAgent + | Terminus2Agent + ] AgentConfigClass: TypeAlias = type[ - ClaudeConfig | GeminiConfig | OpenAIConfig | OpenAIChatConfig + AiderConfig + | ClaudeConfig + | CLIConfig + | CodexConfig + | GeminiConfig + | GrokBuildConfig + | MiniSweAgentConfig + | OpenAIChatConfig + | OpenAIConfig + | OpenCodeConfig + | Terminus2Config ] T = TypeVar("T") @@ -63,6 +106,13 @@ class AgentType(str, Enum): OPENAI = "openai" GEMINI = "gemini" OPENAI_COMPATIBLE = "openai_compatible" + CLI = "cli" + CODEX = "codex" + OPENCODE = "opencode" + AIDER = "aider" + GROK_BUILD = "grok_build" + MINI_SWE_AGENT = "mini_swe_agent" + TERMINUS_2 = 
"terminus_2" @property def cls(self) -> AgentClass: @@ -83,11 +133,51 @@ def cls(self) -> AgentClass: from hud.agents import OpenAIChatAgent return OpenAIChatAgent + case AgentType.CLI: + from hud.agents import CLIAgent + + return CLIAgent + case AgentType.CODEX: + from hud.agents import CodexAgent + + return CodexAgent + case AgentType.OPENCODE: + from hud.agents import OpenCodeAgent + + return OpenCodeAgent + case AgentType.AIDER: + from hud.agents import AiderAgent + + return AiderAgent + case AgentType.GROK_BUILD: + from hud.agents import GrokBuildAgent + + return GrokBuildAgent + case AgentType.MINI_SWE_AGENT: + from hud.agents import MiniSweAgent + + return MiniSweAgent + case AgentType.TERMINUS_2: + from hud.agents import Terminus2Agent + + return Terminus2Agent @property def config_cls(self) -> AgentConfigClass: """Get config class without importing agent (avoids SDK dependency).""" - from hud.agents.types import ClaudeConfig, GeminiConfig, OpenAIChatConfig, OpenAIConfig + from hud.agents.types import ( + AiderConfig, + ClaudeConfig, + CLIConfig, + CodexConfig, + GeminiConfig, + GrokBuildConfig, + MiniSweAgentConfig, + OpenAIChatConfig, + OpenAIConfig, + OpenCodeConfig, + Terminus2Config, + ) match self: case AgentType.CLAUDE: @@ -98,6 +188,20 @@ def config_cls(self) -> AgentConfigClass: return GeminiConfig case AgentType.OPENAI_COMPATIBLE: return OpenAIChatConfig + case AgentType.CLI: + return CLIConfig + case AgentType.CODEX: + return CodexConfig + case AgentType.OPENCODE: + return OpenCodeConfig + case AgentType.AIDER: + return AiderConfig + case AgentType.GROK_BUILD: + return GrokBuildConfig + case AgentType.MINI_SWE_AGENT: + return MiniSweAgentConfig + case AgentType.TERMINUS_2: + return Terminus2Config @property def gateway_provider(self) -> str: @@ -111,6 +215,17 @@ def gateway_provider(self) -> str: return "gemini" case AgentType.OPENAI_COMPATIBLE: return "openai" + case ( + AgentType.CLI + | AgentType.CODEX + | AgentType.OPENCODE + | 
AgentType.AIDER + | AgentType.GROK_BUILD + | AgentType.TERMINUS_2 + ): + return "openai" + case AgentType.MINI_SWE_AGENT: + return "openai" @classmethod def of(cls, agent: object) -> AgentType | None: