Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 18 additions & 10 deletions docs/v6/advanced/harbor-convert.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@ task dirs - is a *frontend* that loads into the same primitives (`Environment`,
`Task`, `Taskset`). Integrations are **loaders, not converters**: no codegen
roundtrip to run foreign tasks. The Harbor integration lives in the SDK repo at
[`integrations/harbor.py`](https://github.com/hud-evals/hud-python/blob/main/integrations/harbor.py)
- a recipe built only on the public SDK surface; copy it into your project or
run it from a checkout.
- a public-surface loader that maps Harbor folders into SDK primitives. The
included `HarborRuntime` is maintained with the SDK for local Docker execution;
copy the loader into your project or run it from a checkout.

## Prerequisites

Expand All @@ -26,22 +27,29 @@ directly - one row per task dir (`id` = the dir name), sharing one declarative
```python
from integrations.harbor import detect, load

assert detect("./terminal-bench")
taskset = load("./terminal-bench")
assert detect("./harbor_tasks")
taskset = load("./harbor_tasks")

for task in taskset:
print(task.env, task.id)
```

Like every task row, the result carries no placement. Run it by supplying one -
today that means a substrate already serving the control channel
(`runtime=Runtime(url)`); a docker provider that builds and runs each task's
`environment/` image is the planned follow-up:
Like every task row, the result carries no placement. Run it by supplying one.
For local Docker-backed Harbor execution, use `HarborRuntime`; it builds the
task's `environment/` image, runs a fresh container, exposes the workspace
through HUD's normal shell capability, and grades by running `tests/test.sh`:

```python
from hud import Runtime
from integrations.harbor import HarborRuntime

job = await taskset.run(agent, runtime=Runtime("tcp://127.0.0.1:8765"))
job = await taskset.run(agent, runtime=HarborRuntime("./harbor_tasks"))
```

The `hud eval` CLI can run local Harbor task directories and datasets when you
opt into the Harbor source format with `--format harbor`:

```bash
hud eval ./harbor_tasks claude --format harbor --task-ids cancel-async-tasks --max-steps 30
```

## Export HUD tasks to Harbor
Expand Down
5 changes: 4 additions & 1 deletion docs/v6/reference/cli.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ For a platform taskset, pass its name or id directly: `hud eval "My Tasks" claud
| `--config`, `-c` | Agent config `key=value` (repeatable). |
| `--verbose`, `-v` | Show agent logs (step progress, tool calls) for batch runs too. |
| `--very-verbose`, `-vv` | Debug-level logs. |
| `--format` | Task source format: `hud` (default) or `harbor`. |
| `--runtime` | Placement: `local`, `hud` (HUD runtime tunnel), or `tcp://host:port`. Defaults to `local` for a tasks file; platform tasksets default to remote hosted execution. |
| `--remote` | Run the whole rollout remotely on the HUD platform. |
| `--yes`, `-y` | Skip confirmation prompt. |
Expand Down Expand Up @@ -133,7 +134,9 @@ hud sync env # sync environment metadata
```

External benchmark formats (currently Harbor) load directly into the runtime
as `Taskset`s - no conversion step. See [Harbor interop](/v6/advanced/harbor-convert).
as `Taskset`s - no conversion step. For local Harbor directories, opt in with
`--format harbor` so the CLI uses the Harbor loader and Docker-backed runtime
provider. See [Harbor interop](/v6/advanced/harbor-convert).

## Inspect

Expand Down
70 changes: 67 additions & 3 deletions hud/cli/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def _resolve_model_from_catalog(model_id: str) -> tuple[AgentType, str] | None:

_CONFIG_PATH = ".hud_eval.toml"
_PLACEMENT_CONFLICT_ERROR = "--runtime and --remote are mutually exclusive placement options"
_SOURCE_FORMATS = ("hud", "harbor")


def _resolve_env_vars(obj: Any) -> Any:
Expand Down Expand Up @@ -167,6 +168,7 @@ class AgentPreset:
# very_verbose = true
# auto_respond = true
# gateway = false # Route LLM API calls through HUD Gateway
# format = "hud" # hud or harbor
# runtime = "local" # local, hud, or tcp://host:port
# remote = false # Run the whole rollout remotely on HUD

Expand Down Expand Up @@ -264,6 +266,7 @@ class EvalConfig(BaseModel):
"group_size",
"auto_respond",
"gateway",
"format",
"runtime",
"remote",
}
Expand All @@ -279,6 +282,9 @@ class EvalConfig(BaseModel):
auto_respond: bool | None = None
group_size: int = 1
gateway: bool = False
#: Source format. ``None``/``hud`` means normal HUD task source loading;
#: ``harbor`` opts into the Harbor integration loader/runtime.
format: str | None = None
#: Placement: "local" (spawn each row's env from the source), "hud"
#: (HUD runtime tunnel), or a tcp:// url of an already-served env.
#: ``None`` means "infer from the source": a local file runs locally, a
Expand Down Expand Up @@ -306,6 +312,20 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None:
) from None
return v

@field_validator("format", mode="before")
@classmethod
def _parse_format(cls, v: Any) -> str | None:
    """Normalize the raw ``format`` value before field validation.

    String input is stripped and lower-cased. An empty string or ``"hud"``
    collapses to ``None``; any other name listed in ``_SOURCE_FORMATS`` is
    returned in normalized form. Non-string input (including ``None``) is
    handed back untouched.

    Raises:
        ValueError: the string does not name a known source format.
    """
    if not isinstance(v, str):
        # Also covers None, which is returned as-is.
        return v
    name = v.strip().lower()
    if name in ("", "hud"):
        return None
    if name not in _SOURCE_FORMATS:
        raise ValueError(f"Invalid format: {v}. Must be one of: {', '.join(_SOURCE_FORMATS)}")
    return name

def source_is_local_file(self) -> bool:
"""Whether ``source`` points at an on-disk taskset (vs. a platform slug/id)."""
return self.source is not None and Path(self.source).exists()
Expand All @@ -319,6 +339,13 @@ def resolve_runtime(self) -> EvalConfig:
``--runtime`` is always honored, except ``local`` against a platform
taskset, which has no env to spawn.
"""
if self.format == "harbor":
if not self.source_is_local_file():
hud_console.error("--format harbor requires a local Harbor task directory")
raise typer.Exit(1)
if self.remote or (self.runtime is not None and self.runtime != "local"):
hud_console.error("--format harbor currently supports only local runtime placement")
raise typer.Exit(1)
if self.runtime is None:
if self.source_is_local_file():
return self.model_copy(update={"runtime": "local"})
Expand Down Expand Up @@ -502,6 +529,7 @@ def merge_cli(
gateway: bool = False,
config: list[str] | None = None,
task_ids: str | None = None,
format: str | None = None,
runtime: str | None = None,
remote: bool = False,
) -> EvalConfig:
Expand All @@ -517,6 +545,7 @@ def merge_cli(
"max_concurrent": max_concurrent,
"max_steps": max_steps,
"group_size": group_size,
"format": format,
"runtime": runtime,
}.items()
if value is not None
Expand Down Expand Up @@ -604,6 +633,8 @@ def display(self) -> None:
table.add_column("Value", style="green")

table.add_row("source", str(self.source or "-"))
if self.format:
table.add_row("format", self.format)
table.add_row("runtime", str(self.runtime or "-"))
table.add_row("agent", self.agent_type.value if self.agent_type else "-")
if self.task_ids:
Expand Down Expand Up @@ -728,6 +759,28 @@ def _spawn_target(source: Path) -> Path:
return resolved.parent


def _load_local_taskset(source_path: Path, source_format: str | None) -> Any:
from hud.eval import Taskset

format_name = source_format or "hud"
if format_name == "hud":
taskset = Taskset.from_file(source_path)
if len(taskset) == 0:
from integrations.harbor import detect

if detect(source_path):
hud_console.hint(
f"{source_path} looks like a Harbor task directory; "
"rerun with --format harbor to load it."
)
return taskset
if format_name == "harbor":
from integrations.harbor import load

return load(source_path)
raise ValueError(f"unsupported task source format: {format_name}")


def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any:
"""Map the config's ``runtime`` onto a placement for ``Taskset.run``.

Expand All @@ -744,6 +797,10 @@ def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any:
if cfg.runtime == "local":
if source_path is None:
raise ValueError("local placement requires a local source path")
if cfg.format == "harbor":
from integrations.harbor import HarborRuntime

return HarborRuntime(source_path)
return LocalRuntime(_spawn_target(source_path))
if cfg.runtime == "hud":
require_api_key("run HUD runtime tunnel evals")
Expand All @@ -767,18 +824,18 @@ async def _run_evaluation(cfg: EvalConfig) -> Any:
if cfg.source is None or cfg.agent_type is None:
raise ValueError("source and agent_type must be set")

from hud.eval import Taskset

source_path = Path(cfg.source)
is_local = source_path.exists()
if is_local:
hud_console.info(f"Loading tasks from: {cfg.source}")
try:
taskset = Taskset.from_file(source_path)
taskset = _load_local_taskset(source_path, cfg.format)
except Exception as e:
hud_console.error(f"Failed to load tasks from {cfg.source}: {e}")
raise typer.Exit(1) from e
else:
from hud.eval import Taskset

hud_console.info(f"Loading platform taskset: {cfg.source}")
try:
taskset = Taskset.from_api(cfg.source)
Expand Down Expand Up @@ -888,6 +945,11 @@ def eval_command(
gateway: bool = typer.Option(
False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
),
format: str | None = typer.Option(
None,
"--format",
help="Task source format: hud (default) or harbor.",
),
runtime: str | None = typer.Option(
None,
"--runtime",
Expand All @@ -908,6 +970,7 @@ def eval_command(
hud eval "My Tasks" claude-sonnet-4-6 --full # Platform taskset, run on the platform
hud eval tasks.json claude --config max_tokens=32768
hud eval tasks.json claude --gateway # Route LLM calls through HUD Gateway
hud eval ./harbor_tasks claude --format harbor # Run Harbor task dirs locally
hud eval tasks.json claude-sonnet-4-6 --runtime hud # Use HUD runtime tunnel
hud eval tasks.json claude-sonnet-4-6 --remote # Execute rollout remotely
"""
Expand Down Expand Up @@ -938,6 +1001,7 @@ def eval_command(
group_size=group_size,
config=config,
gateway=gateway,
format=format,
runtime=runtime,
remote=remote,
)
Expand Down
88 changes: 88 additions & 0 deletions hud/cli/tests/test_eval_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,23 @@
_ARN = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/anthropic.claude"


def _write_harbor_task(root: Path, name: str = "demo-task") -> Path:
task = root / name
(task / "environment").mkdir(parents=True)
(task / "tests").mkdir()
(task / "instruction.md").write_text("Fix the demo task.\n", encoding="utf-8")
(task / "task.toml").write_text(
'schema_version = "1.3"\n\n[task]\nname = "demo/demo-task"\n',
encoding="utf-8",
)
(task / "environment" / "Dockerfile").write_text("FROM python:3.12-slim\n", encoding="utf-8")
(task / "tests" / "test.sh").write_text(
"#!/usr/bin/env bash\nmkdir -p /logs/verifier\necho 1 > /logs/verifier/reward.txt\n",
encoding="utf-8",
)
return task


def test_is_bedrock_arn() -> None:
assert _is_bedrock_arn(_ARN) is True
assert _is_bedrock_arn("claude-sonnet-4-6") is False
Expand Down Expand Up @@ -136,6 +153,77 @@ def test_resolve_placement_runtime_hud_uses_tunnel(
assert isinstance(placement, HUDRuntime)


def test_load_local_taskset_uses_hud_loader_by_default(tmp_path: Path) -> None:
    """With no format given, a Harbor-shaped directory loads as an empty taskset."""
    _write_harbor_task(tmp_path)

    loaded = eval_mod._load_local_taskset(tmp_path, None)

    assert len(loaded) == 0


def test_load_local_taskset_hints_harbor_format_on_zero_task_harbor_dir(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An empty default load of a Harbor directory emits a ``--format harbor`` hint."""
    _write_harbor_task(tmp_path)
    captured: list[str] = []

    def _record(message, **_):
        # Stand-in for hud_console.hint that keeps only the message text.
        captured.append(message)

    monkeypatch.setattr(eval_mod.hud_console, "hint", _record)

    loaded = eval_mod._load_local_taskset(tmp_path, None)

    assert len(loaded) == 0
    assert any("--format harbor" in text for text in captured)


def test_load_local_taskset_rejects_unknown_format(tmp_path: Path) -> None:
    """An unrecognized format name is rejected with ``ValueError``."""
    expected_message = "unsupported task source format"
    with pytest.raises(ValueError, match=expected_message):
        eval_mod._load_local_taskset(tmp_path, "unknown")


def test_load_local_taskset_uses_harbor_loader_when_format_is_harbor(tmp_path: Path) -> None:
    """With ``format="harbor"`` the task directory loads as one row keyed by its name."""
    _write_harbor_task(tmp_path)

    loaded = eval_mod._load_local_taskset(tmp_path, "harbor")

    assert len(loaded) == 1
    row = loaded["demo-task"]
    assert row.id == "demo-task"


def test_resolve_placement_local_harbor_format_uses_harbor_runtime(tmp_path: Path) -> None:
    """Local placement with the Harbor format resolves to ``HarborRuntime``."""
    from integrations.harbor import HarborRuntime

    _write_harbor_task(tmp_path)
    cfg = EvalConfig(runtime="local", format="harbor")

    resolved = eval_mod._resolve_placement(cfg, tmp_path)

    assert isinstance(resolved, HarborRuntime)


def test_resolve_placement_local_hud_format_uses_local_runtime(tmp_path: Path) -> None:
    """Local placement without a format resolves to ``LocalRuntime``."""
    from hud.eval import LocalRuntime

    _write_harbor_task(tmp_path)
    cfg = EvalConfig(runtime="local")

    resolved = eval_mod._resolve_placement(cfg, tmp_path)

    assert isinstance(resolved, LocalRuntime)


def test_harbor_format_rejects_nonlocal_source() -> None:
    """The Harbor format combined with a non-local source exits via ``typer.Exit``."""
    with pytest.raises(typer.Exit):
        cfg = EvalConfig(source="platform/taskset", format="harbor")
        cfg.resolve_runtime()


def test_harbor_format_rejects_nonlocal_runtime(tmp_path: Path) -> None:
    """The Harbor format combined with a non-local runtime exits via ``typer.Exit``."""
    _write_harbor_task(tmp_path)

    with pytest.raises(typer.Exit):
        cfg = EvalConfig(source=str(tmp_path), format="harbor", runtime="hud")
        cfg.resolve_runtime()


def test_resolve_placement_remote_uses_hosted_runtime(
tmp_path: Path,
monkeypatch: pytest.MonkeyPatch,
Expand Down
11 changes: 6 additions & 5 deletions integrations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
primitives. Integrations are **loaders, not converters**: no codegen roundtrip
to run foreign tasks.

This package lives outside ``hud`` on purpose: each module is a recipe built
**only on the public SDK surface** (``Environment``, ``Task``,
``Taskset``, ``Runtime``) — that constraint is the proof the core is
flexible. Copy a module into your project or run it from a checkout; nothing
in the SDK or CLI imports it.
This package lives outside ``hud`` on purpose: loaders are recipes built on the
public SDK surface (``Environment``, ``Task``, ``Taskset``, ``Runtime``). Copy a
loader into your project or run it from a checkout. The CLI imports selected
integrations only when explicitly requested (for example, ``--format harbor``).
A repo-maintained integration may also ship a local runtime provider for that
explicit CLI path; the provider is SDK implementation code and is not part of
the portable loader contract.

The contract: an integration module exposes ``detect(path) -> bool`` and
``load(path) -> Taskset``. Placement stays an execution-time concern — loaders
Expand Down
Loading
Loading