hud-evals · nancyjlau · Jun 29, 2026 · Jul 1, 2026 · Jul 3, 2026
diff --git a/docs/v6/advanced/harbor-convert.mdx b/docs/v6/advanced/harbor-convert.mdx
@@ -9,8 +9,9 @@ task dirs - is a *frontend* that loads into the same primitives (`Environment`,
 `Task`, `Taskset`). Integrations are **loaders, not converters**: no codegen
 roundtrip to run foreign tasks. The Harbor integration lives in the SDK repo at
 [`integrations/harbor.py`](https://github.com/hud-evals/hud-python/blob/main/integrations/harbor.py)
-- a recipe built only on the public SDK surface; copy it into your project or
-run it from a checkout.
+- a loader built on the public SDK surface that maps Harbor folders into SDK
+primitives. The included `HarborRuntime` is maintained with the SDK for local
+Docker execution; copy the loader into your project or run it from a checkout.
 
 ## Prerequisites
 
@@ -26,22 +27,29 @@ directly - one row per task dir (`id` = the dir name), sharing one declarative
 ```python
 from integrations.harbor import detect, load
 
-assert detect("./terminal-bench")
-taskset = load("./terminal-bench")
+assert detect("./harbor_tasks")
+taskset = load("./harbor_tasks")
 
 for task in taskset:
     print(task.env, task.id)
 ```
 
-Like every task row, the result carries no placement. Run it by supplying one -
-today that means a substrate already serving the control channel
-(`runtime=Runtime(url)`); a docker provider that builds and runs each task's
-`environment/` image is the planned follow-up:
+Like every task row, the result carries no placement. Run it by supplying one.
+For local Docker-backed Harbor execution, use `HarborRuntime`; it builds the
+task's `environment/` image, runs a fresh container, exposes the workspace
+through HUD's normal shell capability, and grades by running `tests/test.sh`:
 
 ```python
-from hud import Runtime
+from integrations.harbor import HarborRuntime
 
-job = await taskset.run(agent, runtime=Runtime("tcp://127.0.0.1:8765"))
+job = await taskset.run(agent, runtime=HarborRuntime("./harbor_tasks"))
+```
+
+The eval CLI can run local Harbor task directories and datasets when you opt
+into the Harbor source format with `--format harbor`:
+
+```bash
+hud eval ./harbor_tasks claude --format harbor --task-ids cancel-async-tasks --max-steps 30
 ```
 
 ## Export HUD tasks to Harbor

diff --git a/docs/v6/reference/cli.mdx b/docs/v6/reference/cli.mdx
@@ -105,6 +105,7 @@ For a platform taskset, pass its name or id directly: `hud eval "My Tasks" claud
 | `--config`, `-c` | Agent config `key=value` (repeatable). |
 | `--verbose`, `-v` | Show agent logs (step progress, tool calls) for batch runs too. |
 | `--very-verbose`, `-vv` | Debug-level logs. |
+| `--format` | Task source format: `hud` (default) or `harbor`. |
 | `--runtime` | Placement: `local`, `hud` (HUD runtime tunnel), or `tcp://host:port`. Defaults to `local` for a tasks file; platform tasksets default to remote hosted execution. |
 | `--remote` | Run the whole rollout remotely on the HUD platform. |
 | `--yes`, `-y` | Skip confirmation prompt. |
@@ -133,7 +134,9 @@ hud sync env                   # sync environment metadata
 ```
 
 External benchmark formats (currently Harbor) load directly into the runtime
-as `Taskset`s - no conversion step. See [Harbor interop](/v6/advanced/harbor-convert).
+as `Taskset`s - no conversion step. For local Harbor directories, opt in with
+`--format harbor` so the CLI uses the Harbor loader and Docker-backed runtime
+provider. See [Harbor interop](/v6/advanced/harbor-convert).
 
 ## Inspect
 

diff --git a/hud/cli/eval.py b/hud/cli/eval.py
@@ -63,6 +63,7 @@ def _resolve_model_from_catalog(model_id: str) -> tuple[AgentType, str] | None:
 
 _CONFIG_PATH = ".hud_eval.toml"
 _PLACEMENT_CONFLICT_ERROR = "--runtime and --remote are mutually exclusive placement options"
+_SOURCE_FORMATS = ("hud", "harbor")
 
 
 def _resolve_env_vars(obj: Any) -> Any:
@@ -167,6 +168,7 @@ class AgentPreset:
 # very_verbose = true
 # auto_respond = true
 # gateway = false  # Route LLM API calls through HUD Gateway
+# format = "hud"  # hud or harbor
 # runtime = "local"  # local, hud, or tcp://host:port
 # remote = false  # Run the whole rollout remotely on HUD
 
@@ -264,6 +266,7 @@ class EvalConfig(BaseModel):
         "group_size",
         "auto_respond",
         "gateway",
+        "format",
         "runtime",
         "remote",
     }
@@ -279,6 +282,9 @@ class EvalConfig(BaseModel):
     auto_respond: bool | None = None
     group_size: int = 1
     gateway: bool = False
+    #: Source format. ``None`` (what ``hud`` normalizes to) means normal HUD task
+    #: source loading; ``harbor`` opts into the Harbor integration loader/runtime.
+    format: str | None = None
     #: Placement: "local" (spawn each row's env from the source), "hud"
     #: (HUD runtime tunnel), or a tcp:// url of an already-served env.
     #: ``None`` means "infer from the source": a local file runs locally, a
@@ -306,6 +312,20 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None:
                 ) from None
         return v
 
+    @field_validator("format", mode="before")
+    @classmethod
+    def _parse_format(cls, v: Any) -> str | None:
+        if v is None:
+            return None
+        if not isinstance(v, str):
+            return v
+        normalized = v.strip().lower()
+        if normalized in ("", "hud"):
+            return None
+        if normalized in _SOURCE_FORMATS:
+            return normalized
+        raise ValueError(f"Invalid format: {v}. Must be one of: {', '.join(_SOURCE_FORMATS)}")
+
     def source_is_local_file(self) -> bool:
         """Whether ``source`` points at an on-disk taskset (vs. a platform slug/id)."""
         return self.source is not None and Path(self.source).exists()
@@ -319,6 +339,13 @@ def resolve_runtime(self) -> EvalConfig:
         ``--runtime`` is always honored, except ``local`` against a platform
         taskset, which has no env to spawn.
         """
+        if self.format == "harbor":
+            if not self.source_is_local_file():
+                hud_console.error("--format harbor requires a local Harbor task directory")
+                raise typer.Exit(1)
+            if self.remote or (self.runtime is not None and self.runtime != "local"):
+                hud_console.error("--format harbor currently supports only local runtime placement")
+                raise typer.Exit(1)
         if self.runtime is None:
             if self.source_is_local_file():
                 return self.model_copy(update={"runtime": "local"})
@@ -502,6 +529,7 @@ def merge_cli(
         gateway: bool = False,
         config: list[str] | None = None,
         task_ids: str | None = None,
+        format: str | None = None,
         runtime: str | None = None,
         remote: bool = False,
     ) -> EvalConfig:
@@ -517,6 +545,7 @@ def merge_cli(
                 "max_concurrent": max_concurrent,
                 "max_steps": max_steps,
                 "group_size": group_size,
+                "format": format,
                 "runtime": runtime,
             }.items()
             if value is not None
@@ -604,6 +633,8 @@ def display(self) -> None:
         table.add_column("Value", style="green")
 
         table.add_row("source", str(self.source or "-"))
+        if self.format:
+            table.add_row("format", self.format)
         table.add_row("runtime", str(self.runtime or "-"))
         table.add_row("agent", self.agent_type.value if self.agent_type else "-")
         if self.task_ids:
@@ -728,6 +759,28 @@ def _spawn_target(source: Path) -> Path:
     return resolved.parent
 
 
+def _load_local_taskset(source_path: Path, source_format: str | None) -> Any:
+    from hud.eval import Taskset
+
+    format_name = source_format or "hud"
+    if format_name == "hud":
+        taskset = Taskset.from_file(source_path)
+        if len(taskset) == 0:
+            from integrations.harbor import detect
+
+            if detect(source_path):
+                hud_console.hint(
+                    f"{source_path} looks like a Harbor task directory; "
+                    "rerun with --format harbor to load it."
+                )
+        return taskset
+    if format_name == "harbor":
+        from integrations.harbor import load
+
+        return load(source_path)
+    raise ValueError(f"unsupported task source format: {format_name}")
+
+
 def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any:
     """Map the config's ``runtime`` onto a placement for ``Taskset.run``.
 
@@ -744,6 +797,10 @@ def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any:
     if cfg.runtime == "local":
         if source_path is None:
             raise ValueError("local placement requires a local source path")
+        if cfg.format == "harbor":
+            from integrations.harbor import HarborRuntime
+
+            return HarborRuntime(source_path)
         return LocalRuntime(_spawn_target(source_path))
     if cfg.runtime == "hud":
         require_api_key("run HUD runtime tunnel evals")
@@ -767,18 +824,18 @@ async def _run_evaluation(cfg: EvalConfig) -> Any:
     if cfg.source is None or cfg.agent_type is None:
         raise ValueError("source and agent_type must be set")
 
-    from hud.eval import Taskset
-
     source_path = Path(cfg.source)
     is_local = source_path.exists()
     if is_local:
         hud_console.info(f"Loading tasks from: {cfg.source}")
         try:
-            taskset = Taskset.from_file(source_path)
+            taskset = _load_local_taskset(source_path, cfg.format)
         except Exception as e:
             hud_console.error(f"Failed to load tasks from {cfg.source}: {e}")
             raise typer.Exit(1) from e
     else:
+        from hud.eval import Taskset
+
         hud_console.info(f"Loading platform taskset: {cfg.source}")
         try:
             taskset = Taskset.from_api(cfg.source)
@@ -888,6 +945,11 @@ def eval_command(
     gateway: bool = typer.Option(
         False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
     ),
+    format: str | None = typer.Option(
+        None,
+        "--format",
+        help="Task source format: hud (default) or harbor.",
+    ),
     runtime: str | None = typer.Option(
         None,
         "--runtime",
@@ -908,6 +970,7 @@ def eval_command(
         hud eval "My Tasks" claude-sonnet-4-6 --full   # Platform taskset, run on the platform
         hud eval tasks.json claude --config max_tokens=32768
         hud eval tasks.json claude --gateway           # Route LLM calls through HUD Gateway
+        hud eval ./harbor_tasks claude --format harbor # Run Harbor task dirs locally
         hud eval tasks.json claude-sonnet-4-6 --runtime hud  # Use HUD runtime tunnel
         hud eval tasks.json claude-sonnet-4-6 --remote       # Execute rollout remotely
     """
@@ -938,6 +1001,7 @@ def eval_command(
             group_size=group_size,
             config=config,
             gateway=gateway,
+            format=format,
             runtime=runtime,
             remote=remote,
         )

diff --git a/hud/cli/tests/test_eval_config.py b/hud/cli/tests/test_eval_config.py
@@ -20,6 +20,23 @@
 _ARN = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/anthropic.claude"
 
 
+def _write_harbor_task(root: Path, name: str = "demo-task") -> Path:
+    task = root / name
+    (task / "environment").mkdir(parents=True)
+    (task / "tests").mkdir()
+    (task / "instruction.md").write_text("Fix the demo task.\n", encoding="utf-8")
+    (task / "task.toml").write_text(
+        'schema_version = "1.3"\n\n[task]\nname = "demo/demo-task"\n',
+        encoding="utf-8",
+    )
+    (task / "environment" / "Dockerfile").write_text("FROM python:3.12-slim\n", encoding="utf-8")
+    (task / "tests" / "test.sh").write_text(
+        "#!/usr/bin/env bash\nmkdir -p /logs/verifier\necho 1 > /logs/verifier/reward.txt\n",
+        encoding="utf-8",
+    )
+    return task
+
+
 def test_is_bedrock_arn() -> None:
     assert _is_bedrock_arn(_ARN) is True
     assert _is_bedrock_arn("claude-sonnet-4-6") is False
@@ -136,6 +153,77 @@ def test_resolve_placement_runtime_hud_uses_tunnel(
     assert isinstance(placement, HUDRuntime)
 
 
+def test_load_local_taskset_uses_hud_loader_by_default(tmp_path: Path) -> None:
+    _write_harbor_task(tmp_path)
+
+    taskset = eval_mod._load_local_taskset(tmp_path, None)
+
+    assert len(taskset) == 0
+
+
+def test_load_local_taskset_hints_harbor_format_on_zero_task_harbor_dir(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    _write_harbor_task(tmp_path)
+    hints: list[str] = []
+    monkeypatch.setattr(eval_mod.hud_console, "hint", lambda message, **_: hints.append(message))
+
+    taskset = eval_mod._load_local_taskset(tmp_path, None)
+
+    assert len(taskset) == 0
+    assert any("--format harbor" in hint for hint in hints)
+
+
+def test_load_local_taskset_rejects_unknown_format(tmp_path: Path) -> None:
+    with pytest.raises(ValueError, match="unsupported task source format"):
+        eval_mod._load_local_taskset(tmp_path, "unknown")
+
+
+def test_load_local_taskset_uses_harbor_loader_when_format_is_harbor(tmp_path: Path) -> None:
+    _write_harbor_task(tmp_path)
+
+    taskset = eval_mod._load_local_taskset(tmp_path, "harbor")
+
+    assert len(taskset) == 1
+    assert taskset["demo-task"].id == "demo-task"
+
+
+def test_resolve_placement_local_harbor_format_uses_harbor_runtime(tmp_path: Path) -> None:
+    from integrations.harbor import HarborRuntime
+
+    _write_harbor_task(tmp_path)
+
+    placement = eval_mod._resolve_placement(
+        EvalConfig(runtime="local", format="harbor"),
+        tmp_path,
+    )
+
+    assert isinstance(placement, HarborRuntime)
+
+
+def test_resolve_placement_local_hud_format_uses_local_runtime(tmp_path: Path) -> None:
+    from hud.eval import LocalRuntime
+
+    _write_harbor_task(tmp_path)
+
+    placement = eval_mod._resolve_placement(EvalConfig(runtime="local"), tmp_path)
+
+    assert isinstance(placement, LocalRuntime)
+
+
+def test_harbor_format_rejects_nonlocal_source() -> None:
+    with pytest.raises(typer.Exit):
+        EvalConfig(source="platform/taskset", format="harbor").resolve_runtime()
+
+
+def test_harbor_format_rejects_nonlocal_runtime(tmp_path: Path) -> None:
+    _write_harbor_task(tmp_path)
+
+    with pytest.raises(typer.Exit):
+        EvalConfig(source=str(tmp_path), format="harbor", runtime="hud").resolve_runtime()
+
+
 def test_resolve_placement_remote_uses_hosted_runtime(
     tmp_path: Path,
     monkeypatch: pytest.MonkeyPatch,

diff --git a/integrations/__init__.py b/integrations/__init__.py
@@ -5,11 +5,12 @@
 primitives. Integrations are **loaders, not converters**: no codegen roundtrip
 to run foreign tasks.
 
-This package lives outside ``hud`` on purpose: each module is a recipe built
-**only on the public SDK surface** (``Environment``, ``Task``,
-``Taskset``, ``Runtime``) — that constraint is the proof the core is
-flexible. Copy a module into your project or run it from a checkout; nothing
-in the SDK or CLI imports it.
+This package lives outside ``hud`` on purpose: loaders are recipes built on the
+public SDK surface (``Environment``, ``Task``, ``Taskset``, ``Runtime``). Copy a
+loader into your project or run it from a checkout. The CLI may call selected
+integrations explicitly (e.g. ``hud eval --format harbor``). A repo-maintained
+integration may also expose a local provider for that explicit CLI path; that
+provider is SDK implementation code, not the portable loader contract.
 
 The contract: an integration module exposes ``detect(path) -> bool`` and
 ``load(path) -> Taskset``. Placement stays an execution-time concern — loaders