diff --git a/docs/v6/advanced/harbor-convert.mdx b/docs/v6/advanced/harbor-convert.mdx index 5ddb9f4f..21dbd721 100644 --- a/docs/v6/advanced/harbor-convert.mdx +++ b/docs/v6/advanced/harbor-convert.mdx @@ -9,8 +9,9 @@ task dirs - is a *frontend* that loads into the same primitives (`Environment`, `Task`, `Taskset`). Integrations are **loaders, not converters**: no codegen roundtrip to run foreign tasks. The Harbor integration lives in the SDK repo at [`integrations/harbor.py`](https://github.com/hud-evals/hud-python/blob/main/integrations/harbor.py) -- a recipe built only on the public SDK surface; copy it into your project or -run it from a checkout. +- a public-surface loader that maps Harbor folders into SDK primitives. The +included `HarborRuntime` is maintained with the SDK for local Docker execution; +copy the loader into your project or run it from a checkout. ## Prerequisites @@ -26,22 +27,29 @@ directly - one row per task dir (`id` = the dir name), sharing one declarative ```python from integrations.harbor import detect, load -assert detect("./terminal-bench") -taskset = load("./terminal-bench") +assert detect("./harbor_tasks") +taskset = load("./harbor_tasks") for task in taskset: print(task.env, task.id) ``` -Like every task row, the result carries no placement. Run it by supplying one - -today that means a substrate already serving the control channel -(`runtime=Runtime(url)`); a docker provider that builds and runs each task's -`environment/` image is the planned follow-up: +Like every task row, the result carries no placement. Run it by supplying one. 
+For local Docker-backed Harbor execution, use `HarborRuntime`; it builds the +task's `environment/` image, runs a fresh container, exposes the workspace +through HUD's normal shell capability, and grades by running `tests/test.sh`: ```python -from hud import Runtime +from integrations.harbor import HarborRuntime -job = await taskset.run(agent, runtime=Runtime("tcp://127.0.0.1:8765")) +job = await taskset.run(agent, runtime=HarborRuntime("./harbor_tasks")) +``` + +The eval CLI can run local Harbor task directories and datasets when you opt +into the Harbor source format: + +```bash +hud eval ./harbor_tasks claude --format harbor --task-ids cancel-async-tasks --max-steps 30 ``` ## Export HUD tasks to Harbor diff --git a/docs/v6/reference/cli.mdx b/docs/v6/reference/cli.mdx index 6a5f51bb..8b18431b 100644 --- a/docs/v6/reference/cli.mdx +++ b/docs/v6/reference/cli.mdx @@ -105,6 +105,7 @@ For a platform taskset, pass its name or id directly: `hud eval "My Tasks" claud | `--config`, `-c` | Agent config `key=value` (repeatable). | | `--verbose`, `-v` | Show agent logs (step progress, tool calls) for batch runs too. | | `--very-verbose`, `-vv` | Debug-level logs. | +| `--format` | Task source format: `hud` (default) or `harbor`. | | `--runtime` | Placement: `local`, `hud` (HUD runtime tunnel), or `tcp://host:port`. Defaults to `local` for a tasks file; platform tasksets default to remote hosted execution. | | `--remote` | Run the whole rollout remotely on the HUD platform. | | `--yes`, `-y` | Skip confirmation prompt. | @@ -133,7 +134,9 @@ hud sync env # sync environment metadata ``` External benchmark formats (currently Harbor) load directly into the runtime -as `Taskset`s - no conversion step. See [Harbor interop](/v6/advanced/harbor-convert). +as `Taskset`s - no conversion step. For local Harbor directories, opt in with +`--format harbor` so the CLI uses the Harbor loader and Docker-backed runtime +provider. See [Harbor interop](/v6/advanced/harbor-convert). 
## Inspect diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 39afd6ed..01bf0883 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -63,6 +63,7 @@ def _resolve_model_from_catalog(model_id: str) -> tuple[AgentType, str] | None: _CONFIG_PATH = ".hud_eval.toml" _PLACEMENT_CONFLICT_ERROR = "--runtime and --remote are mutually exclusive placement options" +_SOURCE_FORMATS = ("hud", "harbor") def _resolve_env_vars(obj: Any) -> Any: @@ -167,6 +168,7 @@ class AgentPreset: # very_verbose = true # auto_respond = true # gateway = false # Route LLM API calls through HUD Gateway +# format = "hud" # hud or harbor # runtime = "local" # local, hud, or tcp://host:port # remote = false # Run the whole rollout remotely on HUD @@ -264,6 +266,7 @@ class EvalConfig(BaseModel): "group_size", "auto_respond", "gateway", + "format", "runtime", "remote", } @@ -279,6 +282,9 @@ class EvalConfig(BaseModel): auto_respond: bool | None = None group_size: int = 1 gateway: bool = False + #: Source format. ``None``/``hud`` means normal HUD task source loading; + #: ``harbor`` opts into the Harbor integration loader/runtime. + format: str | None = None #: Placement: "local" (spawn each row's env from the source), "hud" #: (HUD runtime tunnel), or a tcp:// url of an already-served env. #: ``None`` means "infer from the source": a local file runs locally, a @@ -306,6 +312,20 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None: ) from None return v + @field_validator("format", mode="before") + @classmethod + def _parse_format(cls, v: Any) -> str | None: + if v is None: + return None + if not isinstance(v, str): + return v + normalized = v.strip().lower() + if normalized in ("", "hud"): + return None + if normalized in _SOURCE_FORMATS: + return normalized + raise ValueError(f"Invalid format: {v}. Must be one of: {', '.join(_SOURCE_FORMATS)}") + def source_is_local_file(self) -> bool: """Whether ``source`` points at an on-disk taskset (vs. 
a platform slug/id).""" return self.source is not None and Path(self.source).exists() @@ -319,6 +339,13 @@ def resolve_runtime(self) -> EvalConfig: ``--runtime`` is always honored, except ``local`` against a platform taskset, which has no env to spawn. """ + if self.format == "harbor": + if not self.source_is_local_file(): + hud_console.error("--format harbor requires a local Harbor task directory") + raise typer.Exit(1) + if self.remote or (self.runtime is not None and self.runtime != "local"): + hud_console.error("--format harbor currently supports only local runtime placement") + raise typer.Exit(1) if self.runtime is None: if self.source_is_local_file(): return self.model_copy(update={"runtime": "local"}) @@ -502,6 +529,7 @@ def merge_cli( gateway: bool = False, config: list[str] | None = None, task_ids: str | None = None, + format: str | None = None, runtime: str | None = None, remote: bool = False, ) -> EvalConfig: @@ -517,6 +545,7 @@ def merge_cli( "max_concurrent": max_concurrent, "max_steps": max_steps, "group_size": group_size, + "format": format, "runtime": runtime, }.items() if value is not None @@ -604,6 +633,8 @@ def display(self) -> None: table.add_column("Value", style="green") table.add_row("source", str(self.source or "-")) + if self.format: + table.add_row("format", self.format) table.add_row("runtime", str(self.runtime or "-")) table.add_row("agent", self.agent_type.value if self.agent_type else "-") if self.task_ids: @@ -728,6 +759,28 @@ def _spawn_target(source: Path) -> Path: return resolved.parent +def _load_local_taskset(source_path: Path, source_format: str | None) -> Any: + from hud.eval import Taskset + + format_name = source_format or "hud" + if format_name == "hud": + taskset = Taskset.from_file(source_path) + if len(taskset) == 0: + from integrations.harbor import detect + + if detect(source_path): + hud_console.hint( + f"{source_path} looks like a Harbor task directory; " + "rerun with --format harbor to load it." 
+ ) + return taskset + if format_name == "harbor": + from integrations.harbor import load + + return load(source_path) + raise ValueError(f"unsupported task source format: {format_name}") + + def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any: """Map the config's ``runtime`` onto a placement for ``Taskset.run``. @@ -744,6 +797,10 @@ def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any: if cfg.runtime == "local": if source_path is None: raise ValueError("local placement requires a local source path") + if cfg.format == "harbor": + from integrations.harbor import HarborRuntime + + return HarborRuntime(source_path) return LocalRuntime(_spawn_target(source_path)) if cfg.runtime == "hud": require_api_key("run HUD runtime tunnel evals") @@ -767,18 +824,18 @@ async def _run_evaluation(cfg: EvalConfig) -> Any: if cfg.source is None or cfg.agent_type is None: raise ValueError("source and agent_type must be set") - from hud.eval import Taskset - source_path = Path(cfg.source) is_local = source_path.exists() if is_local: hud_console.info(f"Loading tasks from: {cfg.source}") try: - taskset = Taskset.from_file(source_path) + taskset = _load_local_taskset(source_path, cfg.format) except Exception as e: hud_console.error(f"Failed to load tasks from {cfg.source}: {e}") raise typer.Exit(1) from e else: + from hud.eval import Taskset + hud_console.info(f"Loading platform taskset: {cfg.source}") try: taskset = Taskset.from_api(cfg.source) @@ -888,6 +945,11 @@ def eval_command( gateway: bool = typer.Option( False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway" ), + format: str | None = typer.Option( + None, + "--format", + help="Task source format: hud (default) or harbor.", + ), runtime: str | None = typer.Option( None, "--runtime", @@ -908,6 +970,7 @@ def eval_command( hud eval "My Tasks" claude-sonnet-4-6 --full # Platform taskset, run on the platform hud eval tasks.json claude --config max_tokens=32768 hud eval 
tasks.json claude --gateway # Route LLM calls through HUD Gateway + hud eval ./harbor_tasks claude --format harbor # Run Harbor task dirs locally hud eval tasks.json claude-sonnet-4-6 --runtime hud # Use HUD runtime tunnel hud eval tasks.json claude-sonnet-4-6 --remote # Execute rollout remotely """ @@ -938,6 +1001,7 @@ def eval_command( group_size=group_size, config=config, gateway=gateway, + format=format, runtime=runtime, remote=remote, ) diff --git a/hud/cli/tests/test_eval_config.py b/hud/cli/tests/test_eval_config.py index 6b94f0b2..bbd0d4f9 100644 --- a/hud/cli/tests/test_eval_config.py +++ b/hud/cli/tests/test_eval_config.py @@ -20,6 +20,23 @@ _ARN = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/anthropic.claude" +def _write_harbor_task(root: Path, name: str = "demo-task") -> Path: + task = root / name + (task / "environment").mkdir(parents=True) + (task / "tests").mkdir() + (task / "instruction.md").write_text("Fix the demo task.\n", encoding="utf-8") + (task / "task.toml").write_text( + 'schema_version = "1.3"\n\n[task]\nname = "demo/demo-task"\n', + encoding="utf-8", + ) + (task / "environment" / "Dockerfile").write_text("FROM python:3.12-slim\n", encoding="utf-8") + (task / "tests" / "test.sh").write_text( + "#!/usr/bin/env bash\nmkdir -p /logs/verifier\necho 1 > /logs/verifier/reward.txt\n", + encoding="utf-8", + ) + return task + + def test_is_bedrock_arn() -> None: assert _is_bedrock_arn(_ARN) is True assert _is_bedrock_arn("claude-sonnet-4-6") is False @@ -136,6 +153,77 @@ def test_resolve_placement_runtime_hud_uses_tunnel( assert isinstance(placement, HUDRuntime) +def test_load_local_taskset_uses_hud_loader_by_default(tmp_path: Path) -> None: + _write_harbor_task(tmp_path) + + taskset = eval_mod._load_local_taskset(tmp_path, None) + + assert len(taskset) == 0 + + +def test_load_local_taskset_hints_harbor_format_on_zero_task_harbor_dir( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _write_harbor_task(tmp_path) + 
hints: list[str] = [] + monkeypatch.setattr(eval_mod.hud_console, "hint", lambda message, **_: hints.append(message)) + + taskset = eval_mod._load_local_taskset(tmp_path, None) + + assert len(taskset) == 0 + assert any("--format harbor" in hint for hint in hints) + + +def test_load_local_taskset_rejects_unknown_format(tmp_path: Path) -> None: + with pytest.raises(ValueError, match="unsupported task source format"): + eval_mod._load_local_taskset(tmp_path, "unknown") + + +def test_load_local_taskset_uses_harbor_loader_when_format_is_harbor(tmp_path: Path) -> None: + _write_harbor_task(tmp_path) + + taskset = eval_mod._load_local_taskset(tmp_path, "harbor") + + assert len(taskset) == 1 + assert taskset["demo-task"].id == "demo-task" + + +def test_resolve_placement_local_harbor_format_uses_harbor_runtime(tmp_path: Path) -> None: + from integrations.harbor import HarborRuntime + + _write_harbor_task(tmp_path) + + placement = eval_mod._resolve_placement( + EvalConfig(runtime="local", format="harbor"), + tmp_path, + ) + + assert isinstance(placement, HarborRuntime) + + +def test_resolve_placement_local_hud_format_uses_local_runtime(tmp_path: Path) -> None: + from hud.eval import LocalRuntime + + _write_harbor_task(tmp_path) + + placement = eval_mod._resolve_placement(EvalConfig(runtime="local"), tmp_path) + + assert isinstance(placement, LocalRuntime) + + +def test_harbor_format_rejects_nonlocal_source() -> None: + with pytest.raises(typer.Exit): + EvalConfig(source="platform/taskset", format="harbor").resolve_runtime() + + +def test_harbor_format_rejects_nonlocal_runtime(tmp_path: Path) -> None: + _write_harbor_task(tmp_path) + + with pytest.raises(typer.Exit): + EvalConfig(source=str(tmp_path), format="harbor", runtime="hud").resolve_runtime() + + def test_resolve_placement_remote_uses_hosted_runtime( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, diff --git a/integrations/__init__.py b/integrations/__init__.py index c8549e0f..baa460f4 100644 --- 
a/integrations/__init__.py +++ b/integrations/__init__.py @@ -5,11 +5,12 @@ primitives. Integrations are **loaders, not converters**: no codegen roundtrip to run foreign tasks. -This package lives outside ``hud`` on purpose: each module is a recipe built -**only on the public SDK surface** (``Environment``, ``Task``, -``Taskset``, ``Runtime``) — that constraint is the proof the core is -flexible. Copy a module into your project or run it from a checkout; nothing -in the SDK or CLI imports it. +This package lives outside ``hud`` on purpose: loaders are recipes built on the +public SDK surface (``Environment``, ``Task``, ``Taskset``, ``Runtime``). Copy a +loader into your project or run it from a checkout. The CLI may call selected +integrations explicitly for polished interop paths. A repo-maintained +integration may also expose a local provider for that explicit CLI path; that +provider is SDK implementation code, not the portable loader contract. The contract: an integration module exposes ``detect(path) -> bool`` and ``load(path) -> Taskset``. Placement stays an execution-time concern — loaders diff --git a/integrations/harbor.py b/integrations/harbor.py index 497711e3..90342625 100644 --- a/integrations/harbor.py +++ b/integrations/harbor.py @@ -11,11 +11,9 @@ :func:`load` parses a task dir (or a dataset of them) into rows sharing one env name per distinct ``environment/`` build context — no codegen, no -roundtrip. Like every row, the result is runnable -once a placement is supplied (``runtime=Runtime(url)`` against a served substrate -today). Providers receive the row being placed, so a docker provider that -builds and runs each row's ``environment/`` image is the named follow-up — -expressible without engine changes. +roundtrip. Like every row, the result is runnable once a placement is supplied. +Use :class:`HarborRuntime` for local Docker-backed execution of Harbor tasks, or +``runtime=Runtime(url)`` to attach to a substrate served elsewhere. 
:func:`export` is the reverse direction: turn a HUD task source into self-contained Harbor task folders (``task.toml`` + ``instruction.md`` + @@ -40,19 +38,23 @@ from __future__ import annotations -import hashlib import json import logging -import re import shutil -import tomllib -from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any from hud.environment import Environment from hud.environment.server import TaskRunner from hud.eval import Task, Taskset +from integrations.harbor_common import ( + _HarborTask, + _is_harbor_task, + _parse_task, + _slugify, + _task_dirs, +) +from integrations.harbor_runtime import HarborRuntime if TYPE_CHECKING: from collections.abc import Callable @@ -74,18 +76,12 @@ "__pycache__", "*.pyc", ".git", ".venv", "venv", "*.egg-info", ".pytest_cache" ) - # ─── load: Harbor dirs -> Taskset ────────────────────────────────────── def detect(path: str | Path) -> bool: """True when *path* is a Harbor task dir or a dataset of them.""" - root = Path(path) - if _is_harbor_task(root): - return True - if root.is_dir(): - return any(_is_harbor_task(d) for d in root.iterdir() if d.is_dir()) - return False + return bool(_task_dirs(path)) def load(path: str | Path) -> Taskset: @@ -96,12 +92,8 @@ def load(path: str | Path) -> Taskset: context (content-hashed), derived from the dataset name. 
""" root = Path(path).resolve() - if _is_harbor_task(root): - task_dirs = [root] - dataset_name = root.parent.name - else: - task_dirs = sorted(d for d in root.iterdir() if d.is_dir() and _is_harbor_task(d)) - dataset_name = root.name + task_dirs = _task_dirs(root) + dataset_name = root.parent.name if _is_harbor_task(root) else root.name if not task_dirs: raise ValueError(f"no Harbor tasks found in {path}") @@ -126,54 +118,6 @@ def load(path: str | Path) -> Taskset: return Taskset(base_name, tasks) -def _slugify(name: str) -> str: - """A valid env name (lowercase ``[a-z0-9-]``) from a dataset dir name.""" - normalized = re.sub(r"[^a-z0-9-]", "", name.strip().lower().replace(" ", "-").replace("_", "-")) - return re.sub(r"-+", "-", normalized).strip("-") or "harbor" - - -def _is_harbor_task(path: Path) -> bool: - return path.is_dir() and (path / "task.toml").exists() and (path / "instruction.md").exists() - - -def _hash_directory(path: Path) -> str: - """Content-hash a directory for grouping tasks by identical environments.""" - hasher = hashlib.sha256() - if not path.exists(): - return "empty" - for file_path in sorted(path.rglob("*")): - if file_path.is_file(): - hasher.update(str(file_path.relative_to(path)).encode()) - hasher.update(file_path.read_bytes()) - return hasher.hexdigest()[:16] - - -@dataclass(frozen=True, slots=True) -class _HarborTask: - """One parsed Harbor task dir.""" - - task_id: str - config: dict[str, Any] - env_hash: str - - -def _parse_task(task_dir: Path) -> _HarborTask | None: - if not (task_dir / "instruction.md").is_file(): - LOGGER.warning("failed to read instruction.md in %s", task_dir) - return None - try: - config: dict[str, Any] = tomllib.loads((task_dir / "task.toml").read_text("utf-8")) - except (OSError, tomllib.TOMLDecodeError): - LOGGER.warning("failed to parse task.toml in %s", task_dir) - config = {} - env_dir = task_dir / "environment" - return _HarborTask( - task_id=task_dir.name, - config=config, - 
env_hash=_hash_directory(env_dir) if env_dir.exists() else "no-env", - ) - - # ─── export: HUD tasks -> Harbor task folders ─────────────────────────── @@ -443,6 +387,7 @@ async def export( "ALLOWED_PROTOCOLS", "CONTROL_PORT", "DEFAULT_ANSWER_FILE", + "HarborRuntime", "detect", "export", "load", diff --git a/integrations/harbor_common.py b/integrations/harbor_common.py new file mode 100644 index 00000000..53294e09 --- /dev/null +++ b/integrations/harbor_common.py @@ -0,0 +1,70 @@ +"""Shared helpers for Harbor task integration.""" + +from __future__ import annotations + +import hashlib +import logging +import re +import tomllib +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +LOGGER = logging.getLogger(__name__) + + +def _slugify(name: str) -> str: + """A valid env name (lowercase ``[a-z0-9-]``) from a dataset dir name.""" + normalized = re.sub(r"[^a-z0-9-]", "", name.strip().lower().replace(" ", "-").replace("_", "-")) + return re.sub(r"-+", "-", normalized).strip("-") or "harbor" + + +def _is_harbor_task(path: Path) -> bool: + return path.is_dir() and (path / "task.toml").exists() and (path / "instruction.md").exists() + + +def _task_dirs(path: str | Path) -> list[Path]: + root = Path(path) + if _is_harbor_task(root): + return [root] + if root.is_dir(): + return sorted(d for d in root.iterdir() if d.is_dir() and _is_harbor_task(d)) + return [] + + +def _hash_directory(path: Path) -> str: + """Content-hash a directory for grouping tasks by identical environments.""" + hasher = hashlib.sha256() + if not path.exists(): + return "empty" + for file_path in sorted(path.rglob("*")): + if file_path.is_file(): + hasher.update(str(file_path.relative_to(path)).encode()) + hasher.update(file_path.read_bytes()) + return hasher.hexdigest()[:16] + + +@dataclass(frozen=True, slots=True) +class _HarborTask: + """One parsed Harbor task dir.""" + + task_id: str + config: dict[str, Any] + env_hash: str + + +def _parse_task(task_dir: Path) -> 
_HarborTask | None: + if not (task_dir / "instruction.md").is_file(): + LOGGER.warning("failed to read instruction.md in %s", task_dir) + return None + try: + config: dict[str, Any] = tomllib.loads((task_dir / "task.toml").read_text("utf-8")) + except (OSError, tomllib.TOMLDecodeError): + LOGGER.warning("failed to parse task.toml in %s", task_dir) + config = {} + env_dir = task_dir / "environment" + return _HarborTask( + task_id=task_dir.name, + config=config, + env_hash=_hash_directory(env_dir) if env_dir.exists() else "no-env", + ) diff --git a/integrations/harbor_runtime.py b/integrations/harbor_runtime.py new file mode 100644 index 00000000..50277726 --- /dev/null +++ b/integrations/harbor_runtime.py @@ -0,0 +1,444 @@ +"""Local Docker-backed runtime for Harbor task directories.""" + +from __future__ import annotations + +import asyncio +import contextlib +import json +import os +import tempfile +import tomllib +import uuid +from collections.abc import AsyncGenerator # noqa: TC003 - env.template resolves this at runtime. +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from hud.environment import Environment +from hud.environment.workspace import Workspace +from integrations.harbor_common import _hash_directory, _slugify, _task_dirs + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + + import asyncssh + + from hud.eval import Task + from hud.eval.runtime import Runtime + + +class HarborRuntime: + """Run Harbor task directories through HUD's local rollout engine. + + The provider builds the Harbor task's ``environment/`` Docker context, then + materializes the built image's working directory onto a writable host + workspace and bind-mounts it back over the same guest path. 
Because the + workspace is the image's actual working directory (source *plus* every file + the build generated — start scripts, installed dependencies, compiled output, + seeded databases — with their original mode bits), the agent sees exactly + what the image would run, and edits made over SFTP are visible to the running + process. If the task ships a ``docker-compose.yaml``/``.yml``, the provider + starts it with an overlay that keeps the ``main`` service idle while + preserving sidecars such as databases. Shell commands execute inside the main + container via ``docker exec``. Grading runs the Harbor ``tests/test.sh`` + inside the same main container, bounded by the task's ``[verifier] + timeout_sec``, and reads ``/logs/verifier/reward.json`` or ``reward.txt``. + """ + + def __init__( + self, + path: str | Path, + *, + ready_timeout: float = 120.0, + ) -> None: + self.root = Path(path).resolve() + self.ready_timeout = ready_timeout + self._task_dirs = {task_dir.name: task_dir for task_dir in _task_dirs(self.root)} + if not self._task_dirs: + raise ValueError(f"no Harbor tasks found in {path}") + + @contextlib.asynccontextmanager + async def __call__(self, task: Task) -> AsyncIterator[Runtime]: + from hud.eval.runtime import Runtime, _docker, _local + + task_dir = self._task_dirs.get(task.id) + if task_dir is None: + raise KeyError(f"HarborRuntime has no task directory for {task.id!r}") + env_dir = task_dir / "environment" + tests_dir = task_dir / "tests" + if not (env_dir / "Dockerfile").is_file(): + raise FileNotFoundError(f"Harbor task {task.id!r} has no environment/Dockerfile") + if not (tests_dir / "test.sh").is_file(): + raise FileNotFoundError(f"Harbor task {task.id!r} has no tests/test.sh") + + with tempfile.TemporaryDirectory(prefix=f"hud-harbor-{_slugify(task.id)}-") as tmp: + tmp_path = Path(tmp) + workspace = tmp_path / "workspace" + logs = tmp_path / "logs" + workspace.mkdir() + logs.mkdir(parents=True, exist_ok=True) + + image = await 
self._build_image(env_dir) + workdir = await _image_workdir(image) + await _materialize_workspace(image, workspace, workdir) + + compose_file = _compose_file(env_dir) + if compose_file is not None: + await _docker("image", "rm", image, check=False) + acquire = self._compose_container( + task, compose_file, workspace, workdir, tests_dir, logs + ) + else: + acquire = self._single_container(task, image, workspace, workdir, tests_dir, logs) + async with acquire as (container, provider): + env = self._environment_for(task, task_dir, workspace, workdir, logs, container) + async with _local(env) as runtime: + yield Runtime( + runtime.url, + params={ + **runtime.params, + "provider": provider, + "container": container, + "ready_timeout": self.ready_timeout, + }, + config=runtime.config, + ) + + @contextlib.asynccontextmanager + async def _single_container( + self, + task: Task, + image: str, + workspace: Path, + workdir: str, + tests_dir: Path, + logs: Path, + ) -> AsyncIterator[tuple[str, str]]: + from hud.eval.runtime import _docker + + container_name = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}" + out, _ = await _docker( + "run", + "--detach", + "--name", + container_name, + "--workdir", + workdir, + "--entrypoint", + "sleep", + "--volume", + f"{workspace}:{workdir}", + "--volume", + f"{tests_dir}:/tests:ro", + "--volume", + f"{logs}:/logs", + image, + "infinity", + ) + container = out.strip() + try: + yield container, "harbor" + finally: + with contextlib.suppress(Exception): + await _release_mount_permissions(container, workdir) + await _docker("rm", "--force", "--volumes", container, check=False) + await _docker("image", "rm", image, check=False) + + @contextlib.asynccontextmanager + async def _compose_container( + self, + task: Task, + compose_file: Path, + workspace: Path, + workdir: str, + tests_dir: Path, + logs: Path, + ) -> AsyncIterator[tuple[str, str]]: + from hud.eval.runtime import _docker + + project = 
f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}" + overlay = workspace.parent / "compose.hud.yaml" + overlay.write_text( + _compose_overlay(workspace=workspace, workdir=workdir, tests_dir=tests_dir, logs=logs), + encoding="utf-8", + newline="\n", + ) + compose_args = ("compose", "-f", str(compose_file), "-f", str(overlay), "-p", project) + container = "" + try: + await _docker(*compose_args, "up", "--detach", "--build") + out, _ = await _docker(*compose_args, "ps", "-q", "main") + container = out.strip() + if not container: + raise RuntimeError( + f"docker compose project {project} did not create a main service" + ) + yield container, "harbor-compose" + finally: + if container: + with contextlib.suppress(Exception): + await _release_mount_permissions(container, workdir) + await _docker( + *compose_args, + "down", + "--volumes", + "--remove-orphans", + "--rmi", + "local", + check=False, + ) + + async def _build_image(self, env_dir: Path) -> str: + from hud.eval.runtime import _docker + + tag = f"hud-harbor:{_hash_directory(env_dir)}-{uuid.uuid4().hex[:8]}" + await _docker("build", "--tag", tag, str(env_dir)) + return tag + + def _environment_for( + self, + task: Task, + task_dir: Path, + workspace: Path, + workdir: str, + logs: Path, + container: str, + ) -> Environment: + env = Environment(task.env) + workspace_daemon = _DockerWorkspace(workspace, container=container, guest_path=workdir) + verifier_timeout = _verifier_timeout(task_dir) + + @env.initialize + async def _up() -> None: + await workspace_daemon.start() + env.add_capability(workspace_daemon.capability("shell")) + + @env.shutdown + async def _down() -> None: + await workspace_daemon.stop() + + @env.template(id=task.id, description=f"Harbor task {task.id}") + async def _run_harbor_task() -> AsyncGenerator[Any, Any]: + answer = yield (task_dir / "instruction.md").read_text(encoding="utf-8") + yield await self._grade( + container, workdir, logs, answer, verifier_timeout=verifier_timeout + ) + + 
return env + + async def _grade( + self, container: str, workdir: str, logs: Path, answer: Any, *, verifier_timeout: float + ) -> dict[str, Any]: + answer_file = logs / "agent_answer.txt" + answer_file.parent.mkdir(parents=True, exist_ok=True) + answer_file.write_text("" if answer is None else str(answer), encoding="utf-8") + proc = await asyncio.create_subprocess_exec( + "docker", + "exec", + "--workdir", + workdir, + container, + "bash", + "/tests/test.sh", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + out_bytes, err_bytes = await asyncio.wait_for( + proc.communicate(), timeout=verifier_timeout + ) + except TimeoutError: + proc.kill() + await proc.wait() + return { + "score": 0.0, + "isError": True, + "content": f"Harbor verifier timed out after {verifier_timeout:.0f}s", + "info": {"verifier_timeout_sec": verifier_timeout}, + } + out = out_bytes.decode("utf-8", "replace") + err = err_bytes.decode("utf-8", "replace") + reward, info = _read_harbor_reward(logs / "verifier") + info.update( + { + "stdout": out[-4000:], + "stderr": err[-4000:], + } + ) + if reward is None: + return { + "score": 0.0, + "isError": True, + "content": "Harbor verifier did not write reward.json or reward.txt", + "info": info, + } + return {"score": reward, "info": info} + + +class _DockerWorkspace(Workspace): + """Workspace SFTP over a host bind mount, shell commands via docker exec.""" + + def __init__(self, *args: Any, container: str, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self._container = container + + async def _handle_process(self, process: asyncssh.SSHServerProcess[bytes]) -> None: + command = process.command or "bash -l" + proc = await asyncio.create_subprocess_exec( + "docker", + "exec", + "-i", + "--workdir", + self._guest_path, + self._container, + "bash", + "-lc", + command, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout_data, stderr_data = await 
asyncio.wait_for(proc.communicate(), timeout=3600.0) + except TimeoutError: + proc.kill() + await proc.wait() + process.stderr.write(b"workspace: command timed out after 3600s\n") + process.exit(1) + return + except asyncio.CancelledError: + proc.kill() + await proc.wait() + raise + + if stdout_data: + process.stdout.write(stdout_data) + if stderr_data: + process.stderr.write(stderr_data) + process.exit(proc.returncode if proc.returncode is not None else 0) + + +_DEFAULT_VERIFIER_TIMEOUT = 600.0 + + +def _verifier_timeout(task_dir: Path) -> float: + """The task's ``[verifier] timeout_sec``, or the Harbor default.""" + try: + config: dict[str, Any] = tomllib.loads((task_dir / "task.toml").read_text("utf-8")) + except (OSError, tomllib.TOMLDecodeError): + return _DEFAULT_VERIFIER_TIMEOUT + verifier = config.get("verifier") + timeout = verifier.get("timeout_sec") if isinstance(verifier, dict) else None + if isinstance(timeout, int | float) and not isinstance(timeout, bool) and timeout > 0: + return float(timeout) + return _DEFAULT_VERIFIER_TIMEOUT + + +async def _image_workdir(image: str) -> str: + """The image's configured ``WORKDIR``, or ``/app`` when it declares none.""" + from hud.eval.runtime import _docker + + out, _ = await _docker("image", "inspect", "--format", "{{.Config.WorkingDir}}", image) + return out.strip() or "/app" + + +async def _materialize_workspace(image: str, workspace: Path, workdir: str) -> None: + """Copy the built image's ``workdir`` onto the host workspace, then own it. + + The ``workdir`` bind mount would otherwise shadow everything the Docker build + generated there (start scripts, installed dependencies, compiled output, + seeded databases). Copying the image's actual ``workdir`` out first makes the + mounted workspace a faithful, editable copy of what the image runs. Files + arrive owned by the container's build user; hand them to the host user so the + agent can edit them over SFTP and teardown can remove them. 
+ """ + from hud.eval.runtime import _docker + + out, _ = await _docker("create", image, "true") + container = out.strip() + try: + await _docker("cp", f"{container}:{workdir}/.", str(workspace)) + finally: + await _docker("rm", "--force", "--volumes", container, check=False) + + if hasattr(os, "getuid"): + await _docker( + "run", + "--rm", + "--volume", + f"{workspace}:{workdir}", + image, + "chown", + "-R", + f"{os.getuid()}:{os.getgid()}", + workdir, + check=False, + ) + + +def _read_harbor_reward(verifier_logs: Path) -> tuple[float | None, dict[str, Any]]: + reward_json = verifier_logs / "reward.json" + if reward_json.is_file(): + data = json.loads(reward_json.read_text(encoding="utf-8")) + if isinstance(data, int | float): + return float(data), {"reward_file": str(reward_json)} + if isinstance(data, dict): + for key in ("reward", "score"): + value = data.get(key) + if isinstance(value, int | float): + return float(value), {"reward_file": str(reward_json), "reward_json": data} + return None, {"reward_file": str(reward_json), "reward_parse_error": "no numeric reward"} + + reward_txt = verifier_logs / "reward.txt" + if reward_txt.is_file(): + text = reward_txt.read_text(encoding="utf-8").strip() + try: + return float(text), {"reward_file": str(reward_txt)} + except ValueError: + return None, {"reward_file": str(reward_txt), "reward_parse_error": text} + + return None, {} + + +async def _release_mount_permissions(container: str, workdir: str) -> None: + """Let the host user delete files that container-root created in mounts.""" + from hud.eval.runtime import _docker + + await _docker( + "exec", + container, + "sh", + "-lc", + f"chmod -R a+rwX {workdir} /logs 2>/dev/null || true", + check=False, + ) + + +def _compose_file(env_dir: Path) -> Path | None: + for name in ("docker-compose.yaml", "docker-compose.yml", "compose.yaml", "compose.yml"): + path = env_dir / name + if path.is_file(): + return path + return None + + +def _compose_overlay(*, workspace: Path, 
def _compose_overlay(*, workspace: Path, workdir: str, tests_dir: Path, logs: Path) -> str:
    """Compose override that keeps Harbor's main service idle for agent work.

    Only ``main`` is touched: it is parked on ``sleep infinity`` with the
    materialized workspace mounted over its working directory, and the Harbor
    ``/tests`` (read-only) and ``/logs`` paths bound in. No other service is
    mentioned, so the rest of the task's own compose file is left as it is.

    Args:
        workspace: Host directory mounted over ``workdir``.
        workdir: Working directory of the ``main`` service inside the container.
        tests_dir: Host directory mounted read-only at ``/tests``.
        logs: Host directory mounted at ``/logs``.

    Returns:
        The override document as YAML text, ending with a newline.
    """

    def scalar(value: str) -> str:
        # A JSON string is a valid YAML double-quoted scalar. Compose still
        # interpolates "$NAME" inside quoted values, so literal dollars in a
        # path are doubled to survive unchanged.
        return json.dumps(value.replace("$", "$$"))

    return "\n".join(
        [
            "services:",
            "  main:",
            f"    working_dir: {scalar(workdir)}",
            '    entrypoint: ["sleep"]',
            '    command: ["infinity"]',
            "    volumes:",
            f"      - {scalar(f'{workspace}:{workdir}')}",
            f"      - {scalar(f'{tests_dir}:/tests:ro')}",
            f"      - {scalar(f'{logs}:/logs')}",
            "",
        ],
    )
check: bool = True) -> tuple[str, str]: + calls.append((args, check)) + return "", "" + + monkeypatch.setattr("hud.eval.runtime._docker", fake_docker) + runtime = HarborRuntime(single_task.parent) + + first = await runtime._build_image(single_task / "environment") + second = await runtime._build_image(single_task / "environment") + + assert first != second + assert first.startswith("hud-harbor:") + assert second.startswith("hud-harbor:") + assert [args[2] for args, _ in calls] == [first, second] + + +async def test_compose_container_cleans_up_after_failed_up( + single_task: Path, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + compose = single_task / "environment" / "docker-compose.yaml" + compose.write_text("services:\n main:\n build: .\n", encoding="utf-8") + calls: list[tuple[tuple[str, ...], bool]] = [] + + async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]: + calls.append((args, check)) + if args[-3:] == ("up", "--detach", "--build"): + raise RuntimeError("compose failed") + return "", "" + + monkeypatch.setattr("hud.eval.runtime._docker", fake_docker) + runtime = HarborRuntime(single_task.parent) + + with pytest.raises(RuntimeError, match="compose failed"): + async with runtime._compose_container( + Task(env="bench", id=single_task.name), + compose, + tmp_path / "workspace", + "/app", + single_task / "tests", + tmp_path / "logs", + ): + raise AssertionError("compose acquisition should not yield") + + assert any( + args[-5:] == ("down", "--volumes", "--remove-orphans", "--rmi", "local") and check is False + for args, check in calls + ) + + +async def test_compose_container_cleans_up_when_main_service_is_missing( + single_task: Path, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + compose = single_task / "environment" / "docker-compose.yaml" + compose.write_text("services:\n api:\n build: .\n", encoding="utf-8") + calls: list[tuple[tuple[str, ...], bool]] = [] + + async def fake_docker(*args: str, 
check: bool = True) -> tuple[str, str]: + calls.append((args, check)) + return "", "" + + monkeypatch.setattr("hud.eval.runtime._docker", fake_docker) + runtime = HarborRuntime(single_task.parent) + + with pytest.raises(RuntimeError, match="did not create a main service"): + async with runtime._compose_container( + Task(env="bench", id=single_task.name), + compose, + tmp_path / "workspace", + "/app", + single_task / "tests", + tmp_path / "logs", + ): + raise AssertionError("compose acquisition should not yield") + + assert any( + args[-5:] == ("down", "--volumes", "--remove-orphans", "--rmi", "local") and check is False + for args, check in calls + ) + + +def test_compose_file_detection_prefers_harbor_names(tmp_path: Path) -> None: + env = tmp_path / "environment" + env.mkdir() + compose = env / "docker-compose.yaml" + compose.write_text("services: {}\n", encoding="utf-8") + + assert _compose_file(env) == compose + + +def test_compose_overlay_parks_main_and_mounts_workspace_tests_and_logs(tmp_path: Path) -> None: + overlay = _compose_overlay( + workspace=tmp_path / "workspace", + workdir="/srv/app", + tests_dir=tmp_path / "tests", + logs=tmp_path / "logs", + ) + + assert "main:" in overlay + assert 'entrypoint: ["sleep"]' in overlay + assert 'working_dir: "/srv/app"' in overlay + assert f"{tmp_path / 'workspace'}:/srv/app" in overlay + assert f"{tmp_path / 'tests'}:/tests:ro" in overlay + assert f"{tmp_path / 'logs'}:/logs" in overlay + + +async def test_image_workdir_reads_config_working_dir(monkeypatch: pytest.MonkeyPatch) -> None: + async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]: + assert args == ("image", "inspect", "--format", "{{.Config.WorkingDir}}", "img") + return "/srv/app\n", "" + + monkeypatch.setattr("hud.eval.runtime._docker", fake_docker) + + assert await _image_workdir("img") == "/srv/app" + + +async def test_image_workdir_defaults_to_app_when_unset(monkeypatch: pytest.MonkeyPatch) -> None: + async def fake_docker(*args: 
str, check: bool = True) -> tuple[str, str]: + return "\n", "" + + monkeypatch.setattr("hud.eval.runtime._docker", fake_docker) + + assert await _image_workdir("img") == "/app" + + +async def test_materialize_workspace_copies_image_workdir_and_owns_it( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + workspace = tmp_path / "workspace" + workspace.mkdir() + calls: list[tuple[str, ...]] = [] + + async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]: + calls.append(args) + if args[0] == "create": + return "tempcid\n", "" + return "", "" + + monkeypatch.setattr("hud.eval.runtime._docker", fake_docker) + + await _materialize_workspace("img", workspace, "/app") + + # Contents of the image's workdir are copied out into the host workspace. + assert ("cp", "tempcid:/app/.", str(workspace)) in calls + # The throwaway container is removed. + assert any(a[0] == "rm" for a in calls) + # On POSIX hosts, ownership is handed to the host user via a chown pass. + if hasattr(os, "getuid"): + assert any(a[0] == "run" and "chown" in a and a[-1] == "/app" for a in calls) + + +def test_read_harbor_reward_prefers_reward_and_score_keys(tmp_path: Path) -> None: + verifier = tmp_path / "verifier" + verifier.mkdir() + (verifier / "reward.json").write_text(json.dumps({"reward": 0.5, "total": 5}), "utf-8") + + reward, info = _read_harbor_reward(verifier) + + assert reward == 0.5 + assert info["reward_json"] == {"reward": 0.5, "total": 5} + + +def test_read_harbor_reward_rejects_dict_without_reward_or_score(tmp_path: Path) -> None: + verifier = tmp_path / "verifier" + verifier.mkdir() + (verifier / "reward.json").write_text(json.dumps({"passed": 3, "total": 5}), "utf-8") + + reward, info = _read_harbor_reward(verifier) + + assert reward is None + assert info["reward_parse_error"] == "no numeric reward" + + +def test_verifier_timeout_reads_task_toml(single_task: Path) -> None: + assert _verifier_timeout(single_task) == 120.0 + + +def 
test_verifier_timeout_defaults_when_missing_or_invalid(tmp_path: Path) -> None: + no_verifier = tmp_path / "no-verifier" + no_verifier.mkdir() + (no_verifier / "task.toml").write_text('[metadata]\ncategory = "systems"\n', "utf-8") + broken = tmp_path / "broken" + broken.mkdir() + (broken / "task.toml").write_text("not toml [", "utf-8") + + assert _verifier_timeout(no_verifier) == 600.0 + assert _verifier_timeout(broken) == 600.0 + assert _verifier_timeout(tmp_path / "missing") == 600.0 + + +async def test_grade_reads_reward_after_verifier_completes( + single_task: Path, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + logs = tmp_path / "logs" + (logs / "verifier").mkdir(parents=True) + (logs / "verifier" / "reward.txt").write_text("1.0\n", "utf-8") + + class FakeProc: + returncode = 0 + + async def communicate(self) -> tuple[bytes, bytes]: + return b"verifier out", b"" + + async def fake_exec(*args: str, **kwargs: object) -> FakeProc: + assert args[:2] == ("docker", "exec") + return FakeProc() + + monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_exec) + runtime = HarborRuntime(single_task.parent) + + result = await runtime._grade("container", "/app", logs, "done", verifier_timeout=120.0) + + assert result["score"] == 1.0 + assert result["info"]["stdout"] == "verifier out" + assert (logs / "agent_answer.txt").read_text("utf-8") == "done" + + +async def test_grade_times_out_when_verifier_hangs( + single_task: Path, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + class FakeProc: + killed = False + + async def communicate(self) -> tuple[bytes, bytes]: + await asyncio.sleep(3600) + raise AssertionError("unreachable") + + def kill(self) -> None: + self.killed = True + + async def wait(self) -> int: + return -9 + + proc = FakeProc() + + async def fake_exec(*args: str, **kwargs: object) -> FakeProc: + return proc + + monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_exec) + runtime = 
HarborRuntime(single_task.parent) + + result = await runtime._grade( + "container", "/app", tmp_path / "logs", None, verifier_timeout=0.05 + ) + + assert result["isError"] is True + assert "timed out" in result["content"] + assert proc.killed + + # ─── export: HUD tasks -> Harbor task folders ─────────────────────────── _ENV_PY = """\ diff --git a/pyproject.toml b/pyproject.toml index 81973229..1bca6168 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,6 @@ build-backend = "hatchling.build" exclude = [ "docs/", "cookbooks/", - "integrations/", "**/checkpoints/", "**/*.safetensors", "**/*.ckpt", @@ -85,6 +84,7 @@ allow-direct-references = true [tool.hatch.build.targets.sdist] include = [ "hud/**", + "integrations/**", "README.md", "LICENSE", "pyproject.toml" @@ -102,7 +102,7 @@ exclude = [ ] [tool.hatch.build.targets.wheel] -packages = ["hud"] +packages = ["hud", "integrations"] # Ensure py.typed is included in the package [tool.hatch.build.targets.wheel.force-include]