From 2b5aefc4b72bc615e7f6616b16e05ab89a3212a2 Mon Sep 17 00:00:00 2001 From: Nancy Date: Sun, 28 Jun 2026 19:15:45 -0700 Subject: [PATCH 1/3] Add Harbor runtime support and split the code to make it cleaner --- docs/v6/advanced/harbor-convert.mdx | 23 +- docs/v6/reference/cli.mdx | 3 +- hud/cli/eval.py | 44 +- hud/cli/tests/test_eval_config.py | 41 ++ integrations/__init__.py | 5 +- integrations/harbor.py | 85 +--- integrations/harbor_common.py | 70 +++ integrations/harbor_runtime.py | 679 ++++++++++++++++++++++++++++ integrations/tests/test_harbor.py | 175 ++++++- pyproject.toml | 4 +- 10 files changed, 1040 insertions(+), 89 deletions(-) create mode 100644 integrations/harbor_common.py create mode 100644 integrations/harbor_runtime.py diff --git a/docs/v6/advanced/harbor-convert.mdx b/docs/v6/advanced/harbor-convert.mdx index 5ddb9f4f9..1680fab16 100644 --- a/docs/v6/advanced/harbor-convert.mdx +++ b/docs/v6/advanced/harbor-convert.mdx @@ -26,22 +26,29 @@ directly - one row per task dir (`id` = the dir name), sharing one declarative ```python from integrations.harbor import detect, load -assert detect("./terminal-bench") -taskset = load("./terminal-bench") +assert detect("./harbor_tasks") +taskset = load("./harbor_tasks") for task in taskset: print(task.env, task.id) ``` -Like every task row, the result carries no placement. Run it by supplying one - -today that means a substrate already serving the control channel -(`runtime=Runtime(url)`); a docker provider that builds and runs each task's -`environment/` image is the planned follow-up: +Like every task row, the result carries no placement. Run it by supplying one. 
+For local Docker-backed Harbor execution, use `HarborRuntime`; it builds the +task's `environment/` image, runs a fresh container, exposes the workspace +through HUD's normal shell capability, and grades by running `tests/test.sh`: ```python -from hud import Runtime +from integrations.harbor import HarborRuntime -job = await taskset.run(agent, runtime=Runtime("tcp://127.0.0.1:8765")) +job = await taskset.run(agent, runtime=HarborRuntime("./harbor_tasks")) +``` + +The eval CLI also detects local Harbor task directories and datasets when using +local runtime placement: + +```bash +hud eval ./harbor_tasks claude --task-ids cancel-async-tasks --max-steps 30 ``` ## Export HUD tasks to Harbor diff --git a/docs/v6/reference/cli.mdx b/docs/v6/reference/cli.mdx index 6a5f51bbd..852963982 100644 --- a/docs/v6/reference/cli.mdx +++ b/docs/v6/reference/cli.mdx @@ -133,7 +133,8 @@ hud sync env # sync environment metadata ``` External benchmark formats (currently Harbor) load directly into the runtime -as `Taskset`s - no conversion step. See [Harbor interop](/v6/advanced/harbor-convert). +as `Taskset`s - no conversion step. Local Harbor directories run with the Harbor +Docker-backed runtime provider. See [Harbor interop](/v6/advanced/harbor-convert). 
## Inspect diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 39afd6edf..9978eb402 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -728,7 +728,32 @@ def _spawn_target(source: Path) -> Path: return resolved.parent -def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any: +def _is_harbor_source(source_path: Path | None) -> bool: + if source_path is None or not source_path.exists(): + return False + if not source_path.is_dir(): + return False + from integrations.harbor import detect + + return detect(source_path) + + +def _load_local_taskset(source_path: Path) -> tuple[Any, str]: + from hud.eval import Taskset + + if _is_harbor_source(source_path): + from integrations.harbor import load + + return load(source_path), "harbor" + return Taskset.from_file(source_path), "hud" + + +def _resolve_placement( + cfg: EvalConfig, + source_path: Path | None, + *, + source_kind: str = "hud", +) -> Any: """Map the config's ``runtime`` onto a placement for ``Taskset.run``. "local" spawns each row's env from the source next to the tasks file; @@ -744,6 +769,10 @@ def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any: if cfg.runtime == "local": if source_path is None: raise ValueError("local placement requires a local source path") + if source_kind == "harbor": + from integrations.harbor import HarborRuntime + + return HarborRuntime(source_path) return LocalRuntime(_spawn_target(source_path)) if cfg.runtime == "hud": require_api_key("run HUD runtime tunnel evals") @@ -767,18 +796,19 @@ async def _run_evaluation(cfg: EvalConfig) -> Any: if cfg.source is None or cfg.agent_type is None: raise ValueError("source and agent_type must be set") - from hud.eval import Taskset - source_path = Path(cfg.source) is_local = source_path.exists() + source_kind = "api" if is_local: hud_console.info(f"Loading tasks from: {cfg.source}") try: - taskset = Taskset.from_file(source_path) + taskset, source_kind = _load_local_taskset(source_path) except Exception as 
e: hud_console.error(f"Failed to load tasks from {cfg.source}: {e}") raise typer.Exit(1) from e else: + from hud.eval import Taskset + hud_console.info(f"Loading platform taskset: {cfg.source}") try: taskset = Taskset.from_api(cfg.source) @@ -832,7 +862,11 @@ async def _run_evaluation(cfg: EvalConfig) -> Any: ) agent = _build_agent(cfg) - placement = _resolve_placement(cfg, source_path if is_local else None) + placement = _resolve_placement( + cfg, + source_path if is_local else None, + source_kind=source_kind, + ) job = await taskset.run( agent, diff --git a/hud/cli/tests/test_eval_config.py b/hud/cli/tests/test_eval_config.py index 6b94f0b23..e1183860c 100644 --- a/hud/cli/tests/test_eval_config.py +++ b/hud/cli/tests/test_eval_config.py @@ -20,6 +20,23 @@ _ARN = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/anthropic.claude" +def _write_harbor_task(root: Path, name: str = "demo-task") -> Path: + task = root / name + (task / "environment").mkdir(parents=True) + (task / "tests").mkdir() + (task / "instruction.md").write_text("Fix the demo task.\n", encoding="utf-8") + (task / "task.toml").write_text( + 'schema_version = "1.3"\n\n[task]\nname = "demo/demo-task"\n', + encoding="utf-8", + ) + (task / "environment" / "Dockerfile").write_text("FROM python:3.12-slim\n", encoding="utf-8") + (task / "tests" / "test.sh").write_text( + "#!/usr/bin/env bash\nmkdir -p /logs/verifier\necho 1 > /logs/verifier/reward.txt\n", + encoding="utf-8", + ) + return task + + def test_is_bedrock_arn() -> None: assert _is_bedrock_arn(_ARN) is True assert _is_bedrock_arn("claude-sonnet-4-6") is False @@ -136,6 +153,30 @@ def test_resolve_placement_runtime_hud_uses_tunnel( assert isinstance(placement, HUDRuntime) +def test_load_local_taskset_uses_harbor_loader_for_harbor_dirs(tmp_path: Path) -> None: + _write_harbor_task(tmp_path) + + taskset, source_kind = eval_mod._load_local_taskset(tmp_path) + + assert source_kind == "harbor" + assert len(taskset) == 1 + assert 
taskset["demo-task"].id == "demo-task" + + +def test_resolve_placement_local_harbor_uses_harbor_runtime(tmp_path: Path) -> None: + from integrations.harbor import HarborRuntime + + _write_harbor_task(tmp_path) + + placement = eval_mod._resolve_placement( + EvalConfig(runtime="local"), + tmp_path, + source_kind="harbor", + ) + + assert isinstance(placement, HarborRuntime) + + def test_resolve_placement_remote_uses_hosted_runtime( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, diff --git a/integrations/__init__.py b/integrations/__init__.py index c8549e0fe..e4a817b4e 100644 --- a/integrations/__init__.py +++ b/integrations/__init__.py @@ -8,8 +8,9 @@ This package lives outside ``hud`` on purpose: each module is a recipe built **only on the public SDK surface** (``Environment``, ``Task``, ``Taskset``, ``Runtime``) — that constraint is the proof the core is -flexible. Copy a module into your project or run it from a checkout; nothing -in the SDK or CLI imports it. +flexible. Copy a module into your project or run it from a checkout. The CLI may +call selected integrations explicitly for polished interop paths, but the +integration contract itself stays independent of private SDK hooks. The contract: an integration module exposes ``detect(path) -> bool`` and ``load(path) -> Taskset``. Placement stays an execution-time concern — loaders diff --git a/integrations/harbor.py b/integrations/harbor.py index 497711e37..903426251 100644 --- a/integrations/harbor.py +++ b/integrations/harbor.py @@ -11,11 +11,9 @@ :func:`load` parses a task dir (or a dataset of them) into rows sharing one env name per distinct ``environment/`` build context — no codegen, no -roundtrip. Like every row, the result is runnable -once a placement is supplied (``runtime=Runtime(url)`` against a served substrate -today). Providers receive the row being placed, so a docker provider that -builds and runs each row's ``environment/`` image is the named follow-up — -expressible without engine changes. 
+roundtrip. Like every row, the result is runnable once a placement is supplied. +Use :class:`HarborRuntime` for local Docker-backed execution of Harbor tasks, or +``runtime=Runtime(url)`` to attach to a substrate served elsewhere. :func:`export` is the reverse direction: turn a HUD task source into self-contained Harbor task folders (``task.toml`` + ``instruction.md`` + @@ -40,19 +38,23 @@ from __future__ import annotations -import hashlib import json import logging -import re import shutil -import tomllib -from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any from hud.environment import Environment from hud.environment.server import TaskRunner from hud.eval import Task, Taskset +from integrations.harbor_common import ( + _HarborTask, + _is_harbor_task, + _parse_task, + _slugify, + _task_dirs, +) +from integrations.harbor_runtime import HarborRuntime if TYPE_CHECKING: from collections.abc import Callable @@ -74,18 +76,12 @@ "__pycache__", "*.pyc", ".git", ".venv", "venv", "*.egg-info", ".pytest_cache" ) - # ─── load: Harbor dirs -> Taskset ────────────────────────────────────── def detect(path: str | Path) -> bool: """True when *path* is a Harbor task dir or a dataset of them.""" - root = Path(path) - if _is_harbor_task(root): - return True - if root.is_dir(): - return any(_is_harbor_task(d) for d in root.iterdir() if d.is_dir()) - return False + return bool(_task_dirs(path)) def load(path: str | Path) -> Taskset: @@ -96,12 +92,8 @@ def load(path: str | Path) -> Taskset: context (content-hashed), derived from the dataset name. 
""" root = Path(path).resolve() - if _is_harbor_task(root): - task_dirs = [root] - dataset_name = root.parent.name - else: - task_dirs = sorted(d for d in root.iterdir() if d.is_dir() and _is_harbor_task(d)) - dataset_name = root.name + task_dirs = _task_dirs(root) + dataset_name = root.parent.name if _is_harbor_task(root) else root.name if not task_dirs: raise ValueError(f"no Harbor tasks found in {path}") @@ -126,54 +118,6 @@ def load(path: str | Path) -> Taskset: return Taskset(base_name, tasks) -def _slugify(name: str) -> str: - """A valid env name (lowercase ``[a-z0-9-]``) from a dataset dir name.""" - normalized = re.sub(r"[^a-z0-9-]", "", name.strip().lower().replace(" ", "-").replace("_", "-")) - return re.sub(r"-+", "-", normalized).strip("-") or "harbor" - - -def _is_harbor_task(path: Path) -> bool: - return path.is_dir() and (path / "task.toml").exists() and (path / "instruction.md").exists() - - -def _hash_directory(path: Path) -> str: - """Content-hash a directory for grouping tasks by identical environments.""" - hasher = hashlib.sha256() - if not path.exists(): - return "empty" - for file_path in sorted(path.rglob("*")): - if file_path.is_file(): - hasher.update(str(file_path.relative_to(path)).encode()) - hasher.update(file_path.read_bytes()) - return hasher.hexdigest()[:16] - - -@dataclass(frozen=True, slots=True) -class _HarborTask: - """One parsed Harbor task dir.""" - - task_id: str - config: dict[str, Any] - env_hash: str - - -def _parse_task(task_dir: Path) -> _HarborTask | None: - if not (task_dir / "instruction.md").is_file(): - LOGGER.warning("failed to read instruction.md in %s", task_dir) - return None - try: - config: dict[str, Any] = tomllib.loads((task_dir / "task.toml").read_text("utf-8")) - except (OSError, tomllib.TOMLDecodeError): - LOGGER.warning("failed to parse task.toml in %s", task_dir) - config = {} - env_dir = task_dir / "environment" - return _HarborTask( - task_id=task_dir.name, - config=config, - 
env_hash=_hash_directory(env_dir) if env_dir.exists() else "no-env", - ) - - # ─── export: HUD tasks -> Harbor task folders ─────────────────────────── @@ -443,6 +387,7 @@ async def export( "ALLOWED_PROTOCOLS", "CONTROL_PORT", "DEFAULT_ANSWER_FILE", + "HarborRuntime", "detect", "export", "load", diff --git a/integrations/harbor_common.py b/integrations/harbor_common.py new file mode 100644 index 000000000..53294e091 --- /dev/null +++ b/integrations/harbor_common.py @@ -0,0 +1,70 @@ +"""Shared helpers for Harbor task integration.""" + +from __future__ import annotations + +import hashlib +import logging +import re +import tomllib +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +LOGGER = logging.getLogger(__name__) + + +def _slugify(name: str) -> str: + """A valid env name (lowercase ``[a-z0-9-]``) from a dataset dir name.""" + normalized = re.sub(r"[^a-z0-9-]", "", name.strip().lower().replace(" ", "-").replace("_", "-")) + return re.sub(r"-+", "-", normalized).strip("-") or "harbor" + + +def _is_harbor_task(path: Path) -> bool: + return path.is_dir() and (path / "task.toml").exists() and (path / "instruction.md").exists() + + +def _task_dirs(path: str | Path) -> list[Path]: + root = Path(path) + if _is_harbor_task(root): + return [root] + if root.is_dir(): + return sorted(d for d in root.iterdir() if d.is_dir() and _is_harbor_task(d)) + return [] + + +def _hash_directory(path: Path) -> str: + """Content-hash a directory for grouping tasks by identical environments.""" + hasher = hashlib.sha256() + if not path.exists(): + return "empty" + for file_path in sorted(path.rglob("*")): + if file_path.is_file(): + hasher.update(str(file_path.relative_to(path)).encode()) + hasher.update(file_path.read_bytes()) + return hasher.hexdigest()[:16] + + +@dataclass(frozen=True, slots=True) +class _HarborTask: + """One parsed Harbor task dir.""" + + task_id: str + config: dict[str, Any] + env_hash: str + + +def _parse_task(task_dir: Path) -> 
_HarborTask | None: + if not (task_dir / "instruction.md").is_file(): + LOGGER.warning("failed to read instruction.md in %s", task_dir) + return None + try: + config: dict[str, Any] = tomllib.loads((task_dir / "task.toml").read_text("utf-8")) + except (OSError, tomllib.TOMLDecodeError): + LOGGER.warning("failed to parse task.toml in %s", task_dir) + config = {} + env_dir = task_dir / "environment" + return _HarborTask( + task_id=task_dir.name, + config=config, + env_hash=_hash_directory(env_dir) if env_dir.exists() else "no-env", + ) diff --git a/integrations/harbor_runtime.py b/integrations/harbor_runtime.py new file mode 100644 index 000000000..872399bc1 --- /dev/null +++ b/integrations/harbor_runtime.py @@ -0,0 +1,679 @@ +"""Local Docker-backed runtime for Harbor task directories.""" + +from __future__ import annotations + +import contextlib +import json +import shlex +import shutil +import tempfile +import uuid +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from hud.environment import Environment +from hud.environment.workspace import Workspace +from integrations.harbor_common import _hash_directory, _slugify, _task_dirs + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator, AsyncIterator + + import asyncssh + + from hud.eval import Task + from hud.eval.runtime import Runtime + + +class HarborRuntime: + """Run Harbor task directories through HUD's local rollout engine. + + The provider builds the Harbor task's ``environment/`` Docker context, runs + a fresh container with a writable host workspace mounted at ``/app``, and + serves a small HUD control channel from the host process. If the task ships a + ``docker-compose.yaml``/``.yml``, the provider starts it with an overlay that + keeps the ``main`` service idle while preserving sidecars such as databases. + The agent receives normal HUD SSH/SFTP access; shell commands execute inside + the main container via ``docker exec`` while file transfer edits the mounted + host workspace. 
Grading runs the Harbor ``tests/test.sh`` inside the same + main container and reads ``/logs/verifier/reward.json`` or ``reward.txt``. + """ + + def __init__( + self, + path: str | Path, + *, + ready_timeout: float = 120.0, + ) -> None: + self.root = Path(path).resolve() + self.ready_timeout = ready_timeout + self._task_dirs = {task_dir.name: task_dir for task_dir in _task_dirs(self.root)} + if not self._task_dirs: + raise ValueError(f"no Harbor tasks found in {path}") + self._image_cache: dict[Path, str] = {} + + @contextlib.asynccontextmanager + async def __call__(self, task: Task) -> AsyncIterator[Runtime]: + from hud.eval.runtime import Runtime, _local + + task_dir = self._task_dirs.get(task.id) + if task_dir is None: + raise KeyError(f"HarborRuntime has no task directory for {task.id!r}") + env_dir = task_dir / "environment" + tests_dir = task_dir / "tests" + if not (env_dir / "Dockerfile").is_file(): + raise FileNotFoundError(f"Harbor task {task.id!r} has no environment/Dockerfile") + if not (tests_dir / "test.sh").is_file(): + raise FileNotFoundError(f"Harbor task {task.id!r} has no tests/test.sh") + + with tempfile.TemporaryDirectory(prefix=f"hud-harbor-{_slugify(task.id)}-") as tmp: + tmp_path = Path(tmp) + workspace = tmp_path / "workspace" + logs = tmp_path / "logs" + shutil.copytree(env_dir, workspace) + _ensure_start_script(workspace) + _ensure_dockerfile_created_dirs(workspace) + preserved_paths = _preserved_image_paths(workspace) + logs.mkdir(parents=True, exist_ok=True) + + compose_file = _compose_file(env_dir) + if compose_file is not None: + async with self._compose_container( + task, + compose_file, + workspace, + tests_dir, + logs, + preserved_paths, + ) as ( + container, + provider, + ): + env = self._environment_for(task, task_dir, workspace, logs, container) + async with _local(env) as runtime: + yield Runtime( + runtime.url, + params={ + **runtime.params, + "provider": provider, + "container": container, + "ready_timeout": 
self.ready_timeout, + }, + config=runtime.config, + ) + else: + async with self._single_container( + task, + task_dir, + workspace, + tests_dir, + logs, + preserved_paths, + ) as ( + container, + provider, + ): + env = self._environment_for(task, task_dir, workspace, logs, container) + async with _local(env) as runtime: + yield Runtime( + runtime.url, + params={ + **runtime.params, + "provider": provider, + "container": container, + "ready_timeout": self.ready_timeout, + }, + config=runtime.config, + ) + + @contextlib.asynccontextmanager + async def _single_container( + self, + task: Task, + task_dir: Path, + workspace: Path, + tests_dir: Path, + logs: Path, + preserved_paths: list[str], + ) -> AsyncIterator[tuple[str, str]]: + from hud.eval.runtime import _docker + + image = await self._image_for(task_dir) + env_dir = task_dir / "environment" + await _restore_image_generated_files(image, workspace) + container_name = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}" + preserved_volume_args = [arg for path in preserved_paths for arg in ("--volume", path)] + out, _ = await _docker( + "run", + "--detach", + "--name", + container_name, + "--workdir", + "/app", + "--entrypoint", + "sleep", + "--volume", + f"{workspace}:/app", + "--volume", + f"{tests_dir}:/tests:ro", + "--volume", + f"{logs}:/logs", + *preserved_volume_args, + image, + "infinity", + ) + container = out.strip() + try: + yield container, "harbor" + finally: + await _release_mount_permissions(container) + await _docker("rm", "--force", "--volumes", container, check=False) + await _docker("image", "rm", image, check=False) + self._image_cache.pop(env_dir, None) + + @contextlib.asynccontextmanager + async def _compose_container( + self, + task: Task, + compose_file: Path, + workspace: Path, + tests_dir: Path, + logs: Path, + preserved_paths: list[str], + ) -> AsyncIterator[tuple[str, str]]: + from hud.eval.runtime import _docker + + project = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}" + 
overlay = workspace.parent / "compose.hud.yaml" + overlay.write_text( + _compose_overlay( + workspace=workspace, + tests_dir=tests_dir, + logs=logs, + preserved_paths=preserved_paths, + ), + encoding="utf-8", + newline="\n", + ) + compose_args = ("compose", "-f", str(compose_file), "-f", str(overlay), "-p", project) + await _docker(*compose_args, "up", "--detach", "--build") + out, _ = await _docker(*compose_args, "ps", "-q", "main") + container = out.strip() + if not container: + raise RuntimeError(f"docker compose project {project} did not create a main service") + try: + yield container, "harbor-compose" + finally: + await _release_mount_permissions(container) + await _docker( + *compose_args, + "down", + "--volumes", + "--remove-orphans", + "--rmi", + "local", + check=False, + ) + + async def _image_for(self, task_dir: Path) -> str: + from hud.eval.runtime import _docker + + env_dir = task_dir / "environment" + cached = self._image_cache.get(env_dir) + if cached is not None: + return cached + tag = f"hud-harbor:{_hash_directory(env_dir)}" + await _docker("build", "--tag", tag, str(env_dir)) + self._image_cache[env_dir] = tag + return tag + + def _environment_for( + self, + task: Task, + task_dir: Path, + workspace: Path, + logs: Path, + container: str, + ) -> Environment: + env = Environment(task.env) + workspace_daemon = _DockerWorkspace(workspace, container=container, guest_path="/app") + + @env.initialize + async def _up() -> None: + await workspace_daemon.start() + env.add_capability(workspace_daemon.capability("shell")) + + @env.shutdown + async def _down() -> None: + await workspace_daemon.stop() + + @env.template(id=task.id, description=f"Harbor task {task.id}") + async def _run_harbor_task() -> AsyncGenerator[Any, Any]: + answer = yield (task_dir / "instruction.md").read_text(encoding="utf-8") + yield await self._grade(container, logs, answer) + + return env + + async def _grade(self, container: str, logs: Path, answer: Any) -> dict[str, Any]: + from 
hud.eval.runtime import _docker + + answer_file = logs / "agent_answer.txt" + answer_file.parent.mkdir(parents=True, exist_ok=True) + answer_file.write_text("" if answer is None else str(answer), encoding="utf-8") + out, err = await _docker( + "exec", + "--workdir", + "/app", + container, + "bash", + "/tests/test.sh", + check=False, + ) + reward, info = _read_harbor_reward(logs / "verifier") + info.update( + { + "stdout": out[-4000:], + "stderr": err[-4000:], + } + ) + if reward is None: + return { + "score": 0.0, + "isError": True, + "content": "Harbor verifier did not write reward.json or reward.txt", + "info": info, + } + return {"score": reward, "info": info} + + +class _DockerWorkspace(Workspace): + """Workspace SFTP over a host bind mount, shell commands via docker exec.""" + + def __init__(self, *args: Any, container: str, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self._container = container + + async def _handle_process(self, process: asyncssh.SSHServerProcess[bytes]) -> None: + import asyncio + + command = process.command or "bash -l" + proc = await asyncio.create_subprocess_exec( + "docker", + "exec", + "-i", + "--workdir", + self._guest_path, + self._container, + "bash", + "-lc", + command, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout_data, stderr_data = await asyncio.wait_for(proc.communicate(), timeout=3600.0) + except TimeoutError: + proc.kill() + await proc.wait() + process.stderr.write(b"workspace: command timed out after 3600s\n") + process.exit(1) + return + except asyncio.CancelledError: + proc.kill() + await proc.wait() + raise + + if stdout_data: + process.stdout.write(stdout_data) + if stderr_data: + process.stderr.write(stderr_data) + process.exit(proc.returncode if proc.returncode is not None else 0) + + +def _read_harbor_reward(verifier_logs: Path) -> tuple[float | None, dict[str, Any]]: + reward_json = verifier_logs / "reward.json" + if reward_json.is_file(): + data = 
json.loads(reward_json.read_text(encoding="utf-8")) + if isinstance(data, int | float): + return float(data), {"reward_file": str(reward_json)} + if isinstance(data, dict): + for key in ("reward", "score"): + value = data.get(key) + if isinstance(value, int | float): + return float(value), {"reward_file": str(reward_json), "reward_json": data} + numeric = [float(value) for value in data.values() if isinstance(value, int | float)] + if numeric: + return sum(numeric) / len(numeric), { + "reward_file": str(reward_json), + "reward_json": data, + } + return None, {"reward_file": str(reward_json), "reward_parse_error": "no numeric reward"} + + reward_txt = verifier_logs / "reward.txt" + if reward_txt.is_file(): + text = reward_txt.read_text(encoding="utf-8").strip() + try: + return float(text), {"reward_file": str(reward_txt)} + except ValueError: + return None, {"reward_file": str(reward_txt), "reward_parse_error": text} + + return None, {} + + +async def _release_mount_permissions(container: str) -> None: + """Let the host user delete files that container-root created in mounts.""" + from hud.eval.runtime import _docker + + await _docker( + "exec", + container, + "sh", + "-lc", + "chmod -R a+rwX /app /logs 2>/dev/null || true", + check=False, + ) + + +def _compose_file(env_dir: Path) -> Path | None: + for name in ("docker-compose.yaml", "docker-compose.yml", "compose.yaml", "compose.yml"): + path = env_dir / name + if path.is_file(): + return path + return None + + +def _compose_overlay( + *, + workspace: Path, + tests_dir: Path, + logs: Path, + preserved_paths: list[str] | None = None, +) -> str: + """Compose override that keeps Harbor's main service idle for agent work.""" + preserved_paths = preserved_paths or [] + volume_lines = [ + f" - {json.dumps(f'{workspace}:/app')}", + f" - {json.dumps(f'{tests_dir}:/tests:ro')}", + f" - {json.dumps(f'{logs}:/logs')}", + ] + volume_lines.extend(f" - {json.dumps(path)}" for path in preserved_paths) + return "\n".join( + [ + 
"services:", + " main:", + " build:", + f" context: {json.dumps(str(workspace))}", + " working_dir: /app", + ' entrypoint: ["sleep"]', + ' command: ["infinity"]', + " volumes:", + *volume_lines, + "", + ], + ) + + +def _preserved_image_paths(workspace: Path) -> list[str]: + """Image-populated subpaths that should survive the editable ``/app`` mount.""" + paths: list[str] = [] + if (workspace / "package.json").is_file(): + paths.append("/app/node_modules") + if _node_build_output_is_image_populated(workspace, "dist"): + paths.append("/app/dist") + if (workspace / "composer.json").is_file(): + paths.append("/app/vendor") + return paths + + +def _node_build_output_is_image_populated(workspace: Path, dirname: str) -> bool: + if (workspace / dirname).exists(): + return False + dockerfile = workspace / "Dockerfile" + if not dockerfile.is_file(): + return False + dockerfile_text = dockerfile.read_text(encoding="utf-8") + entrypoint = workspace / "docker-entrypoint.sh" + entrypoint_text = entrypoint.read_text(encoding="utf-8") if entrypoint.is_file() else "" + return ( + "npm run build" in dockerfile_text + or f"/app/{dirname}" in dockerfile_text + or f" {dirname}/" in entrypoint_text + or f" {dirname}" in entrypoint_text + ) + + +def _ensure_start_script(workspace: Path) -> None: + """Preserve build-generated /app/start_app.sh hidden by the workspace mount.""" + start = workspace / "start_app.sh" + entrypoint = workspace / "docker-entrypoint.sh" + if not entrypoint.is_file(): + _restore_dockerfile_script(workspace, entrypoint, "/app/docker-entrypoint.sh") + if entrypoint.is_file(): + entrypoint.chmod(entrypoint.stat().st_mode | 0o111) + if start.exists(): + start.chmod(start.stat().st_mode | 0o111) + return + text = _script_from_dockerfile(workspace, "/app/start_app.sh") + if text is None and entrypoint.is_file(): + text = "#!/usr/bin/env bash\nset -e\ncd /app\nexec sh /app/docker-entrypoint.sh\n" + if text is None: + return + start.write_text(text, encoding="utf-8", 
newline="\n") + start.chmod(0o755) + + +def _ensure_dockerfile_created_dirs(workspace: Path) -> None: + """Recreate simple Dockerfile-created ``/app`` dirs hidden by the bind mount.""" + for path in _dockerfile_created_app_dirs(workspace): + path.mkdir(parents=True, exist_ok=True) + + +async def _restore_image_generated_files(image: str, workspace: Path) -> None: + """Copy selected build-generated files from the image into the workspace. + + Some Harbor images initialize file-backed databases during ``docker build``. + The editable ``/app`` bind mount hides those generated files, so copy them + out of the built image before starting the task container. + """ + container_paths = _dockerfile_declared_generated_app_files(workspace) + if not container_paths: + return + + from hud.eval.runtime import _docker + + out, _ = await _docker("create", image, "true") + container = out.strip() + try: + for container_path in container_paths: + host_path = _host_path_for_app_file(workspace, container_path) + if host_path is None or host_path.exists(): + continue + host_path.parent.mkdir(parents=True, exist_ok=True) + await _docker("cp", f"{container}:{container_path}", str(host_path), check=False) + finally: + await _docker("rm", "--force", "--volumes", container, check=False) + + +def _dockerfile_declared_generated_app_files(workspace: Path) -> list[str]: + """Find Dockerfile-declared file-backed DB paths under ``/app``.""" + dockerfile = workspace / "Dockerfile" + if not dockerfile.is_file(): + return [] + + paths: list[str] = [] + for instruction in _dockerfile_logical_lines(dockerfile.read_text(encoding="utf-8")): + stripped = instruction.strip() + if not stripped.startswith("ENV "): + continue + for key, value in _env_pairs(stripped.removeprefix("ENV ").strip()): + if not _is_generated_db_env_key(key): + continue + if _is_app_database_path(value): + paths.append(value) + return list(dict.fromkeys(paths)) + + +def _env_pairs(body: str) -> list[tuple[str, str]]: + try: + tokens 
= shlex.split(body) + except ValueError: + return [] + if not tokens: + return [] + + pairs: list[tuple[str, str]] = [] + if all("=" in token for token in tokens): + for token in tokens: + key, value = token.split("=", 1) + pairs.append((key, value)) + return pairs + + if len(tokens) >= 2: + pairs.append((tokens[0], tokens[1])) + return pairs + + +def _is_generated_db_env_key(key: str) -> bool: + normalized = key.upper() + return normalized in { + "DB_PATH", + "DATABASE_PATH", + "SQLITE_PATH", + "SQLITE_DB_PATH", + "SQLITE_DATABASE_PATH", + } or normalized.endswith(("_DB_PATH", "_DATABASE_PATH", "_SQLITE_PATH")) + + +def _is_app_database_path(path: str) -> bool: + lowered = path.lower() + return lowered.startswith("/app/") and lowered.endswith((".db", ".sqlite", ".sqlite3")) + + +def _host_path_for_app_file(workspace: Path, container_path: str) -> Path | None: + if not container_path.startswith("/app/"): + return None + rel = container_path.removeprefix("/app/") + if rel.startswith("../") or "/../" in rel or rel == "..": + return None + return workspace / rel + + +def _dockerfile_created_app_dirs(workspace: Path) -> list[Path]: + dockerfile = workspace / "Dockerfile" + if not dockerfile.is_file(): + return [] + paths: list[Path] = [] + for instruction in _dockerfile_logical_lines(dockerfile.read_text(encoding="utf-8")): + stripped = instruction.strip() + if not stripped.startswith("RUN "): + continue + command = stripped.removeprefix("RUN ").strip() + try: + tokens = shlex.split(command) + except ValueError: + continue + index = 0 + while index < len(tokens): + if tokens[index] != "mkdir": + index += 1 + continue + index += 1 + while index < len(tokens): + token = tokens[index] + if token in {"&&", "||", ";"}: + break + if token.startswith("-"): + index += 1 + continue + host_path = _app_dir_from_mkdir_token(workspace, token) + if host_path is not None: + paths.append(host_path) + index += 1 + return paths + + +def _app_dir_from_mkdir_token(workspace: Path, token: 
str) -> Path | None: + if not token or any(char in token for char in "$*?["): + return None + raw = token.rstrip("/") + if raw in {"", "."}: + return None + if raw.startswith("/app/"): + rel = raw.removeprefix("/app/") + elif raw == "/app": + return workspace + elif raw.startswith("/"): + return None + else: + rel = raw + if rel.startswith("../") or "/../" in rel or rel == "..": + return None + return workspace / rel + + +def _restore_dockerfile_script(workspace: Path, host_path: Path, container_path: str) -> None: + """Restore a Dockerfile-generated script hidden by a bind mount.""" + text = _script_from_dockerfile(workspace, container_path) + if text is None: + return + host_path.write_text(text, encoding="utf-8", newline="\n") + host_path.chmod(0o755) + + +def _script_from_dockerfile(workspace: Path, container_path: str) -> str | None: + """Extract a Dockerfile-generated script from a simple ``RUN printf`` command.""" + dockerfile = workspace / "Dockerfile" + if not dockerfile.is_file(): + return None + for instruction in _dockerfile_logical_lines(dockerfile.read_text(encoding="utf-8")): + stripped = instruction.strip() + if not stripped.startswith("RUN ") or container_path not in stripped: + continue + command = stripped.removeprefix("RUN ").strip() + try: + tokens = shlex.split(command) + except ValueError: + continue + redirect = _redirect_index(tokens, container_path) + if redirect is None or redirect < 2 or tokens[0] != "printf": + continue + text = _script_from_printf_args(tokens[1:redirect]) + if text is not None: + return text + return None + + +def _redirect_index(tokens: list[str], target: str) -> int | None: + for index, token in enumerate(tokens): + if token in {">", ">>"} and index + 1 < len(tokens) and tokens[index + 1] == target: + return index + if token in {f">{target}", f">>{target}"}: + return index + return None + + +def _script_from_printf_args(args: list[str]) -> str | None: + if not args: + return None + if args[0] in {"%s\\n", "%s\n"}: + 
if len(args) < 2: + return None + return "\n".join(args[1:]) + "\n" + if len(args) == 1: + return args[0].replace("\\r", "\r").replace("\\n", "\n").replace("\\t", "\t") + return None + + +def _dockerfile_logical_lines(text: str) -> list[str]: + """Join backslash-continued Dockerfile lines for simple instruction parsing.""" + lines: list[str] = [] + current = "" + for raw_line in text.splitlines(): + line = raw_line.rstrip() + if line.endswith("\\"): + current += line[:-1] + " " + continue + lines.append(current + line) + current = "" + if current: + lines.append(current) + return lines diff --git a/integrations/tests/test_harbor.py b/integrations/tests/test_harbor.py index b7343b517..c67b7d924 100644 --- a/integrations/tests/test_harbor.py +++ b/integrations/tests/test_harbor.py @@ -7,7 +7,16 @@ import pytest -from integrations.harbor import detect, export, load +from integrations.harbor import HarborRuntime, detect, export, load +from integrations.harbor_runtime import ( + _compose_file, + _compose_overlay, + _dockerfile_declared_generated_app_files, + _ensure_dockerfile_created_dirs, + _ensure_start_script, + _host_path_for_app_file, + _preserved_image_paths, +) from .conftest import make_harbor_task @@ -74,6 +83,170 @@ def test_load_skips_unparseable_toml_but_keeps_the_rest(tmp_path: Path) -> None: assert {task.id for task in taskset} == {"good", "broken"} +def test_harbor_runtime_accepts_dataset_dirs(single_task: Path) -> None: + runtime = HarborRuntime(single_task.parent) + + assert single_task.name in runtime._task_dirs + + +def test_compose_file_detection_prefers_harbor_names(tmp_path: Path) -> None: + env = tmp_path / "environment" + env.mkdir() + compose = env / "docker-compose.yaml" + compose.write_text("services: {}\n", encoding="utf-8") + + assert _compose_file(env) == compose + + +def test_compose_overlay_mounts_main_workspace_tests_and_logs(tmp_path: Path) -> None: + overlay = _compose_overlay( + workspace=tmp_path / "workspace", + tests_dir=tmp_path 
def test_ensure_start_script_recreates_build_generated_entrypoint(tmp_path: Path) -> None:
    """With only ``docker-entrypoint.sh`` present, ``start_app.sh`` is generated."""
    root = tmp_path / "workspace"
    root.mkdir()
    entrypoint = root / "docker-entrypoint.sh"
    entrypoint.write_text("echo start\n", encoding="utf-8")

    _ensure_start_script(root)

    generated = root / "start_app.sh"
    assert generated.exists()
    # The generated launcher must hand control to the existing entrypoint.
    assert "exec sh /app/docker-entrypoint.sh" in generated.read_text(encoding="utf-8")
def test_ensure_dockerfile_created_dirs_restores_app_dirs(tmp_path: Path) -> None:
    """Directories the Dockerfile creates under ``/app`` are recreated in the workspace."""
    root = tmp_path / "workspace"
    root.mkdir()
    dockerfile_text = (
        "FROM node:20-slim\n"
        "RUN mkdir -p static/uploads /app/tmp/cache && mkdir -p /var/lib/ignored\n"
    )
    (root / "Dockerfile").write_text(dockerfile_text, encoding="utf-8")

    _ensure_dockerfile_created_dirs(root)

    # Relative and /app-rooted targets both land inside the workspace ...
    for expected in (("static", "uploads"), ("tmp", "cache")):
        assert root.joinpath(*expected).is_dir()
    # ... while absolute paths outside /app are left alone.
    assert not root.joinpath("var", "lib", "ignored").exists()
def test_dockerfile_declared_generated_app_files_ignores_non_app_or_non_db_paths(
    tmp_path: Path,
) -> None:
    """ENV values outside ``/app`` or without a database suffix are not reported."""
    root = tmp_path / "workspace"
    root.mkdir()
    dockerfile_text = (
        "FROM python:3.11-slim\n"
        "ENV DB_PATH=/var/lib/app.db CACHE_PATH=/app/cache\n"
        "ENV SOME_DATABASE_PATH=/app/data/app.txt\n"
    )
    (root / "Dockerfile").write_text(dockerfile_text, encoding="utf-8")

    declared = _dockerfile_declared_generated_app_files(root)
    outside_app = _host_path_for_app_file(root, "/tmp/app.db")

    assert declared == []
    assert outside_app is None
integration --- docs/v6/advanced/harbor-convert.mdx | 11 +- docs/v6/reference/cli.mdx | 6 +- hud/cli/eval.py | 86 ++++++---- hud/cli/tests/test_eval_config.py | 59 ++++++- integrations/__init__.py | 12 +- integrations/harbor_runtime.py | 163 ++++++++++--------- integrations/tests/test_harbor.py | 241 ++++++++++++++++++++++++++++ 7 files changed, 452 insertions(+), 126 deletions(-) diff --git a/docs/v6/advanced/harbor-convert.mdx b/docs/v6/advanced/harbor-convert.mdx index 1680fab16..21dbd721b 100644 --- a/docs/v6/advanced/harbor-convert.mdx +++ b/docs/v6/advanced/harbor-convert.mdx @@ -9,8 +9,9 @@ task dirs - is a *frontend* that loads into the same primitives (`Environment`, `Task`, `Taskset`). Integrations are **loaders, not converters**: no codegen roundtrip to run foreign tasks. The Harbor integration lives in the SDK repo at [`integrations/harbor.py`](https://github.com/hud-evals/hud-python/blob/main/integrations/harbor.py) -- a recipe built only on the public SDK surface; copy it into your project or -run it from a checkout. +- a public-surface loader that maps Harbor folders into SDK primitives. The +included `HarborRuntime` is maintained with the SDK for local Docker execution; +copy the loader into your project or run it from a checkout. 
## Prerequisites @@ -44,11 +45,11 @@ from integrations.harbor import HarborRuntime job = await taskset.run(agent, runtime=HarborRuntime("./harbor_tasks")) ``` -The eval CLI also detects local Harbor task directories and datasets when using -local runtime placement: +The eval CLI can run local Harbor task directories and datasets when you opt +into the Harbor source format: ```bash -hud eval ./harbor_tasks claude --task-ids cancel-async-tasks --max-steps 30 +hud eval ./harbor_tasks claude --format harbor --task-ids cancel-async-tasks --max-steps 30 ``` ## Export HUD tasks to Harbor diff --git a/docs/v6/reference/cli.mdx b/docs/v6/reference/cli.mdx index 852963982..8b18431b1 100644 --- a/docs/v6/reference/cli.mdx +++ b/docs/v6/reference/cli.mdx @@ -105,6 +105,7 @@ For a platform taskset, pass its name or id directly: `hud eval "My Tasks" claud | `--config`, `-c` | Agent config `key=value` (repeatable). | | `--verbose`, `-v` | Show agent logs (step progress, tool calls) for batch runs too. | | `--very-verbose`, `-vv` | Debug-level logs. | +| `--format` | Task source format: `hud` (default) or `harbor`. | | `--runtime` | Placement: `local`, `hud` (HUD runtime tunnel), or `tcp://host:port`. Defaults to `local` for a tasks file; platform tasksets default to remote hosted execution. | | `--remote` | Run the whole rollout remotely on the HUD platform. | | `--yes`, `-y` | Skip confirmation prompt. | @@ -133,8 +134,9 @@ hud sync env # sync environment metadata ``` External benchmark formats (currently Harbor) load directly into the runtime -as `Taskset`s - no conversion step. Local Harbor directories run with the Harbor -Docker-backed runtime provider. See [Harbor interop](/v6/advanced/harbor-convert). +as `Taskset`s - no conversion step. For local Harbor directories, opt in with +`--format harbor` so the CLI uses the Harbor loader and Docker-backed runtime +provider. See [Harbor interop](/v6/advanced/harbor-convert). 
@field_validator("format", mode="before")
@classmethod
def _parse_format(cls, v: Any) -> str | None:
    """Normalize the ``format`` field before validation.

    ``None`` and non-string values are returned untouched. Strings are
    stripped and lower-cased; an empty string or ``hud`` collapses to
    ``None``, any other member of ``_SOURCE_FORMATS`` is returned normalized,
    and everything else raises ``ValueError``.
    """
    if v is None or not isinstance(v, str):
        return v
    candidate = v.strip().lower()
    if candidate in ("", "hud"):
        return None
    if candidate not in _SOURCE_FORMATS:
        raise ValueError(f"Invalid format: {v}. Must be one of: {', '.join(_SOURCE_FORMATS)}")
    return candidate
def _load_local_taskset(source_path: Path, source_format: str | None) -> Any:
    """Load an on-disk task source with the loader selected by ``source_format``.

    A falsy ``source_format`` means ``hud``. The ``hud`` path loads through
    ``Taskset.from_file``; when that yields zero tasks and the Harbor
    integration's ``detect`` recognises the path, a hint suggesting
    ``--format harbor`` is printed. The ``harbor`` path delegates to the Harbor
    integration's ``load``. Any other format raises ``ValueError``.
    """
    from hud.eval import Taskset

    chosen = "hud" if not source_format else source_format
    if chosen == "harbor":
        from integrations.harbor import load

        return load(source_path)
    if chosen != "hud":
        raise ValueError(f"unsupported task source format: {chosen}")

    loaded = Taskset.from_file(source_path)
    if len(loaded) != 0:
        return loaded

    # Nothing loaded: check whether the user pointed the HUD loader at Harbor tasks.
    from integrations.harbor import detect

    if detect(source_path):
        hud_console.hint(
            f"{source_path} looks like a Harbor task directory; "
            "rerun with --format harbor to load it."
        )
    return loaded
await taskset.run( agent, @@ -922,6 +945,11 @@ def eval_command( gateway: bool = typer.Option( False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway" ), + format: str | None = typer.Option( + None, + "--format", + help="Task source format: hud (default) or harbor.", + ), runtime: str | None = typer.Option( None, "--runtime", @@ -942,6 +970,7 @@ def eval_command( hud eval "My Tasks" claude-sonnet-4-6 --full # Platform taskset, run on the platform hud eval tasks.json claude --config max_tokens=32768 hud eval tasks.json claude --gateway # Route LLM calls through HUD Gateway + hud eval ./harbor_tasks claude --format harbor # Run Harbor task dirs locally hud eval tasks.json claude-sonnet-4-6 --runtime hud # Use HUD runtime tunnel hud eval tasks.json claude-sonnet-4-6 --remote # Execute rollout remotely """ @@ -972,6 +1001,7 @@ def eval_command( group_size=group_size, config=config, gateway=gateway, + format=format, runtime=runtime, remote=remote, ) diff --git a/hud/cli/tests/test_eval_config.py b/hud/cli/tests/test_eval_config.py index e1183860c..bbd0d4f96 100644 --- a/hud/cli/tests/test_eval_config.py +++ b/hud/cli/tests/test_eval_config.py @@ -153,30 +153,77 @@ def test_resolve_placement_runtime_hud_uses_tunnel( assert isinstance(placement, HUDRuntime) -def test_load_local_taskset_uses_harbor_loader_for_harbor_dirs(tmp_path: Path) -> None: +def test_load_local_taskset_uses_hud_loader_by_default(tmp_path: Path) -> None: _write_harbor_task(tmp_path) - taskset, source_kind = eval_mod._load_local_taskset(tmp_path) + taskset = eval_mod._load_local_taskset(tmp_path, None) + + assert len(taskset) == 0 + + +def test_load_local_taskset_hints_harbor_format_on_zero_task_harbor_dir( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _write_harbor_task(tmp_path) + hints: list[str] = [] + monkeypatch.setattr(eval_mod.hud_console, "hint", lambda message, **_: hints.append(message)) + + taskset = eval_mod._load_local_taskset(tmp_path, None) + + 
def test_load_local_taskset_uses_harbor_loader_when_format_is_harbor(tmp_path: Path) -> None:
    """Passing ``harbor`` as the format loads the Harbor task written to ``tmp_path``."""
    _write_harbor_task(tmp_path)

    loaded = eval_mod._load_local_taskset(tmp_path, "harbor")

    task_count = len(loaded)
    demo_task = loaded["demo-task"]
    assert task_count == 1
    assert demo_task.id == "demo-task"
@@ -5,12 +5,12 @@ primitives. Integrations are **loaders, not converters**: no codegen roundtrip to run foreign tasks. -This package lives outside ``hud`` on purpose: each module is a recipe built -**only on the public SDK surface** (``Environment``, ``Task``, -``Taskset``, ``Runtime``) — that constraint is the proof the core is -flexible. Copy a module into your project or run it from a checkout. The CLI may -call selected integrations explicitly for polished interop paths, but the -integration contract itself stays independent of private SDK hooks. +This package lives outside ``hud`` on purpose: loaders are recipes built on the +public SDK surface (``Environment``, ``Task``, ``Taskset``, ``Runtime``). Copy a +loader into your project or run it from a checkout. The CLI may call selected +integrations explicitly for polished interop paths. A repo-maintained +integration may also expose a local provider for that explicit CLI path; that +provider is SDK implementation code, not the portable loader contract. The contract: an integration module exposes ``detect(path) -> bool`` and ``load(path) -> Taskset``. Placement stays an execution-time concern — loaders diff --git a/integrations/harbor_runtime.py b/integrations/harbor_runtime.py index 872399bc1..5d644525a 100644 --- a/integrations/harbor_runtime.py +++ b/integrations/harbor_runtime.py @@ -2,12 +2,15 @@ from __future__ import annotations +import asyncio import contextlib import json import shlex import shutil import tempfile +import tomllib import uuid +from collections.abc import AsyncGenerator # noqa: TC003 - env.template resolves this at runtime. 
from pathlib import Path from typing import TYPE_CHECKING, Any @@ -16,7 +19,7 @@ from integrations.harbor_common import _hash_directory, _slugify, _task_dirs if TYPE_CHECKING: - from collections.abc import AsyncGenerator, AsyncIterator + from collections.abc import AsyncIterator import asyncssh @@ -35,7 +38,8 @@ class HarborRuntime: The agent receives normal HUD SSH/SFTP access; shell commands execute inside the main container via ``docker exec`` while file transfer edits the mounted host workspace. Grading runs the Harbor ``tests/test.sh`` inside the same - main container and reads ``/logs/verifier/reward.json`` or ``reward.txt``. + main container, bounded by the task's ``[verifier] timeout_sec``, and reads + ``/logs/verifier/reward.json`` or ``reward.txt``. """ def __init__( @@ -49,7 +53,6 @@ def __init__( self._task_dirs = {task_dir.name: task_dir for task_dir in _task_dirs(self.root)} if not self._task_dirs: raise ValueError(f"no Harbor tasks found in {path}") - self._image_cache: dict[Path, str] = {} @contextlib.asynccontextmanager async def __call__(self, task: Task) -> AsyncIterator[Runtime]: @@ -77,53 +80,26 @@ async def __call__(self, task: Task) -> AsyncIterator[Runtime]: compose_file = _compose_file(env_dir) if compose_file is not None: - async with self._compose_container( - task, - compose_file, - workspace, - tests_dir, - logs, - preserved_paths, - ) as ( - container, - provider, - ): - env = self._environment_for(task, task_dir, workspace, logs, container) - async with _local(env) as runtime: - yield Runtime( - runtime.url, - params={ - **runtime.params, - "provider": provider, - "container": container, - "ready_timeout": self.ready_timeout, - }, - config=runtime.config, - ) + acquire = self._compose_container( + task, compose_file, workspace, tests_dir, logs, preserved_paths + ) else: - async with self._single_container( - task, - task_dir, - workspace, - tests_dir, - logs, - preserved_paths, - ) as ( - container, - provider, - ): - env = 
self._environment_for(task, task_dir, workspace, logs, container) - async with _local(env) as runtime: - yield Runtime( - runtime.url, - params={ - **runtime.params, - "provider": provider, - "container": container, - "ready_timeout": self.ready_timeout, - }, - config=runtime.config, - ) + acquire = self._single_container( + task, task_dir, workspace, tests_dir, logs, preserved_paths + ) + async with acquire as (container, provider): + env = self._environment_for(task, task_dir, workspace, logs, container) + async with _local(env) as runtime: + yield Runtime( + runtime.url, + params={ + **runtime.params, + "provider": provider, + "container": container, + "ready_timeout": self.ready_timeout, + }, + config=runtime.config, + ) @contextlib.asynccontextmanager async def _single_container( @@ -137,8 +113,8 @@ async def _single_container( ) -> AsyncIterator[tuple[str, str]]: from hud.eval.runtime import _docker - image = await self._image_for(task_dir) env_dir = task_dir / "environment" + image = await self._build_image(env_dir) await _restore_image_generated_files(image, workspace) container_name = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}" preserved_volume_args = [arg for path in preserved_paths for arg in ("--volume", path)] @@ -165,10 +141,10 @@ async def _single_container( try: yield container, "harbor" finally: - await _release_mount_permissions(container) + with contextlib.suppress(Exception): + await _release_mount_permissions(container) await _docker("rm", "--force", "--volumes", container, check=False) await _docker("image", "rm", image, check=False) - self._image_cache.pop(env_dir, None) @contextlib.asynccontextmanager async def _compose_container( @@ -195,15 +171,23 @@ async def _compose_container( newline="\n", ) compose_args = ("compose", "-f", str(compose_file), "-f", str(overlay), "-p", project) - await _docker(*compose_args, "up", "--detach", "--build") - out, _ = await _docker(*compose_args, "ps", "-q", "main") - container = out.strip() - 
async def _build_image(self, env_dir: Path) -> str:
    """Build ``env_dir`` with ``docker build`` and return the tag that was used.

    The tag combines the directory hash with a random suffix, so two builds
    of the same directory get different tags.
    """
    from hud.eval.runtime import _docker

    digest = _hash_directory(env_dir)
    nonce = uuid.uuid4().hex[:8]
    image_tag = f"hud-harbor:{digest}-{nonce}"
    await _docker("build", "--tag", image_tag, str(env_dir))
    return image_tag
answer, verifier_timeout=verifier_timeout) return env - async def _grade(self, container: str, logs: Path, answer: Any) -> dict[str, Any]: - from hud.eval.runtime import _docker - + async def _grade( + self, container: str, logs: Path, answer: Any, *, verifier_timeout: float + ) -> dict[str, Any]: answer_file = logs / "agent_answer.txt" answer_file.parent.mkdir(parents=True, exist_ok=True) answer_file.write_text("" if answer is None else str(answer), encoding="utf-8") - out, err = await _docker( + proc = await asyncio.create_subprocess_exec( + "docker", "exec", "--workdir", "/app", container, "bash", "/tests/test.sh", - check=False, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, ) + try: + out_bytes, err_bytes = await asyncio.wait_for( + proc.communicate(), timeout=verifier_timeout + ) + except TimeoutError: + proc.kill() + await proc.wait() + return { + "score": 0.0, + "isError": True, + "content": f"Harbor verifier timed out after {verifier_timeout:.0f}s", + "info": {"verifier_timeout_sec": verifier_timeout}, + } + out = out_bytes.decode("utf-8", "replace") + err = err_bytes.decode("utf-8", "replace") reward, info = _read_harbor_reward(logs / "verifier") info.update( { @@ -293,8 +290,6 @@ def __init__(self, *args: Any, container: str, **kwargs: Any) -> None: self._container = container async def _handle_process(self, process: asyncssh.SSHServerProcess[bytes]) -> None: - import asyncio - command = process.command or "bash -l" proc = await asyncio.create_subprocess_exec( "docker", @@ -329,6 +324,22 @@ async def _handle_process(self, process: asyncssh.SSHServerProcess[bytes]) -> No process.exit(proc.returncode if proc.returncode is not None else 0) +_DEFAULT_VERIFIER_TIMEOUT = 600.0 + + +def _verifier_timeout(task_dir: Path) -> float: + """The task's ``[verifier] timeout_sec``, or the Harbor default.""" + try: + config: dict[str, Any] = tomllib.loads((task_dir / "task.toml").read_text("utf-8")) + except (OSError, tomllib.TOMLDecodeError): + 
return _DEFAULT_VERIFIER_TIMEOUT + verifier = config.get("verifier") + timeout = verifier.get("timeout_sec") if isinstance(verifier, dict) else None + if isinstance(timeout, int | float) and not isinstance(timeout, bool) and timeout > 0: + return float(timeout) + return _DEFAULT_VERIFIER_TIMEOUT + + def _read_harbor_reward(verifier_logs: Path) -> tuple[float | None, dict[str, Any]]: reward_json = verifier_logs / "reward.json" if reward_json.is_file(): @@ -340,12 +351,6 @@ def _read_harbor_reward(verifier_logs: Path) -> tuple[float | None, dict[str, An value = data.get(key) if isinstance(value, int | float): return float(value), {"reward_file": str(reward_json), "reward_json": data} - numeric = [float(value) for value in data.values() if isinstance(value, int | float)] - if numeric: - return sum(numeric) / len(numeric), { - "reward_file": str(reward_json), - "reward_json": data, - } return None, {"reward_file": str(reward_json), "reward_parse_error": "no numeric reward"} reward_txt = verifier_logs / "reward.txt" diff --git a/integrations/tests/test_harbor.py b/integrations/tests/test_harbor.py index c67b7d924..623f8560b 100644 --- a/integrations/tests/test_harbor.py +++ b/integrations/tests/test_harbor.py @@ -2,11 +2,14 @@ from __future__ import annotations +import asyncio +import json import textwrap from typing import TYPE_CHECKING import pytest +from hud.eval import Task from integrations.harbor import HarborRuntime, detect, export, load from integrations.harbor_runtime import ( _compose_file, @@ -16,6 +19,8 @@ _ensure_start_script, _host_path_for_app_file, _preserved_image_paths, + _read_harbor_reward, + _verifier_timeout, ) from .conftest import make_harbor_task @@ -89,6 +94,96 @@ def test_harbor_runtime_accepts_dataset_dirs(single_task: Path) -> None: assert single_task.name in runtime._task_dirs +async def test_harbor_runtime_builds_unique_images_per_acquisition( + single_task: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + calls: 
list[tuple[tuple[str, ...], bool]] = [] + + async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]: + calls.append((args, check)) + return "", "" + + monkeypatch.setattr("hud.eval.runtime._docker", fake_docker) + runtime = HarborRuntime(single_task.parent) + + first = await runtime._build_image(single_task / "environment") + second = await runtime._build_image(single_task / "environment") + + assert first != second + assert first.startswith("hud-harbor:") + assert second.startswith("hud-harbor:") + assert [args[2] for args, _ in calls] == [first, second] + + +async def test_compose_container_cleans_up_after_failed_up( + single_task: Path, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + compose = single_task / "environment" / "docker-compose.yaml" + compose.write_text("services:\n main:\n build: .\n", encoding="utf-8") + calls: list[tuple[tuple[str, ...], bool]] = [] + + async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]: + calls.append((args, check)) + if args[-3:] == ("up", "--detach", "--build"): + raise RuntimeError("compose failed") + return "", "" + + monkeypatch.setattr("hud.eval.runtime._docker", fake_docker) + runtime = HarborRuntime(single_task.parent) + + with pytest.raises(RuntimeError, match="compose failed"): + async with runtime._compose_container( + Task(env="bench", id=single_task.name), + compose, + tmp_path / "workspace", + single_task / "tests", + tmp_path / "logs", + [], + ): + raise AssertionError("compose acquisition should not yield") + + assert any( + args[-5:] == ("down", "--volumes", "--remove-orphans", "--rmi", "local") and check is False + for args, check in calls + ) + + +async def test_compose_container_cleans_up_when_main_service_is_missing( + single_task: Path, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + compose = single_task / "environment" / "docker-compose.yaml" + compose.write_text("services:\n api:\n build: .\n", encoding="utf-8") + calls: 
list[tuple[tuple[str, ...], bool]] = [] + + async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]: + calls.append((args, check)) + return "", "" + + monkeypatch.setattr("hud.eval.runtime._docker", fake_docker) + runtime = HarborRuntime(single_task.parent) + + with pytest.raises(RuntimeError, match="did not create a main service"): + async with runtime._compose_container( + Task(env="bench", id=single_task.name), + compose, + tmp_path / "workspace", + single_task / "tests", + tmp_path / "logs", + [], + ): + raise AssertionError("compose acquisition should not yield") + + assert any( + args[-5:] == ("down", "--volumes", "--remove-orphans", "--rmi", "local") and check is False + for args, check in calls + ) + + def test_compose_file_detection_prefers_harbor_names(tmp_path: Path) -> None: env = tmp_path / "environment" env.mkdir() @@ -247,6 +342,152 @@ def test_dockerfile_declared_generated_app_files_ignores_non_app_or_non_db_paths assert _host_path_for_app_file(workspace, "/tmp/app.db") is None +async def test_compose_container_restores_image_generated_db_files( + single_task: Path, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + compose = single_task / "environment" / "docker-compose.yaml" + compose.write_text("services:\n main:\n build: .\n", encoding="utf-8") + workspace = tmp_path / "workspace" + workspace.mkdir() + (workspace / "Dockerfile").write_text( + "FROM python:3.11-slim\nENV DB_PATH=/app/data/app.db\n", encoding="utf-8" + ) + calls: list[tuple[str, ...]] = [] + + async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]: + calls.append(args) + if args[0] == "compose" and args[-3:] == ("ps", "-q", "main"): + return "maincontainer\n", "" + if args[0] == "inspect": + return "sha256:mainimage\n", "" + if args[0] == "create": + return "tempcontainer\n", "" + return "", "" + + monkeypatch.setattr("hud.eval.runtime._docker", fake_docker) + runtime = HarborRuntime(single_task.parent) + + async with 
runtime._compose_container( + Task(env="bench", id=single_task.name), + compose, + workspace, + single_task / "tests", + tmp_path / "logs", + [], + ): + pass + + assert ("inspect", "--format", "{{.Image}}", "maincontainer") in calls + assert ( + "cp", + "tempcontainer:/app/data/app.db", + str(workspace / "data" / "app.db"), + ) in calls + + +def test_read_harbor_reward_prefers_reward_and_score_keys(tmp_path: Path) -> None: + verifier = tmp_path / "verifier" + verifier.mkdir() + (verifier / "reward.json").write_text(json.dumps({"reward": 0.5, "total": 5}), "utf-8") + + reward, info = _read_harbor_reward(verifier) + + assert reward == 0.5 + assert info["reward_json"] == {"reward": 0.5, "total": 5} + + +def test_read_harbor_reward_rejects_dict_without_reward_or_score(tmp_path: Path) -> None: + verifier = tmp_path / "verifier" + verifier.mkdir() + (verifier / "reward.json").write_text(json.dumps({"passed": 3, "total": 5}), "utf-8") + + reward, info = _read_harbor_reward(verifier) + + assert reward is None + assert info["reward_parse_error"] == "no numeric reward" + + +def test_verifier_timeout_reads_task_toml(single_task: Path) -> None: + assert _verifier_timeout(single_task) == 120.0 + + +def test_verifier_timeout_defaults_when_missing_or_invalid(tmp_path: Path) -> None: + no_verifier = tmp_path / "no-verifier" + no_verifier.mkdir() + (no_verifier / "task.toml").write_text('[metadata]\ncategory = "systems"\n', "utf-8") + broken = tmp_path / "broken" + broken.mkdir() + (broken / "task.toml").write_text("not toml [", "utf-8") + + assert _verifier_timeout(no_verifier) == 600.0 + assert _verifier_timeout(broken) == 600.0 + assert _verifier_timeout(tmp_path / "missing") == 600.0 + + +async def test_grade_reads_reward_after_verifier_completes( + single_task: Path, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + logs = tmp_path / "logs" + (logs / "verifier").mkdir(parents=True) + (logs / "verifier" / "reward.txt").write_text("1.0\n", "utf-8") + + class 
FakeProc: + returncode = 0 + + async def communicate(self) -> tuple[bytes, bytes]: + return b"verifier out", b"" + + async def fake_exec(*args: str, **kwargs: object) -> FakeProc: + assert args[:2] == ("docker", "exec") + return FakeProc() + + monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_exec) + runtime = HarborRuntime(single_task.parent) + + result = await runtime._grade("container", logs, "done", verifier_timeout=120.0) + + assert result["score"] == 1.0 + assert result["info"]["stdout"] == "verifier out" + assert (logs / "agent_answer.txt").read_text("utf-8") == "done" + + +async def test_grade_times_out_when_verifier_hangs( + single_task: Path, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + class FakeProc: + killed = False + + async def communicate(self) -> tuple[bytes, bytes]: + await asyncio.sleep(3600) + raise AssertionError("unreachable") + + def kill(self) -> None: + self.killed = True + + async def wait(self) -> int: + return -9 + + proc = FakeProc() + + async def fake_exec(*args: str, **kwargs: object) -> FakeProc: + return proc + + monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_exec) + runtime = HarborRuntime(single_task.parent) + + result = await runtime._grade("container", tmp_path / "logs", None, verifier_timeout=0.05) + + assert result["isError"] is True + assert "timed out" in result["content"] + assert proc.killed + + # ─── export: HUD tasks -> Harbor task folders ─────────────────────────── _ENV_PY = """\ From 455f23c96e21c5039cae063819da4415ac97af9b Mon Sep 17 00:00:00 2001 From: Nancy Date: Fri, 3 Jul 2026 11:15:37 +0800 Subject: [PATCH 3/3] Generalize Harbor runtime via image-workdir materialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the Dockerfile-parsing fidelity heuristics (start-script recreation, mkdir-dir restoration, node_modules/vendor/dist submounts, seeded-SQLite restoration, and the hardcoded /app workdir) with a single 
mechanism: after building the task image, copy its actual working directory onto the host workspace and bind-mount that back over the same guest path. The workspace is then the image's real workdir — source plus every build-generated file, with original mode bits — so nothing the build produced is shadowed by the editable mount, and the guest path is derived from the image's WORKDIR instead of assumed to be /app. This removes ~330 lines of corpus-tuned parsing and makes the runner faithful to any Harbor/terminal-bench image shape rather than the NOVATStyle export specifically. Validated on real Docker across all six artifact classes the heuristics used to cover (compose+postgres, gunicorn start script, node_modules, mkdir dirs, node dist, seeded sqlite): all return reward 0.0 with is_error false and clean teardown. Co-Authored-By: Claude Fable 5 --- integrations/harbor_runtime.py | 430 +++++++----------------------- integrations/tests/test_harbor.py | 200 +++----------- 2 files changed, 129 insertions(+), 501 deletions(-) diff --git a/integrations/harbor_runtime.py b/integrations/harbor_runtime.py index 5d644525a..50277726c 100644 --- a/integrations/harbor_runtime.py +++ b/integrations/harbor_runtime.py @@ -5,8 +5,7 @@ import asyncio import contextlib import json -import shlex -import shutil +import os import tempfile import tomllib import uuid @@ -30,16 +29,19 @@ class HarborRuntime: """Run Harbor task directories through HUD's local rollout engine. - The provider builds the Harbor task's ``environment/`` Docker context, runs - a fresh container with a writable host workspace mounted at ``/app``, and - serves a small HUD control channel from the host process. If the task ships a - ``docker-compose.yaml``/``.yml``, the provider starts it with an overlay that - keeps the ``main`` service idle while preserving sidecars such as databases. 
- The agent receives normal HUD SSH/SFTP access; shell commands execute inside - the main container via ``docker exec`` while file transfer edits the mounted - host workspace. Grading runs the Harbor ``tests/test.sh`` inside the same - main container, bounded by the task's ``[verifier] timeout_sec``, and reads - ``/logs/verifier/reward.json`` or ``reward.txt``. + The provider builds the Harbor task's ``environment/`` Docker context, then + materializes the built image's working directory onto a writable host + workspace and bind-mounts it back over the same guest path. Because the + workspace is the image's actual working directory (source *plus* every file + the build generated — start scripts, installed dependencies, compiled output, + seeded databases — with their original mode bits), the agent sees exactly + what the image would run, and edits made over SFTP are visible to the running + process. If the task ships a ``docker-compose.yaml``/``.yml``, the provider + starts it with an overlay that keeps the ``main`` service idle while + preserving sidecars such as databases. Shell commands execute inside the main + container via ``docker exec``. Grading runs the Harbor ``tests/test.sh`` + inside the same main container, bounded by the task's ``[verifier] + timeout_sec``, and reads ``/logs/verifier/reward.json`` or ``reward.txt``. 
""" def __init__( @@ -56,7 +58,7 @@ def __init__( @contextlib.asynccontextmanager async def __call__(self, task: Task) -> AsyncIterator[Runtime]: - from hud.eval.runtime import Runtime, _local + from hud.eval.runtime import Runtime, _docker, _local task_dir = self._task_dirs.get(task.id) if task_dir is None: @@ -72,23 +74,23 @@ async def __call__(self, task: Task) -> AsyncIterator[Runtime]: tmp_path = Path(tmp) workspace = tmp_path / "workspace" logs = tmp_path / "logs" - shutil.copytree(env_dir, workspace) - _ensure_start_script(workspace) - _ensure_dockerfile_created_dirs(workspace) - preserved_paths = _preserved_image_paths(workspace) + workspace.mkdir() logs.mkdir(parents=True, exist_ok=True) + image = await self._build_image(env_dir) + workdir = await _image_workdir(image) + await _materialize_workspace(image, workspace, workdir) + compose_file = _compose_file(env_dir) if compose_file is not None: + await _docker("image", "rm", image, check=False) acquire = self._compose_container( - task, compose_file, workspace, tests_dir, logs, preserved_paths + task, compose_file, workspace, workdir, tests_dir, logs ) else: - acquire = self._single_container( - task, task_dir, workspace, tests_dir, logs, preserved_paths - ) + acquire = self._single_container(task, image, workspace, workdir, tests_dir, logs) async with acquire as (container, provider): - env = self._environment_for(task, task_dir, workspace, logs, container) + env = self._environment_for(task, task_dir, workspace, workdir, logs, container) async with _local(env) as runtime: yield Runtime( runtime.url, @@ -105,35 +107,30 @@ async def __call__(self, task: Task) -> AsyncIterator[Runtime]: async def _single_container( self, task: Task, - task_dir: Path, + image: str, workspace: Path, + workdir: str, tests_dir: Path, logs: Path, - preserved_paths: list[str], ) -> AsyncIterator[tuple[str, str]]: from hud.eval.runtime import _docker - env_dir = task_dir / "environment" - image = await self._build_image(env_dir) - 
await _restore_image_generated_files(image, workspace) container_name = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}" - preserved_volume_args = [arg for path in preserved_paths for arg in ("--volume", path)] out, _ = await _docker( "run", "--detach", "--name", container_name, "--workdir", - "/app", + workdir, "--entrypoint", "sleep", "--volume", - f"{workspace}:/app", + f"{workspace}:{workdir}", "--volume", f"{tests_dir}:/tests:ro", "--volume", f"{logs}:/logs", - *preserved_volume_args, image, "infinity", ) @@ -142,7 +139,7 @@ async def _single_container( yield container, "harbor" finally: with contextlib.suppress(Exception): - await _release_mount_permissions(container) + await _release_mount_permissions(container, workdir) await _docker("rm", "--force", "--volumes", container, check=False) await _docker("image", "rm", image, check=False) @@ -152,21 +149,16 @@ async def _compose_container( task: Task, compose_file: Path, workspace: Path, + workdir: str, tests_dir: Path, logs: Path, - preserved_paths: list[str], ) -> AsyncIterator[tuple[str, str]]: from hud.eval.runtime import _docker project = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}" overlay = workspace.parent / "compose.hud.yaml" overlay.write_text( - _compose_overlay( - workspace=workspace, - tests_dir=tests_dir, - logs=logs, - preserved_paths=preserved_paths, - ), + _compose_overlay(workspace=workspace, workdir=workdir, tests_dir=tests_dir, logs=logs), encoding="utf-8", newline="\n", ) @@ -180,14 +172,11 @@ async def _compose_container( raise RuntimeError( f"docker compose project {project} did not create a main service" ) - if _dockerfile_declared_generated_app_files(workspace): - image_out, _ = await _docker("inspect", "--format", "{{.Image}}", container) - await _restore_image_generated_files(image_out.strip(), workspace) yield container, "harbor-compose" finally: if container: with contextlib.suppress(Exception): - await _release_mount_permissions(container) + await 
_release_mount_permissions(container, workdir) await _docker( *compose_args, "down", @@ -210,11 +199,12 @@ def _environment_for( task: Task, task_dir: Path, workspace: Path, + workdir: str, logs: Path, container: str, ) -> Environment: env = Environment(task.env) - workspace_daemon = _DockerWorkspace(workspace, container=container, guest_path="/app") + workspace_daemon = _DockerWorkspace(workspace, container=container, guest_path=workdir) verifier_timeout = _verifier_timeout(task_dir) @env.initialize @@ -229,12 +219,14 @@ async def _down() -> None: @env.template(id=task.id, description=f"Harbor task {task.id}") async def _run_harbor_task() -> AsyncGenerator[Any, Any]: answer = yield (task_dir / "instruction.md").read_text(encoding="utf-8") - yield await self._grade(container, logs, answer, verifier_timeout=verifier_timeout) + yield await self._grade( + container, workdir, logs, answer, verifier_timeout=verifier_timeout + ) return env async def _grade( - self, container: str, logs: Path, answer: Any, *, verifier_timeout: float + self, container: str, workdir: str, logs: Path, answer: Any, *, verifier_timeout: float ) -> dict[str, Any]: answer_file = logs / "agent_answer.txt" answer_file.parent.mkdir(parents=True, exist_ok=True) @@ -243,7 +235,7 @@ async def _grade( "docker", "exec", "--workdir", - "/app", + workdir, container, "bash", "/tests/test.sh", @@ -340,6 +332,48 @@ def _verifier_timeout(task_dir: Path) -> float: return _DEFAULT_VERIFIER_TIMEOUT +async def _image_workdir(image: str) -> str: + """The image's configured ``WORKDIR``, or ``/app`` when it declares none.""" + from hud.eval.runtime import _docker + + out, _ = await _docker("image", "inspect", "--format", "{{.Config.WorkingDir}}", image) + return out.strip() or "/app" + + +async def _materialize_workspace(image: str, workspace: Path, workdir: str) -> None: + """Copy the built image's ``workdir`` onto the host workspace, then own it. 
+ + The ``workdir`` bind mount would otherwise shadow everything the Docker build + generated there (start scripts, installed dependencies, compiled output, + seeded databases). Copying the image's actual ``workdir`` out first makes the + mounted workspace a faithful, editable copy of what the image runs. Files + arrive owned by the container's build user; hand them to the host user so the + agent can edit them over SFTP and teardown can remove them. + """ + from hud.eval.runtime import _docker + + out, _ = await _docker("create", image, "true") + container = out.strip() + try: + await _docker("cp", f"{container}:{workdir}/.", str(workspace)) + finally: + await _docker("rm", "--force", "--volumes", container, check=False) + + if hasattr(os, "getuid"): + await _docker( + "run", + "--rm", + "--volume", + f"{workspace}:{workdir}", + image, + "chown", + "-R", + f"{os.getuid()}:{os.getgid()}", + workdir, + check=False, + ) + + def _read_harbor_reward(verifier_logs: Path) -> tuple[float | None, dict[str, Any]]: reward_json = verifier_logs / "reward.json" if reward_json.is_file(): @@ -364,7 +398,7 @@ def _read_harbor_reward(verifier_logs: Path) -> tuple[float | None, dict[str, An return None, {} -async def _release_mount_permissions(container: str) -> None: +async def _release_mount_permissions(container: str, workdir: str) -> None: """Let the host user delete files that container-root created in mounts.""" from hud.eval.runtime import _docker @@ -373,7 +407,7 @@ async def _release_mount_permissions(container: str) -> None: container, "sh", "-lc", - "chmod -R a+rwX /app /logs 2>/dev/null || true", + f"chmod -R a+rwX {workdir} /logs 2>/dev/null || true", check=False, ) @@ -386,299 +420,25 @@ def _compose_file(env_dir: Path) -> Path | None: return None -def _compose_overlay( - *, - workspace: Path, - tests_dir: Path, - logs: Path, - preserved_paths: list[str] | None = None, -) -> str: - """Compose override that keeps Harbor's main service idle for agent work.""" - 
preserved_paths = preserved_paths or [] - volume_lines = [ - f" - {json.dumps(f'{workspace}:/app')}", - f" - {json.dumps(f'{tests_dir}:/tests:ro')}", - f" - {json.dumps(f'{logs}:/logs')}", - ] - volume_lines.extend(f" - {json.dumps(path)}" for path in preserved_paths) +def _compose_overlay(*, workspace: Path, workdir: str, tests_dir: Path, logs: Path) -> str: + """Compose override that keeps Harbor's main service idle for agent work. + + Only ``main`` is touched: it is parked on ``sleep`` with the materialized + workspace mounted over its working directory, and the Harbor ``/tests`` and + ``/logs`` paths bound in. Every other service (databases, caches) is + inherited from the task's own compose file unchanged. + """ return "\n".join( [ "services:", " main:", - " build:", - f" context: {json.dumps(str(workspace))}", - " working_dir: /app", + f" working_dir: {json.dumps(workdir)}", ' entrypoint: ["sleep"]', ' command: ["infinity"]', " volumes:", - *volume_lines, + f" - {json.dumps(f'{workspace}:{workdir}')}", + f" - {json.dumps(f'{tests_dir}:/tests:ro')}", + f" - {json.dumps(f'{logs}:/logs')}", "", ], ) - - -def _preserved_image_paths(workspace: Path) -> list[str]: - """Image-populated subpaths that should survive the editable ``/app`` mount.""" - paths: list[str] = [] - if (workspace / "package.json").is_file(): - paths.append("/app/node_modules") - if _node_build_output_is_image_populated(workspace, "dist"): - paths.append("/app/dist") - if (workspace / "composer.json").is_file(): - paths.append("/app/vendor") - return paths - - -def _node_build_output_is_image_populated(workspace: Path, dirname: str) -> bool: - if (workspace / dirname).exists(): - return False - dockerfile = workspace / "Dockerfile" - if not dockerfile.is_file(): - return False - dockerfile_text = dockerfile.read_text(encoding="utf-8") - entrypoint = workspace / "docker-entrypoint.sh" - entrypoint_text = entrypoint.read_text(encoding="utf-8") if entrypoint.is_file() else "" - return ( - "npm run 
build" in dockerfile_text - or f"/app/{dirname}" in dockerfile_text - or f" {dirname}/" in entrypoint_text - or f" {dirname}" in entrypoint_text - ) - - -def _ensure_start_script(workspace: Path) -> None: - """Preserve build-generated /app/start_app.sh hidden by the workspace mount.""" - start = workspace / "start_app.sh" - entrypoint = workspace / "docker-entrypoint.sh" - if not entrypoint.is_file(): - _restore_dockerfile_script(workspace, entrypoint, "/app/docker-entrypoint.sh") - if entrypoint.is_file(): - entrypoint.chmod(entrypoint.stat().st_mode | 0o111) - if start.exists(): - start.chmod(start.stat().st_mode | 0o111) - return - text = _script_from_dockerfile(workspace, "/app/start_app.sh") - if text is None and entrypoint.is_file(): - text = "#!/usr/bin/env bash\nset -e\ncd /app\nexec sh /app/docker-entrypoint.sh\n" - if text is None: - return - start.write_text(text, encoding="utf-8", newline="\n") - start.chmod(0o755) - - -def _ensure_dockerfile_created_dirs(workspace: Path) -> None: - """Recreate simple Dockerfile-created ``/app`` dirs hidden by the bind mount.""" - for path in _dockerfile_created_app_dirs(workspace): - path.mkdir(parents=True, exist_ok=True) - - -async def _restore_image_generated_files(image: str, workspace: Path) -> None: - """Copy selected build-generated files from the image into the workspace. - - Some Harbor images initialize file-backed databases during ``docker build``. - The editable ``/app`` bind mount hides those generated files, so copy them - out of the built image before starting the task container. 
- """ - container_paths = _dockerfile_declared_generated_app_files(workspace) - if not container_paths: - return - - from hud.eval.runtime import _docker - - out, _ = await _docker("create", image, "true") - container = out.strip() - try: - for container_path in container_paths: - host_path = _host_path_for_app_file(workspace, container_path) - if host_path is None or host_path.exists(): - continue - host_path.parent.mkdir(parents=True, exist_ok=True) - await _docker("cp", f"{container}:{container_path}", str(host_path), check=False) - finally: - await _docker("rm", "--force", "--volumes", container, check=False) - - -def _dockerfile_declared_generated_app_files(workspace: Path) -> list[str]: - """Find Dockerfile-declared file-backed DB paths under ``/app``.""" - dockerfile = workspace / "Dockerfile" - if not dockerfile.is_file(): - return [] - - paths: list[str] = [] - for instruction in _dockerfile_logical_lines(dockerfile.read_text(encoding="utf-8")): - stripped = instruction.strip() - if not stripped.startswith("ENV "): - continue - for key, value in _env_pairs(stripped.removeprefix("ENV ").strip()): - if not _is_generated_db_env_key(key): - continue - if _is_app_database_path(value): - paths.append(value) - return list(dict.fromkeys(paths)) - - -def _env_pairs(body: str) -> list[tuple[str, str]]: - try: - tokens = shlex.split(body) - except ValueError: - return [] - if not tokens: - return [] - - pairs: list[tuple[str, str]] = [] - if all("=" in token for token in tokens): - for token in tokens: - key, value = token.split("=", 1) - pairs.append((key, value)) - return pairs - - if len(tokens) >= 2: - pairs.append((tokens[0], tokens[1])) - return pairs - - -def _is_generated_db_env_key(key: str) -> bool: - normalized = key.upper() - return normalized in { - "DB_PATH", - "DATABASE_PATH", - "SQLITE_PATH", - "SQLITE_DB_PATH", - "SQLITE_DATABASE_PATH", - } or normalized.endswith(("_DB_PATH", "_DATABASE_PATH", "_SQLITE_PATH")) - - -def _is_app_database_path(path: 
str) -> bool: - lowered = path.lower() - return lowered.startswith("/app/") and lowered.endswith((".db", ".sqlite", ".sqlite3")) - - -def _host_path_for_app_file(workspace: Path, container_path: str) -> Path | None: - if not container_path.startswith("/app/"): - return None - rel = container_path.removeprefix("/app/") - if rel.startswith("../") or "/../" in rel or rel == "..": - return None - return workspace / rel - - -def _dockerfile_created_app_dirs(workspace: Path) -> list[Path]: - dockerfile = workspace / "Dockerfile" - if not dockerfile.is_file(): - return [] - paths: list[Path] = [] - for instruction in _dockerfile_logical_lines(dockerfile.read_text(encoding="utf-8")): - stripped = instruction.strip() - if not stripped.startswith("RUN "): - continue - command = stripped.removeprefix("RUN ").strip() - try: - tokens = shlex.split(command) - except ValueError: - continue - index = 0 - while index < len(tokens): - if tokens[index] != "mkdir": - index += 1 - continue - index += 1 - while index < len(tokens): - token = tokens[index] - if token in {"&&", "||", ";"}: - break - if token.startswith("-"): - index += 1 - continue - host_path = _app_dir_from_mkdir_token(workspace, token) - if host_path is not None: - paths.append(host_path) - index += 1 - return paths - - -def _app_dir_from_mkdir_token(workspace: Path, token: str) -> Path | None: - if not token or any(char in token for char in "$*?["): - return None - raw = token.rstrip("/") - if raw in {"", "."}: - return None - if raw.startswith("/app/"): - rel = raw.removeprefix("/app/") - elif raw == "/app": - return workspace - elif raw.startswith("/"): - return None - else: - rel = raw - if rel.startswith("../") or "/../" in rel or rel == "..": - return None - return workspace / rel - - -def _restore_dockerfile_script(workspace: Path, host_path: Path, container_path: str) -> None: - """Restore a Dockerfile-generated script hidden by a bind mount.""" - text = _script_from_dockerfile(workspace, container_path) - if 
text is None: - return - host_path.write_text(text, encoding="utf-8", newline="\n") - host_path.chmod(0o755) - - -def _script_from_dockerfile(workspace: Path, container_path: str) -> str | None: - """Extract a Dockerfile-generated script from a simple ``RUN printf`` command.""" - dockerfile = workspace / "Dockerfile" - if not dockerfile.is_file(): - return None - for instruction in _dockerfile_logical_lines(dockerfile.read_text(encoding="utf-8")): - stripped = instruction.strip() - if not stripped.startswith("RUN ") or container_path not in stripped: - continue - command = stripped.removeprefix("RUN ").strip() - try: - tokens = shlex.split(command) - except ValueError: - continue - redirect = _redirect_index(tokens, container_path) - if redirect is None or redirect < 2 or tokens[0] != "printf": - continue - text = _script_from_printf_args(tokens[1:redirect]) - if text is not None: - return text - return None - - -def _redirect_index(tokens: list[str], target: str) -> int | None: - for index, token in enumerate(tokens): - if token in {">", ">>"} and index + 1 < len(tokens) and tokens[index + 1] == target: - return index - if token in {f">{target}", f">>{target}"}: - return index - return None - - -def _script_from_printf_args(args: list[str]) -> str | None: - if not args: - return None - if args[0] in {"%s\\n", "%s\n"}: - if len(args) < 2: - return None - return "\n".join(args[1:]) + "\n" - if len(args) == 1: - return args[0].replace("\\r", "\r").replace("\\n", "\n").replace("\\t", "\t") - return None - - -def _dockerfile_logical_lines(text: str) -> list[str]: - """Join backslash-continued Dockerfile lines for simple instruction parsing.""" - lines: list[str] = [] - current = "" - for raw_line in text.splitlines(): - line = raw_line.rstrip() - if line.endswith("\\"): - current += line[:-1] + " " - continue - lines.append(current + line) - current = "" - if current: - lines.append(current) - return lines diff --git a/integrations/tests/test_harbor.py 
b/integrations/tests/test_harbor.py index 623f8560b..c695bee3c 100644 --- a/integrations/tests/test_harbor.py +++ b/integrations/tests/test_harbor.py @@ -4,6 +4,7 @@ import asyncio import json +import os import textwrap from typing import TYPE_CHECKING @@ -14,11 +15,8 @@ from integrations.harbor_runtime import ( _compose_file, _compose_overlay, - _dockerfile_declared_generated_app_files, - _ensure_dockerfile_created_dirs, - _ensure_start_script, - _host_path_for_app_file, - _preserved_image_paths, + _image_workdir, + _materialize_workspace, _read_harbor_reward, _verifier_timeout, ) @@ -139,9 +137,9 @@ async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]: Task(env="bench", id=single_task.name), compose, tmp_path / "workspace", + "/app", single_task / "tests", tmp_path / "logs", - [], ): raise AssertionError("compose acquisition should not yield") @@ -172,9 +170,9 @@ async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]: Task(env="bench", id=single_task.name), compose, tmp_path / "workspace", + "/app", single_task / "tests", tmp_path / "logs", - [], ): raise AssertionError("compose acquisition should not yield") @@ -193,198 +191,66 @@ def test_compose_file_detection_prefers_harbor_names(tmp_path: Path) -> None: assert _compose_file(env) == compose -def test_compose_overlay_mounts_main_workspace_tests_and_logs(tmp_path: Path) -> None: +def test_compose_overlay_parks_main_and_mounts_workspace_tests_and_logs(tmp_path: Path) -> None: overlay = _compose_overlay( workspace=tmp_path / "workspace", + workdir="/srv/app", tests_dir=tmp_path / "tests", logs=tmp_path / "logs", - preserved_paths=[], ) assert "main:" in overlay assert 'entrypoint: ["sleep"]' in overlay - assert f"{tmp_path / 'workspace'}:/app" in overlay + assert 'working_dir: "/srv/app"' in overlay + assert f"{tmp_path / 'workspace'}:/srv/app" in overlay assert f"{tmp_path / 'tests'}:/tests:ro" in overlay assert f"{tmp_path / 'logs'}:/logs" in overlay -def 
test_compose_overlay_preserves_image_dependency_subpaths(tmp_path: Path) -> None: - overlay = _compose_overlay( - workspace=tmp_path / "workspace", - tests_dir=tmp_path / "tests", - logs=tmp_path / "logs", - preserved_paths=["/app/node_modules"], - ) - - assert ' - "/app/node_modules"' in overlay - - -def test_preserved_image_paths_detects_node_and_php_dependency_dirs(tmp_path: Path) -> None: - (tmp_path / "package.json").write_text("{}", encoding="utf-8") - (tmp_path / "composer.json").write_text("{}", encoding="utf-8") - - assert _preserved_image_paths(tmp_path) == ["/app/node_modules", "/app/vendor"] - - -def test_preserved_image_paths_detects_node_build_output(tmp_path: Path) -> None: - (tmp_path / "package.json").write_text("{}", encoding="utf-8") - (tmp_path / "Dockerfile").write_text( - "FROM node:20-slim\nRUN npm ci\nRUN npm run build\n", - encoding="utf-8", - ) - - assert _preserved_image_paths(tmp_path) == ["/app/node_modules", "/app/dist"] - - -def test_ensure_start_script_recreates_build_generated_entrypoint(tmp_path: Path) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - (workspace / "docker-entrypoint.sh").write_text("echo start\n", encoding="utf-8") - - _ensure_start_script(workspace) - - start = workspace / "start_app.sh" - assert start.exists() - text = start.read_text(encoding="utf-8") - assert "exec sh /app/docker-entrypoint.sh" in text - - -def test_ensure_start_script_preserves_dockerfile_generated_command(tmp_path: Path) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - (workspace / "docker-entrypoint.sh").write_text('exec "$@"\n', encoding="utf-8") - (workspace / "Dockerfile").write_text( - "FROM python:3.11-slim\n" - "RUN printf '%s\\n' '#!/usr/bin/env bash' 'set -e' 'cd /app' " - "'exec /app/docker-entrypoint.sh gunicorn --bind 0.0.0.0:8000 src.main:app' " - "> /app/start_app.sh && chmod +x /app/start_app.sh\n", - encoding="utf-8", - ) - - _ensure_start_script(workspace) - - text = (workspace / 
"start_app.sh").read_text(encoding="utf-8") - assert "exec /app/docker-entrypoint.sh gunicorn --bind 0.0.0.0:8000 src.main:app" in text - assert (workspace / "docker-entrypoint.sh").stat().st_mode & 0o111 - - -def test_ensure_start_script_restores_generated_entrypoint(tmp_path: Path) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - (workspace / "Dockerfile").write_text( - "FROM python:3.11-slim\n" - "RUN printf '#!/bin/sh\\npython -m src.seed --init\\n" - "exec uvicorn src.main:app --host 0.0.0.0 --port 8000\\n' " - "> /app/docker-entrypoint.sh && chmod +x /app/docker-entrypoint.sh\n" - "RUN printf '%s\\n' '#!/usr/bin/env bash' 'set -e' 'cd /app' " - "'exec /app/docker-entrypoint.sh' > /app/start_app.sh && chmod +x /app/start_app.sh\n", - encoding="utf-8", - ) - - _ensure_start_script(workspace) - - entrypoint = workspace / "docker-entrypoint.sh" - assert entrypoint.exists() - assert entrypoint.stat().st_mode & 0o111 - assert "python -m src.seed --init" in entrypoint.read_text(encoding="utf-8") - assert "exec /app/docker-entrypoint.sh" in (workspace / "start_app.sh").read_text( - encoding="utf-8", - ) - - -def test_ensure_dockerfile_created_dirs_restores_app_dirs(tmp_path: Path) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - (workspace / "Dockerfile").write_text( - "FROM node:20-slim\n" - "RUN mkdir -p static/uploads /app/tmp/cache && mkdir -p /var/lib/ignored\n", - encoding="utf-8", - ) - - _ensure_dockerfile_created_dirs(workspace) - - assert (workspace / "static" / "uploads").is_dir() - assert (workspace / "tmp" / "cache").is_dir() - assert not (workspace / "var" / "lib" / "ignored").exists() +async def test_image_workdir_reads_config_working_dir(monkeypatch: pytest.MonkeyPatch) -> None: + async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]: + assert args == ("image", "inspect", "--format", "{{.Config.WorkingDir}}", "img") + return "/srv/app\n", "" + monkeypatch.setattr("hud.eval.runtime._docker", 
fake_docker) -def test_dockerfile_declared_generated_app_files_detects_seeded_sqlite_db(tmp_path: Path) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - (workspace / "Dockerfile").write_text( - "FROM python:3.11-slim\n" - "ENV DB_PATH=/app/data/salon_workforce.db\n" - "RUN python -m src.seed --init\n", - encoding="utf-8", - ) + assert await _image_workdir("img") == "/srv/app" - assert _dockerfile_declared_generated_app_files(workspace) == [ - "/app/data/salon_workforce.db", - ] - assert _host_path_for_app_file(workspace, "/app/data/salon_workforce.db") == ( - workspace / "data" / "salon_workforce.db" - ) +async def test_image_workdir_defaults_to_app_when_unset(monkeypatch: pytest.MonkeyPatch) -> None: + async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]: + return "\n", "" -def test_dockerfile_declared_generated_app_files_ignores_non_app_or_non_db_paths( - tmp_path: Path, -) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - (workspace / "Dockerfile").write_text( - "FROM python:3.11-slim\n" - "ENV DB_PATH=/var/lib/app.db CACHE_PATH=/app/cache\n" - "ENV SOME_DATABASE_PATH=/app/data/app.txt\n", - encoding="utf-8", - ) + monkeypatch.setattr("hud.eval.runtime._docker", fake_docker) - assert _dockerfile_declared_generated_app_files(workspace) == [] - assert _host_path_for_app_file(workspace, "/tmp/app.db") is None + assert await _image_workdir("img") == "/app" -async def test_compose_container_restores_image_generated_db_files( - single_task: Path, +async def test_materialize_workspace_copies_image_workdir_and_owns_it( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ) -> None: - compose = single_task / "environment" / "docker-compose.yaml" - compose.write_text("services:\n main:\n build: .\n", encoding="utf-8") workspace = tmp_path / "workspace" workspace.mkdir() - (workspace / "Dockerfile").write_text( - "FROM python:3.11-slim\nENV DB_PATH=/app/data/app.db\n", encoding="utf-8" - ) calls: list[tuple[str, ...]] = 
[] async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]: calls.append(args) - if args[0] == "compose" and args[-3:] == ("ps", "-q", "main"): - return "maincontainer\n", "" - if args[0] == "inspect": - return "sha256:mainimage\n", "" if args[0] == "create": - return "tempcontainer\n", "" + return "tempcid\n", "" return "", "" monkeypatch.setattr("hud.eval.runtime._docker", fake_docker) - runtime = HarborRuntime(single_task.parent) - async with runtime._compose_container( - Task(env="bench", id=single_task.name), - compose, - workspace, - single_task / "tests", - tmp_path / "logs", - [], - ): - pass + await _materialize_workspace("img", workspace, "/app") - assert ("inspect", "--format", "{{.Image}}", "maincontainer") in calls - assert ( - "cp", - "tempcontainer:/app/data/app.db", - str(workspace / "data" / "app.db"), - ) in calls + # Contents of the image's workdir are copied out into the host workspace. + assert ("cp", "tempcid:/app/.", str(workspace)) in calls + # The throwaway container is removed. + assert any(a[0] == "rm" for a in calls) + # On POSIX hosts, ownership is handed to the host user via a chown pass. 
+ if hasattr(os, "getuid"): + assert any(a[0] == "run" and "chown" in a and a[-1] == "/app" for a in calls) def test_read_harbor_reward_prefers_reward_and_score_keys(tmp_path: Path) -> None: @@ -448,7 +314,7 @@ async def fake_exec(*args: str, **kwargs: object) -> FakeProc: monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_exec) runtime = HarborRuntime(single_task.parent) - result = await runtime._grade("container", logs, "done", verifier_timeout=120.0) + result = await runtime._grade("container", "/app", logs, "done", verifier_timeout=120.0) assert result["score"] == 1.0 assert result["info"]["stdout"] == "verifier out" @@ -481,7 +347,9 @@ async def fake_exec(*args: str, **kwargs: object) -> FakeProc: monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_exec) runtime = HarborRuntime(single_task.parent) - result = await runtime._grade("container", tmp_path / "logs", None, verifier_timeout=0.05) + result = await runtime._grade( + "container", "/app", tmp_path / "logs", None, verifier_timeout=0.05 + ) assert result["isError"] is True assert "timed out" in result["content"]