From 2b5aefc4b72bc615e7f6616b16e05ab89a3212a2 Mon Sep 17 00:00:00 2001
From: Nancy <najilau@ucsc.edu>
Date: Sun, 28 Jun 2026 19:15:45 -0700
Subject: [PATCH 1/3] Add Harbor runtime support and split the code to make it
 cleaner

---
 docs/v6/advanced/harbor-convert.mdx |  23 +-
 docs/v6/reference/cli.mdx           |   3 +-
 hud/cli/eval.py                     |  44 +-
 hud/cli/tests/test_eval_config.py   |  41 ++
 integrations/__init__.py            |   5 +-
 integrations/harbor.py              |  85 +---
 integrations/harbor_common.py       |  70 +++
 integrations/harbor_runtime.py      | 679 ++++++++++++++++++++++++++++
 integrations/tests/test_harbor.py   | 175 ++++++-
 pyproject.toml                      |   4 +-
 10 files changed, 1040 insertions(+), 89 deletions(-)
 create mode 100644 integrations/harbor_common.py
 create mode 100644 integrations/harbor_runtime.py

diff --git a/docs/v6/advanced/harbor-convert.mdx b/docs/v6/advanced/harbor-convert.mdx
index 5ddb9f4f9..1680fab16 100644
--- a/docs/v6/advanced/harbor-convert.mdx
+++ b/docs/v6/advanced/harbor-convert.mdx
@@ -26,22 +26,29 @@ directly - one row per task dir (`id` = the dir name), sharing one declarative
 ```python
 from integrations.harbor import detect, load
 
-assert detect("./terminal-bench")
-taskset = load("./terminal-bench")
+assert detect("./harbor_tasks")
+taskset = load("./harbor_tasks")
 
 for task in taskset:
     print(task.env, task.id)
 ```
 
-Like every task row, the result carries no placement. Run it by supplying one -
-today that means a substrate already serving the control channel
-(`runtime=Runtime(url)`); a docker provider that builds and runs each task's
-`environment/` image is the planned follow-up:
+Like every task row, the result carries no placement. Run it by supplying one.
+For local Docker-backed Harbor execution, use `HarborRuntime`; it builds the
+task's `environment/` image, runs a fresh container, exposes the workspace
+through HUD's normal shell capability, and grades by running `tests/test.sh`:
 
 ```python
-from hud import Runtime
+from integrations.harbor import HarborRuntime
 
-job = await taskset.run(agent, runtime=Runtime("tcp://127.0.0.1:8765"))
+job = await taskset.run(agent, runtime=HarborRuntime("./harbor_tasks"))
+```
+
+The eval CLI also detects local Harbor task directories and datasets when using
+local runtime placement:
+
+```bash
+hud eval ./harbor_tasks claude --task-ids cancel-async-tasks --max-steps 30
 ```
 
 ## Export HUD tasks to Harbor
diff --git a/docs/v6/reference/cli.mdx b/docs/v6/reference/cli.mdx
index 6a5f51bbd..852963982 100644
--- a/docs/v6/reference/cli.mdx
+++ b/docs/v6/reference/cli.mdx
@@ -133,7 +133,8 @@ hud sync env                   # sync environment metadata
 ```
 
 External benchmark formats (currently Harbor) load directly into the runtime
-as `Taskset`s - no conversion step. See [Harbor interop](/v6/advanced/harbor-convert).
+as `Taskset`s - no conversion step. Local Harbor directories run with the Harbor
+Docker-backed runtime provider. See [Harbor interop](/v6/advanced/harbor-convert).
 
 ## Inspect
 
diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 39afd6edf..9978eb402 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -728,7 +728,32 @@ def _spawn_target(source: Path) -> Path:
     return resolved.parent
 
 
-def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any:
+def _is_harbor_source(source_path: Path | None) -> bool:
+    if source_path is None or not source_path.exists():
+        return False
+    if not source_path.is_dir():
+        return False
+    from integrations.harbor import detect
+
+    return detect(source_path)
+
+
+def _load_local_taskset(source_path: Path) -> tuple[Any, str]:
+    from hud.eval import Taskset
+
+    if _is_harbor_source(source_path):
+        from integrations.harbor import load
+
+        return load(source_path), "harbor"
+    return Taskset.from_file(source_path), "hud"
+
+
+def _resolve_placement(
+    cfg: EvalConfig,
+    source_path: Path | None,
+    *,
+    source_kind: str = "hud",
+) -> Any:
     """Map the config's ``runtime`` onto a placement for ``Taskset.run``.
 
     "local" spawns each row's env from the source next to the tasks file;
@@ -744,6 +769,10 @@ def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any:
     if cfg.runtime == "local":
         if source_path is None:
             raise ValueError("local placement requires a local source path")
+        if source_kind == "harbor":
+            from integrations.harbor import HarborRuntime
+
+            return HarborRuntime(source_path)
         return LocalRuntime(_spawn_target(source_path))
     if cfg.runtime == "hud":
         require_api_key("run HUD runtime tunnel evals")
@@ -767,18 +796,19 @@ async def _run_evaluation(cfg: EvalConfig) -> Any:
     if cfg.source is None or cfg.agent_type is None:
         raise ValueError("source and agent_type must be set")
 
-    from hud.eval import Taskset
-
     source_path = Path(cfg.source)
     is_local = source_path.exists()
+    source_kind = "api"
     if is_local:
         hud_console.info(f"Loading tasks from: {cfg.source}")
         try:
-            taskset = Taskset.from_file(source_path)
+            taskset, source_kind = _load_local_taskset(source_path)
         except Exception as e:
             hud_console.error(f"Failed to load tasks from {cfg.source}: {e}")
             raise typer.Exit(1) from e
     else:
+        from hud.eval import Taskset
+
         hud_console.info(f"Loading platform taskset: {cfg.source}")
         try:
             taskset = Taskset.from_api(cfg.source)
@@ -832,7 +862,11 @@ async def _run_evaluation(cfg: EvalConfig) -> Any:
         )
 
     agent = _build_agent(cfg)
-    placement = _resolve_placement(cfg, source_path if is_local else None)
+    placement = _resolve_placement(
+        cfg,
+        source_path if is_local else None,
+        source_kind=source_kind,
+    )
 
     job = await taskset.run(
         agent,
diff --git a/hud/cli/tests/test_eval_config.py b/hud/cli/tests/test_eval_config.py
index 6b94f0b23..e1183860c 100644
--- a/hud/cli/tests/test_eval_config.py
+++ b/hud/cli/tests/test_eval_config.py
@@ -20,6 +20,23 @@
 _ARN = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/anthropic.claude"
 
 
+def _write_harbor_task(root: Path, name: str = "demo-task") -> Path:
+    task = root / name
+    (task / "environment").mkdir(parents=True)
+    (task / "tests").mkdir()
+    (task / "instruction.md").write_text("Fix the demo task.\n", encoding="utf-8")
+    (task / "task.toml").write_text(
+        'schema_version = "1.3"\n\n[task]\nname = "demo/demo-task"\n',
+        encoding="utf-8",
+    )
+    (task / "environment" / "Dockerfile").write_text("FROM python:3.12-slim\n", encoding="utf-8")
+    (task / "tests" / "test.sh").write_text(
+        "#!/usr/bin/env bash\nmkdir -p /logs/verifier\necho 1 > /logs/verifier/reward.txt\n",
+        encoding="utf-8",
+    )
+    return task
+
+
 def test_is_bedrock_arn() -> None:
     assert _is_bedrock_arn(_ARN) is True
     assert _is_bedrock_arn("claude-sonnet-4-6") is False
@@ -136,6 +153,30 @@ def test_resolve_placement_runtime_hud_uses_tunnel(
     assert isinstance(placement, HUDRuntime)
 
 
+def test_load_local_taskset_uses_harbor_loader_for_harbor_dirs(tmp_path: Path) -> None:
+    _write_harbor_task(tmp_path)
+
+    taskset, source_kind = eval_mod._load_local_taskset(tmp_path)
+
+    assert source_kind == "harbor"
+    assert len(taskset) == 1
+    assert taskset["demo-task"].id == "demo-task"
+
+
+def test_resolve_placement_local_harbor_uses_harbor_runtime(tmp_path: Path) -> None:
+    from integrations.harbor import HarborRuntime
+
+    _write_harbor_task(tmp_path)
+
+    placement = eval_mod._resolve_placement(
+        EvalConfig(runtime="local"),
+        tmp_path,
+        source_kind="harbor",
+    )
+
+    assert isinstance(placement, HarborRuntime)
+
+
 def test_resolve_placement_remote_uses_hosted_runtime(
     tmp_path: Path,
     monkeypatch: pytest.MonkeyPatch,
diff --git a/integrations/__init__.py b/integrations/__init__.py
index c8549e0fe..e4a817b4e 100644
--- a/integrations/__init__.py
+++ b/integrations/__init__.py
@@ -8,8 +8,9 @@
 This package lives outside ``hud`` on purpose: each module is a recipe built
 **only on the public SDK surface** (``Environment``, ``Task``,
 ``Taskset``, ``Runtime``) — that constraint is the proof the core is
-flexible. Copy a module into your project or run it from a checkout; nothing
-in the SDK or CLI imports it.
+flexible. Copy a module into your project or run it from a checkout. The CLI may
+call selected integrations explicitly for polished interop paths, but the
+integration contract itself stays independent of private SDK hooks.
 
 The contract: an integration module exposes ``detect(path) -> bool`` and
 ``load(path) -> Taskset``. Placement stays an execution-time concern — loaders
diff --git a/integrations/harbor.py b/integrations/harbor.py
index 497711e37..903426251 100644
--- a/integrations/harbor.py
+++ b/integrations/harbor.py
@@ -11,11 +11,9 @@
 
 :func:`load` parses a task dir (or a dataset of them) into rows sharing one
 env name per distinct ``environment/`` build context — no codegen, no
-roundtrip. Like every row, the result is runnable
-once a placement is supplied (``runtime=Runtime(url)`` against a served substrate
-today). Providers receive the row being placed, so a docker provider that
-builds and runs each row's ``environment/`` image is the named follow-up —
-expressible without engine changes.
+roundtrip. Like every row, the result is runnable once a placement is supplied.
+Use :class:`HarborRuntime` for local Docker-backed execution of Harbor tasks, or
+``runtime=Runtime(url)`` to attach to a substrate served elsewhere.
 
 :func:`export` is the reverse direction: turn a HUD task source into
 self-contained Harbor task folders (``task.toml`` + ``instruction.md`` +
@@ -40,19 +38,23 @@
 
 from __future__ import annotations
 
-import hashlib
 import json
 import logging
-import re
 import shutil
-import tomllib
-from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 from hud.environment import Environment
 from hud.environment.server import TaskRunner
 from hud.eval import Task, Taskset
+from integrations.harbor_common import (
+    _HarborTask,
+    _is_harbor_task,
+    _parse_task,
+    _slugify,
+    _task_dirs,
+)
+from integrations.harbor_runtime import HarborRuntime
 
 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -74,18 +76,12 @@
     "__pycache__", "*.pyc", ".git", ".venv", "venv", "*.egg-info", ".pytest_cache"
 )
 
-
 # ─── load: Harbor dirs -> Taskset ──────────────────────────────────────
 
 
 def detect(path: str | Path) -> bool:
     """True when *path* is a Harbor task dir or a dataset of them."""
-    root = Path(path)
-    if _is_harbor_task(root):
-        return True
-    if root.is_dir():
-        return any(_is_harbor_task(d) for d in root.iterdir() if d.is_dir())
-    return False
+    return bool(_task_dirs(path))
 
 
 def load(path: str | Path) -> Taskset:
@@ -96,12 +92,8 @@ def load(path: str | Path) -> Taskset:
     context (content-hashed), derived from the dataset name.
     """
     root = Path(path).resolve()
-    if _is_harbor_task(root):
-        task_dirs = [root]
-        dataset_name = root.parent.name
-    else:
-        task_dirs = sorted(d for d in root.iterdir() if d.is_dir() and _is_harbor_task(d))
-        dataset_name = root.name
+    task_dirs = _task_dirs(root)
+    dataset_name = root.parent.name if _is_harbor_task(root) else root.name
     if not task_dirs:
         raise ValueError(f"no Harbor tasks found in {path}")
 
@@ -126,54 +118,6 @@ def load(path: str | Path) -> Taskset:
     return Taskset(base_name, tasks)
 
 
-def _slugify(name: str) -> str:
-    """A valid env name (lowercase ``[a-z0-9-]``) from a dataset dir name."""
-    normalized = re.sub(r"[^a-z0-9-]", "", name.strip().lower().replace(" ", "-").replace("_", "-"))
-    return re.sub(r"-+", "-", normalized).strip("-") or "harbor"
-
-
-def _is_harbor_task(path: Path) -> bool:
-    return path.is_dir() and (path / "task.toml").exists() and (path / "instruction.md").exists()
-
-
-def _hash_directory(path: Path) -> str:
-    """Content-hash a directory for grouping tasks by identical environments."""
-    hasher = hashlib.sha256()
-    if not path.exists():
-        return "empty"
-    for file_path in sorted(path.rglob("*")):
-        if file_path.is_file():
-            hasher.update(str(file_path.relative_to(path)).encode())
-            hasher.update(file_path.read_bytes())
-    return hasher.hexdigest()[:16]
-
-
-@dataclass(frozen=True, slots=True)
-class _HarborTask:
-    """One parsed Harbor task dir."""
-
-    task_id: str
-    config: dict[str, Any]
-    env_hash: str
-
-
-def _parse_task(task_dir: Path) -> _HarborTask | None:
-    if not (task_dir / "instruction.md").is_file():
-        LOGGER.warning("failed to read instruction.md in %s", task_dir)
-        return None
-    try:
-        config: dict[str, Any] = tomllib.loads((task_dir / "task.toml").read_text("utf-8"))
-    except (OSError, tomllib.TOMLDecodeError):
-        LOGGER.warning("failed to parse task.toml in %s", task_dir)
-        config = {}
-    env_dir = task_dir / "environment"
-    return _HarborTask(
-        task_id=task_dir.name,
-        config=config,
-        env_hash=_hash_directory(env_dir) if env_dir.exists() else "no-env",
-    )
-
-
 # ─── export: HUD tasks -> Harbor task folders ───────────────────────────
 
 
@@ -443,6 +387,7 @@ async def export(
     "ALLOWED_PROTOCOLS",
     "CONTROL_PORT",
     "DEFAULT_ANSWER_FILE",
+    "HarborRuntime",
     "detect",
     "export",
     "load",
diff --git a/integrations/harbor_common.py b/integrations/harbor_common.py
new file mode 100644
index 000000000..53294e091
--- /dev/null
+++ b/integrations/harbor_common.py
@@ -0,0 +1,70 @@
+"""Shared helpers for Harbor task integration."""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import re
+import tomllib
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+LOGGER = logging.getLogger(__name__)
+
+
+def _slugify(name: str) -> str:
+    """A valid env name (lowercase ``[a-z0-9-]``) from a dataset dir name."""
+    normalized = re.sub(r"[^a-z0-9-]", "", name.strip().lower().replace(" ", "-").replace("_", "-"))
+    return re.sub(r"-+", "-", normalized).strip("-") or "harbor"
+
+
+def _is_harbor_task(path: Path) -> bool:
+    return path.is_dir() and (path / "task.toml").exists() and (path / "instruction.md").exists()
+
+
+def _task_dirs(path: str | Path) -> list[Path]:
+    """List Harbor task dirs at *path*: itself, or its sorted task subdirectories."""
+    root = Path(path)
+    if _is_harbor_task(root):
+        return [root]
+    # ``_is_harbor_task`` already rejects non-directories, so no extra filter is needed.
+    return sorted(filter(_is_harbor_task, root.iterdir())) if root.is_dir() else []
+
+
+def _hash_directory(path: Path) -> str:
+    """Content-hash a directory for grouping tasks by identical environments."""
+    hasher = hashlib.sha256()
+    if not path.exists():
+        return "empty"
+    for file_path in sorted(path.rglob("*")):
+        if file_path.is_file():
+            hasher.update(str(file_path.relative_to(path)).encode())
+            hasher.update(file_path.read_bytes())
+    return hasher.hexdigest()[:16]
+
+
+@dataclass(frozen=True, slots=True)
+class _HarborTask:
+    """One parsed Harbor task dir."""
+
+    task_id: str
+    config: dict[str, Any]
+    env_hash: str
+
+
+def _parse_task(task_dir: Path) -> _HarborTask | None:
+    if not (task_dir / "instruction.md").is_file():
+        LOGGER.warning("failed to read instruction.md in %s", task_dir)
+        return None
+    try:
+        config: dict[str, Any] = tomllib.loads((task_dir / "task.toml").read_text("utf-8"))
+    except (OSError, tomllib.TOMLDecodeError):
+        LOGGER.warning("failed to parse task.toml in %s", task_dir)
+        config = {}
+    env_dir = task_dir / "environment"
+    return _HarborTask(
+        task_id=task_dir.name,
+        config=config,
+        env_hash=_hash_directory(env_dir) if env_dir.exists() else "no-env",
+    )
diff --git a/integrations/harbor_runtime.py b/integrations/harbor_runtime.py
new file mode 100644
index 000000000..872399bc1
--- /dev/null
+++ b/integrations/harbor_runtime.py
@@ -0,0 +1,679 @@
+"""Local Docker-backed runtime for Harbor task directories."""
+
+from __future__ import annotations
+
+import contextlib
+import json
+import shlex
+import shutil
+import tempfile
+import uuid
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from hud.environment import Environment
+from hud.environment.workspace import Workspace
+from integrations.harbor_common import _hash_directory, _slugify, _task_dirs
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator, AsyncIterator
+
+    import asyncssh
+
+    from hud.eval import Task
+    from hud.eval.runtime import Runtime
+
+
+class HarborRuntime:
+    """Run Harbor task directories through HUD's local rollout engine.
+
+    The provider builds the Harbor task's ``environment/`` Docker context, runs
+    a fresh container with a writable host workspace mounted at ``/app``, and
+    serves a small HUD control channel from the host process. If the task ships a
+    Compose file (``docker-compose`` or ``compose``, ``.yaml``/``.yml``), it is started with an
+    overlay that keeps the ``main`` service idle while preserving sidecars such as databases.
+    The agent receives normal HUD SSH/SFTP access; shell commands execute inside
+    the main container via ``docker exec`` while file transfer edits the mounted
+    host workspace. Grading runs the Harbor ``tests/test.sh`` inside the same
+    main container and reads ``/logs/verifier/reward.json`` or ``reward.txt``.
+    """
+
+    def __init__(
+        self,
+        path: str | Path,
+        *,
+        ready_timeout: float = 120.0,
+    ) -> None:
+        self.root = Path(path).resolve()
+        self.ready_timeout = ready_timeout
+        self._task_dirs = {task_dir.name: task_dir for task_dir in _task_dirs(self.root)}
+        if not self._task_dirs:
+            raise ValueError(f"no Harbor tasks found in {path}")
+        self._image_cache: dict[Path, str] = {}
+
+    @contextlib.asynccontextmanager
+    async def __call__(self, task: Task) -> AsyncIterator[Runtime]:
+        """Place *task*: start its container(s) and yield a ``Runtime`` for it.
+
+        Steps, all scoped to a throwaway temp dir that is removed on exit:
+
+        1. Copy ``environment/`` into an editable workspace (mounted at ``/app``)
+           and recreate files the bind mount would otherwise hide.
+        2. Start the task through Compose when ``environment/`` ships a compose
+           file, otherwise as a single container built from its Dockerfile.
+        3. Serve a HUD environment for that container through ``_local``.
+
+        Yields:
+            The runtime from ``_local`` with ``provider``, ``container`` and
+            ``ready_timeout`` added to its params.
+
+        Raises:
+            KeyError: ``task.id`` does not name a task directory under the root.
+            FileNotFoundError: the task lacks ``environment/Dockerfile`` or
+                ``tests/test.sh``.
+        """
+        from hud.eval.runtime import Runtime, _local
+
+        # Directories are keyed by name in ``__init__``, so a row is only placeable
+        # when its id equals the task directory's name.
+        task_dir = self._task_dirs.get(task.id)
+        if task_dir is None:
+            raise KeyError(f"HarborRuntime has no task directory for {task.id!r}")
+        env_dir = task_dir / "environment"
+        tests_dir = task_dir / "tests"
+        if not (env_dir / "Dockerfile").is_file():
+            raise FileNotFoundError(f"Harbor task {task.id!r} has no environment/Dockerfile")
+        if not (tests_dir / "test.sh").is_file():
+            raise FileNotFoundError(f"Harbor task {task.id!r} has no tests/test.sh")
+
+        # Everything the container may write (workspace, logs) lives in one temp
+        # dir, so host-side cleanup is a single directory removal.
+        with tempfile.TemporaryDirectory(prefix=f"hud-harbor-{_slugify(task.id)}-") as tmp:
+            tmp_path = Path(tmp)
+            workspace = tmp_path / "workspace"
+            logs = tmp_path / "logs"
+            # Work on a copy so agent edits made through the ``/app`` mount never
+            # reach the task's own ``environment/`` directory.
+            shutil.copytree(env_dir, workspace)
+            _ensure_start_script(workspace)
+            _ensure_dockerfile_created_dirs(workspace)
+            preserved_paths = _preserved_image_paths(workspace)
+            logs.mkdir(parents=True, exist_ok=True)
+
+            compose_file = _compose_file(env_dir)
+            # Both start-up paths yield ``(container, provider)``; only the way the
+            # container is created differs, so the serving code below is shared.
+            if compose_file is not None:
+                started = self._compose_container(
+                    task, compose_file, workspace, tests_dir, logs, preserved_paths
+                )
+            else:
+                started = self._single_container(
+                    task, task_dir, workspace, tests_dir, logs, preserved_paths
+                )
+            async with started as (container, provider):
+                env = self._environment_for(task, task_dir, workspace, logs, container)
+                async with _local(env) as runtime:
+                    yield Runtime(
+                        runtime.url,
+                        params={
+                            **runtime.params,
+                            "provider": provider,
+                            "container": container,
+                            "ready_timeout": self.ready_timeout,
+                        },
+                        config=runtime.config,
+                    )
+
+    @contextlib.asynccontextmanager
+    async def _single_container(
+        self,
+        task: Task,
+        task_dir: Path,
+        workspace: Path,
+        tests_dir: Path,
+        logs: Path,
+        preserved_paths: list[str],
+    ) -> AsyncIterator[tuple[str, str]]:
+        from hud.eval.runtime import _docker
+
+        image = await self._image_for(task_dir)
+        env_dir = task_dir / "environment"
+        await _restore_image_generated_files(image, workspace)
+        container_name = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}"
+        preserved_volume_args = [arg for path in preserved_paths for arg in ("--volume", path)]
+        out, _ = await _docker(
+            "run",
+            "--detach",
+            "--name",
+            container_name,
+            "--workdir",
+            "/app",
+            "--entrypoint",
+            "sleep",
+            "--volume",
+            f"{workspace}:/app",
+            "--volume",
+            f"{tests_dir}:/tests:ro",
+            "--volume",
+            f"{logs}:/logs",
+            *preserved_volume_args,
+            image,
+            "infinity",
+        )
+        container = out.strip()
+        try:
+            yield container, "harbor"
+        finally:
+            await _release_mount_permissions(container)
+            await _docker("rm", "--force", "--volumes", container, check=False)
+            await _docker("image", "rm", image, check=False)
+            self._image_cache.pop(env_dir, None)
+
+    @contextlib.asynccontextmanager
+    async def _compose_container(
+        self,
+        task: Task,
+        compose_file: Path,
+        workspace: Path,
+        tests_dir: Path,
+        logs: Path,
+        preserved_paths: list[str],
+    ) -> AsyncIterator[tuple[str, str]]:
+        """Start the task's compose project and yield its ``main`` container.
+
+        Writes the HUD overlay next to the workspace, brings the project up with
+        ``--build``, and yields ``(container_id, "harbor-compose")``.
+
+        Raises:
+            RuntimeError: ``docker compose ps -q main`` returned no container id.
+        """
+        from hud.eval.runtime import _docker
+
+        project = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}"
+        overlay = workspace.parent / "compose.hud.yaml"
+        overlay_text = _compose_overlay(
+            workspace=workspace, tests_dir=tests_dir, logs=logs, preserved_paths=preserved_paths
+        )
+        overlay.write_text(overlay_text, encoding="utf-8", newline="\n")
+        compose_args = ("compose", "-f", str(compose_file), "-f", str(overlay), "-p", project)
+        down_args = ("down", "--volumes", "--remove-orphans", "--rmi", "local")
+        container = ""
+        try:
+            # Inside the ``try`` so a failed or partial ``up`` is still torn down.
+            await _docker(*compose_args, "up", "--detach", "--build")
+            out, _ = await _docker(*compose_args, "ps", "-q", "main")
+            container = out.strip()
+            if not container:
+                raise RuntimeError(
+                    f"docker compose project {project} did not create a main service"
+                )
+            yield container, "harbor-compose"
+        finally:
+            if container:
+                await _release_mount_permissions(container)
+            await _docker(*compose_args, *down_args, check=False)
+
+    async def _image_for(self, task_dir: Path) -> str:
+        from hud.eval.runtime import _docker
+
+        env_dir = task_dir / "environment"
+        cached = self._image_cache.get(env_dir)
+        if cached is not None:
+            return cached
+        tag = f"hud-harbor:{_hash_directory(env_dir)}"
+        await _docker("build", "--tag", tag, str(env_dir))
+        self._image_cache[env_dir] = tag
+        return tag
+
+    def _environment_for(
+        self,
+        task: Task,
+        task_dir: Path,
+        workspace: Path,
+        logs: Path,
+        container: str,
+    ) -> Environment:
+        env = Environment(task.env)
+        workspace_daemon = _DockerWorkspace(workspace, container=container, guest_path="/app")
+
+        @env.initialize
+        async def _up() -> None:
+            await workspace_daemon.start()
+            env.add_capability(workspace_daemon.capability("shell"))
+
+        @env.shutdown
+        async def _down() -> None:
+            await workspace_daemon.stop()
+
+        @env.template(id=task.id, description=f"Harbor task {task.id}")
+        async def _run_harbor_task() -> AsyncGenerator[Any, Any]:
+            answer = yield (task_dir / "instruction.md").read_text(encoding="utf-8")
+            yield await self._grade(container, logs, answer)
+
+        return env
+
+    async def _grade(self, container: str, logs: Path, answer: Any) -> dict[str, Any]:
+        from hud.eval.runtime import _docker
+
+        answer_file = logs / "agent_answer.txt"
+        answer_file.parent.mkdir(parents=True, exist_ok=True)
+        answer_file.write_text("" if answer is None else str(answer), encoding="utf-8")
+        out, err = await _docker(
+            "exec",
+            "--workdir",
+            "/app",
+            container,
+            "bash",
+            "/tests/test.sh",
+            check=False,
+        )
+        reward, info = _read_harbor_reward(logs / "verifier")
+        info.update(
+            {
+                "stdout": out[-4000:],
+                "stderr": err[-4000:],
+            }
+        )
+        if reward is None:
+            return {
+                "score": 0.0,
+                "isError": True,
+                "content": "Harbor verifier did not write reward.json or reward.txt",
+                "info": info,
+            }
+        return {"score": reward, "info": info}
+
+
+class _DockerWorkspace(Workspace):
+    """Workspace SFTP over a host bind mount, shell commands via docker exec."""
+
+    def __init__(self, *args: Any, container: str, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        self._container = container
+
+    async def _handle_process(self, process: asyncssh.SSHServerProcess[bytes]) -> None:
+        """Run the SSH command in the container and relay its output once it exits."""
+        import asyncio
+
+        command = process.command or "bash -l"
+        docker_exec = ("docker", "exec", "-i", "--workdir", self._guest_path, self._container)
+        proc = await asyncio.create_subprocess_exec(
+            *docker_exec,
+            "bash",
+            "-lc",
+            command,
+            # The SSH channel's stdin is never forwarded, so give the command EOF
+            # instead of letting ``docker exec -i`` read the host process's stdin.
+            stdin=asyncio.subprocess.DEVNULL,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        try:
+            stdout_data, stderr_data = await asyncio.wait_for(proc.communicate(), timeout=3600.0)
+        except TimeoutError:
+            proc.kill()
+            await proc.wait()
+            process.stderr.write(b"workspace: command timed out after 3600s\n")
+            process.exit(1)
+            return
+        except asyncio.CancelledError:
+            proc.kill()
+            await proc.wait()
+            raise
+
+        if stdout_data:
+            process.stdout.write(stdout_data)
+        if stderr_data:
+            process.stderr.write(stderr_data)
+        process.exit(proc.returncode if proc.returncode is not None else 0)
+
+
+def _read_harbor_reward(verifier_logs: Path) -> tuple[float | None, dict[str, Any]]:
+    """Return ``(reward, info)`` from ``reward.json`` (preferred) or ``reward.txt``."""
+    reward_json = verifier_logs / "reward.json"
+    if reward_json.is_file():
+        info: dict[str, Any] = {"reward_file": str(reward_json)}
+        try:
+            data = json.loads(reward_json.read_text(encoding="utf-8"))
+        except ValueError as exc:  # malformed JSON must not abort grading
+            return None, {**info, "reward_parse_error": str(exc)}
+        if isinstance(data, int | float):
+            return float(data), info
+        if isinstance(data, dict):
+            for key in ("reward", "score"):  # well-known keys win over the average
+                if isinstance(data.get(key), int | float):
+                    return float(data[key]), {**info, "reward_json": data}
+            numeric = [float(value) for value in data.values() if isinstance(value, int | float)]
+            if numeric:
+                return sum(numeric) / len(numeric), {**info, "reward_json": data}
+        return None, {**info, "reward_parse_error": "no numeric reward"}
+
+    reward_txt = verifier_logs / "reward.txt"
+    if reward_txt.is_file():
+        text = reward_txt.read_text(encoding="utf-8").strip()
+        try:
+            return float(text), {"reward_file": str(reward_txt)}
+        except ValueError:
+            return None, {"reward_file": str(reward_txt), "reward_parse_error": text}
+    return None, {}  # neither reward file was written
+
+
+async def _release_mount_permissions(container: str) -> None:
+    """Let the host user delete files that container-root created in mounts."""
+    from hud.eval.runtime import _docker
+
+    await _docker(
+        "exec",
+        container,
+        "sh",
+        "-lc",
+        "chmod -R a+rwX /app /logs 2>/dev/null || true",
+        check=False,
+    )
+
+
+def _compose_file(env_dir: Path) -> Path | None:
+    """Return the first compose file found in *env_dir*, or ``None`` if there is none."""
+    # Checked in this order; the first existing file wins.
+    names = ("docker-compose.yaml", "docker-compose.yml", "compose.yaml", "compose.yml")
+    candidates = (env_dir / name for name in names)
+    return next((path for path in candidates if path.is_file()), None)
+
+
+def _compose_overlay(
+    *,
+    workspace: Path,
+    tests_dir: Path,
+    logs: Path,
+    preserved_paths: list[str] | None = None,
+) -> str:
+    """Compose override that keeps Harbor's main service idle for agent work."""
+    preserved_paths = preserved_paths or []
+    volume_lines = [
+        f"      - {json.dumps(f'{workspace}:/app')}",
+        f"      - {json.dumps(f'{tests_dir}:/tests:ro')}",
+        f"      - {json.dumps(f'{logs}:/logs')}",
+    ]
+    volume_lines.extend(f"      - {json.dumps(path)}" for path in preserved_paths)
+    return "\n".join(
+        [
+            "services:",
+            "  main:",
+            "    build:",
+            f"      context: {json.dumps(str(workspace))}",
+            "    working_dir: /app",
+            '    entrypoint: ["sleep"]',
+            '    command: ["infinity"]',
+            "    volumes:",
+            *volume_lines,
+            "",
+        ],
+    )
+
+
+def _preserved_image_paths(workspace: Path) -> list[str]:
+    """Image-populated subpaths that should survive the editable ``/app`` mount."""
+    paths: list[str] = []
+    if (workspace / "package.json").is_file():
+        paths.append("/app/node_modules")
+        if _node_build_output_is_image_populated(workspace, "dist"):
+            paths.append("/app/dist")
+    if (workspace / "composer.json").is_file():
+        paths.append("/app/vendor")
+    return paths
+
+
+def _node_build_output_is_image_populated(workspace: Path, dirname: str) -> bool:
+    """Guess whether the image build, not the task source, creates ``/app/<dirname>``."""
+    dockerfile = workspace / "Dockerfile"
+    if (workspace / dirname).exists() or not dockerfile.is_file():
+        return False
+    dockerfile_text = dockerfile.read_text(encoding="utf-8")
+    entrypoint = workspace / "docker-entrypoint.sh"
+    entrypoint_text = entrypoint.read_text(encoding="utf-8") if entrypoint.is_file() else ""
+    markers = (
+        "npm run build" in dockerfile_text,
+        f"/app/{dirname}" in dockerfile_text,
+        f" {dirname}/" in entrypoint_text,
+        f" {dirname}" in entrypoint_text,
+    )
+    return any(markers)
+
+
+def _ensure_start_script(workspace: Path) -> None:
+    """Preserve build-generated /app/start_app.sh hidden by the workspace mount."""
+    start = workspace / "start_app.sh"
+    entrypoint = workspace / "docker-entrypoint.sh"
+    if not entrypoint.is_file():
+        _restore_dockerfile_script(workspace, entrypoint, "/app/docker-entrypoint.sh")
+    if entrypoint.is_file():
+        entrypoint.chmod(entrypoint.stat().st_mode | 0o111)
+    if start.exists():
+        start.chmod(start.stat().st_mode | 0o111)
+        return
+    text = _script_from_dockerfile(workspace, "/app/start_app.sh")
+    if text is None and entrypoint.is_file():
+        text = "#!/usr/bin/env bash\nset -e\ncd /app\nexec sh /app/docker-entrypoint.sh\n"
+    if text is None:
+        return
+    start.write_text(text, encoding="utf-8", newline="\n")
+    start.chmod(0o755)
+
+
+def _ensure_dockerfile_created_dirs(workspace: Path) -> None:
+    """Recreate simple Dockerfile-created ``/app`` dirs hidden by the bind mount."""
+    for path in _dockerfile_created_app_dirs(workspace):
+        path.mkdir(parents=True, exist_ok=True)
+
+
+async def _restore_image_generated_files(image: str, workspace: Path) -> None:
+    """Copy selected build-generated files from the image into the workspace.
+
+    Some Harbor images initialize file-backed databases during ``docker build``.
+    The editable ``/app`` bind mount hides those generated files, so copy them
+    out of the built image before starting the task container.
+    """
+    container_paths = _dockerfile_declared_generated_app_files(workspace)
+    if not container_paths:
+        return
+
+    from hud.eval.runtime import _docker
+
+    out, _ = await _docker("create", image, "true")
+    container = out.strip()
+    try:
+        for container_path in container_paths:
+            host_path = _host_path_for_app_file(workspace, container_path)
+            if host_path is None or host_path.exists():
+                continue
+            host_path.parent.mkdir(parents=True, exist_ok=True)
+            await _docker("cp", f"{container}:{container_path}", str(host_path), check=False)
+    finally:
+        await _docker("rm", "--force", "--volumes", container, check=False)
+
+
+def _dockerfile_declared_generated_app_files(workspace: Path) -> list[str]:
+    """Find Dockerfile-declared file-backed DB paths under ``/app``."""
+    dockerfile = workspace / "Dockerfile"
+    if not dockerfile.is_file():
+        return []
+
+    paths: list[str] = []
+    for instruction in _dockerfile_logical_lines(dockerfile.read_text(encoding="utf-8")):
+        stripped = instruction.strip()
+        if not stripped.startswith("ENV "):
+            continue
+        for key, value in _env_pairs(stripped.removeprefix("ENV ").strip()):
+            if not _is_generated_db_env_key(key):
+                continue
+            if _is_app_database_path(value):
+                paths.append(value)
+    return list(dict.fromkeys(paths))
+
+
+def _env_pairs(body: str) -> list[tuple[str, str]]:
+    try:
+        tokens = shlex.split(body)
+    except ValueError:
+        return []
+    if not tokens:
+        return []
+
+    pairs: list[tuple[str, str]] = []
+    if all("=" in token for token in tokens):
+        for token in tokens:
+            key, value = token.split("=", 1)
+            pairs.append((key, value))
+        return pairs
+
+    if len(tokens) >= 2:
+        pairs.append((tokens[0], tokens[1]))
+    return pairs
+
+
+def _is_generated_db_env_key(key: str) -> bool:
+    normalized = key.upper()
+    return normalized in {
+        "DB_PATH",
+        "DATABASE_PATH",
+        "SQLITE_PATH",
+        "SQLITE_DB_PATH",
+        "SQLITE_DATABASE_PATH",
+    } or normalized.endswith(("_DB_PATH", "_DATABASE_PATH", "_SQLITE_PATH"))
+
+
+def _is_app_database_path(path: str) -> bool:
+    lowered = path.lower()
+    return lowered.startswith("/app/") and lowered.endswith((".db", ".sqlite", ".sqlite3"))
+
+
+def _host_path_for_app_file(workspace: Path, container_path: str) -> Path | None:
+    """Map ``/app/<rel>`` into ``workspace``; ``None`` if outside ``/app`` or it uses ``..``."""
+    if not container_path.startswith("/app/"):
+        return None
+    # Drop empty parts: ``/app//etc/x`` must not turn into the absolute host path ``/etc/x``.
+    parts = [part for part in container_path.removeprefix("/app/").split("/") if part]
+    return None if ".." in parts else workspace.joinpath(*parts)
+
+
+def _dockerfile_created_app_dirs(workspace: Path) -> list[Path]:
+    """Collect host paths for ``/app`` dirs made by simple ``RUN mkdir`` instructions.
+
+    Operands are mapped by ``_app_dir_from_mkdir_token``; unmappable ones are skipped.
+    """
+    dockerfile = workspace / "Dockerfile"
+    if not dockerfile.is_file():
+        return []
+    paths: list[Path] = []
+    for instruction in _dockerfile_logical_lines(dockerfile.read_text(encoding="utf-8")):
+        stripped = instruction.strip()
+        if not stripped.startswith("RUN "):
+            continue
+        # Punctuation-aware lexing splits glued operators (``mkdir /app/x; chown ...``).
+        lexer = shlex.shlex(stripped.removeprefix("RUN "), posix=True, punctuation_chars=True)
+        lexer.whitespace_split = True
+        lexer.commenters = ""  # Like ``shlex.split``: ``#`` is an ordinary character.
+        try:
+            tokens = list(lexer)
+        except ValueError:
+            continue
+        in_mkdir = False
+        for token in tokens:
+            if not in_mkdir:
+                in_mkdir = token == "mkdir"
+            elif token.startswith(("(", ")", ";", "<", ">", "|", "&")):
+                in_mkdir = False  # A shell operator ends this mkdir's operand list.
+            elif not token.startswith("-"):
+                host_path = _app_dir_from_mkdir_token(workspace, token)
+                if host_path is not None:
+                    paths.append(host_path)
+    return paths
+
+
+def _app_dir_from_mkdir_token(workspace: Path, token: str) -> Path | None:
+    """Map a ``mkdir`` operand to its host dir under ``workspace``, or ``None`` to skip it.
+
+    Relative operands are treated as relative to ``/app``.
+    """
+    if not token or any(char in token for char in "$*?["):
+        return None
+    raw = token.rstrip("/")
+    if raw in {"", "."}:
+        return None
+    if raw == "/app" or raw.startswith("/app/"):
+        raw = raw.removeprefix("/app")
+    elif raw.startswith("/"):
+        return None
+    # Drop empty parts: ``/app//tmp`` must not turn into the absolute host path ``/tmp``.
+    parts = [part for part in raw.split("/") if part]
+    return None if ".." in parts else workspace.joinpath(*parts)
+
+
+def _restore_dockerfile_script(workspace: Path, host_path: Path, container_path: str) -> None:
+    """Restore a Dockerfile-generated script hidden by a bind mount."""
+    text = _script_from_dockerfile(workspace, container_path)
+    if text is None:
+        return
+    host_path.write_text(text, encoding="utf-8", newline="\n")
+    host_path.chmod(0o755)
+
+
+def _script_from_dockerfile(workspace: Path, container_path: str) -> str | None:
+    """Extract a Dockerfile-generated script from a simple ``RUN printf`` command."""
+    dockerfile = workspace / "Dockerfile"
+    if not dockerfile.is_file():
+        return None
+    for instruction in _dockerfile_logical_lines(dockerfile.read_text(encoding="utf-8")):
+        stripped = instruction.strip()
+        if not stripped.startswith("RUN ") or container_path not in stripped:
+            continue
+        command = stripped.removeprefix("RUN ").strip()
+        try:
+            tokens = shlex.split(command)
+        except ValueError:
+            continue
+        redirect = _redirect_index(tokens, container_path)
+        if redirect is None or redirect < 2 or tokens[0] != "printf":
+            continue
+        text = _script_from_printf_args(tokens[1:redirect])
+        if text is not None:
+            return text
+    return None
+
+
+def _redirect_index(tokens: list[str], target: str) -> int | None:
+    for index, token in enumerate(tokens):
+        if token in {">", ">>"} and index + 1 < len(tokens) and tokens[index + 1] == target:
+            return index
+        if token in {f">{target}", f">>{target}"}:
+            return index
+    return None
+
+
+def _script_from_printf_args(args: list[str]) -> str | None:
+    if not args:
+        return None
+    if args[0] in {"%s\\n", "%s\n"}:
+        if len(args) < 2:
+            return None
+        return "\n".join(args[1:]) + "\n"
+    if len(args) == 1:
+        return args[0].replace("\\r", "\r").replace("\\n", "\n").replace("\\t", "\t")
+    return None
+
+
+def _dockerfile_logical_lines(text: str) -> list[str]:
+    """Join backslash-continued Dockerfile lines, dropping comment lines as Docker does."""
+    lines: list[str] = []
+    current = ""
+    for raw_line in text.splitlines():
+        line = raw_line.rstrip()
+        if line.lstrip().startswith("#"):
+            continue  # Docker drops comment lines, even in the middle of a continuation.
+        if line.endswith("\\"):
+            current += line[:-1] + " "
+            continue
+        lines.append(current + line)
+        current = ""
+    return [*lines, current] if current else lines
diff --git a/integrations/tests/test_harbor.py b/integrations/tests/test_harbor.py
index b7343b517..c67b7d924 100644
--- a/integrations/tests/test_harbor.py
+++ b/integrations/tests/test_harbor.py
@@ -7,7 +7,16 @@
 
 import pytest
 
-from integrations.harbor import detect, export, load
+from integrations.harbor import HarborRuntime, detect, export, load
+from integrations.harbor_runtime import (
+    _compose_file,
+    _compose_overlay,
+    _dockerfile_declared_generated_app_files,
+    _ensure_dockerfile_created_dirs,
+    _ensure_start_script,
+    _host_path_for_app_file,
+    _preserved_image_paths,
+)
 
 from .conftest import make_harbor_task
 
@@ -74,6 +83,170 @@ def test_load_skips_unparseable_toml_but_keeps_the_rest(tmp_path: Path) -> None:
     assert {task.id for task in taskset} == {"good", "broken"}
 
 
+def test_harbor_runtime_accepts_dataset_dirs(single_task: Path) -> None:
+    runtime = HarborRuntime(single_task.parent)
+
+    assert single_task.name in runtime._task_dirs
+
+
+def test_compose_file_detection_prefers_harbor_names(tmp_path: Path) -> None:
+    env = tmp_path / "environment"
+    env.mkdir()
+    compose = env / "docker-compose.yaml"
+    compose.write_text("services: {}\n", encoding="utf-8")
+
+    assert _compose_file(env) == compose
+
+
+def test_compose_overlay_mounts_main_workspace_tests_and_logs(tmp_path: Path) -> None:
+    overlay = _compose_overlay(
+        workspace=tmp_path / "workspace",
+        tests_dir=tmp_path / "tests",
+        logs=tmp_path / "logs",
+        preserved_paths=[],
+    )
+
+    assert "main:" in overlay
+    assert 'entrypoint: ["sleep"]' in overlay
+    assert f"{tmp_path / 'workspace'}:/app" in overlay
+    assert f"{tmp_path / 'tests'}:/tests:ro" in overlay
+    assert f"{tmp_path / 'logs'}:/logs" in overlay
+
+
+def test_compose_overlay_preserves_image_dependency_subpaths(tmp_path: Path) -> None:
+    overlay = _compose_overlay(
+        workspace=tmp_path / "workspace",
+        tests_dir=tmp_path / "tests",
+        logs=tmp_path / "logs",
+        preserved_paths=["/app/node_modules"],
+    )
+
+    assert '      - "/app/node_modules"' in overlay
+
+
+def test_preserved_image_paths_detects_node_and_php_dependency_dirs(tmp_path: Path) -> None:
+    (tmp_path / "package.json").write_text("{}", encoding="utf-8")
+    (tmp_path / "composer.json").write_text("{}", encoding="utf-8")
+
+    assert _preserved_image_paths(tmp_path) == ["/app/node_modules", "/app/vendor"]
+
+
+def test_preserved_image_paths_detects_node_build_output(tmp_path: Path) -> None:
+    (tmp_path / "package.json").write_text("{}", encoding="utf-8")
+    (tmp_path / "Dockerfile").write_text(
+        "FROM node:20-slim\nRUN npm ci\nRUN npm run build\n",
+        encoding="utf-8",
+    )
+
+    assert _preserved_image_paths(tmp_path) == ["/app/node_modules", "/app/dist"]
+
+
+def test_ensure_start_script_recreates_build_generated_entrypoint(tmp_path: Path) -> None:
+    workspace = tmp_path / "workspace"
+    workspace.mkdir()
+    (workspace / "docker-entrypoint.sh").write_text("echo start\n", encoding="utf-8")
+
+    _ensure_start_script(workspace)
+
+    start = workspace / "start_app.sh"
+    assert start.exists()
+    text = start.read_text(encoding="utf-8")
+    assert "exec sh /app/docker-entrypoint.sh" in text
+
+
+def test_ensure_start_script_preserves_dockerfile_generated_command(tmp_path: Path) -> None:
+    workspace = tmp_path / "workspace"
+    workspace.mkdir()
+    (workspace / "docker-entrypoint.sh").write_text('exec "$@"\n', encoding="utf-8")
+    (workspace / "Dockerfile").write_text(
+        "FROM python:3.11-slim\n"
+        "RUN printf '%s\\n' '#!/usr/bin/env bash' 'set -e' 'cd /app' "
+        "'exec /app/docker-entrypoint.sh gunicorn --bind 0.0.0.0:8000 src.main:app' "
+        "> /app/start_app.sh && chmod +x /app/start_app.sh\n",
+        encoding="utf-8",
+    )
+
+    _ensure_start_script(workspace)
+
+    text = (workspace / "start_app.sh").read_text(encoding="utf-8")
+    assert "exec /app/docker-entrypoint.sh gunicorn --bind 0.0.0.0:8000 src.main:app" in text
+    assert (workspace / "docker-entrypoint.sh").stat().st_mode & 0o111
+
+
+def test_ensure_start_script_restores_generated_entrypoint(tmp_path: Path) -> None:
+    workspace = tmp_path / "workspace"
+    workspace.mkdir()
+    (workspace / "Dockerfile").write_text(
+        "FROM python:3.11-slim\n"
+        "RUN printf '#!/bin/sh\\npython -m src.seed --init\\n"
+        "exec uvicorn src.main:app --host 0.0.0.0 --port 8000\\n' "
+        "> /app/docker-entrypoint.sh && chmod +x /app/docker-entrypoint.sh\n"
+        "RUN printf '%s\\n' '#!/usr/bin/env bash' 'set -e' 'cd /app' "
+        "'exec /app/docker-entrypoint.sh' > /app/start_app.sh && chmod +x /app/start_app.sh\n",
+        encoding="utf-8",
+    )
+
+    _ensure_start_script(workspace)
+
+    entrypoint = workspace / "docker-entrypoint.sh"
+    assert entrypoint.exists()
+    assert entrypoint.stat().st_mode & 0o111
+    assert "python -m src.seed --init" in entrypoint.read_text(encoding="utf-8")
+    assert "exec /app/docker-entrypoint.sh" in (workspace / "start_app.sh").read_text(
+        encoding="utf-8",
+    )
+
+
+def test_ensure_dockerfile_created_dirs_restores_app_dirs(tmp_path: Path) -> None:
+    workspace = tmp_path / "workspace"
+    workspace.mkdir()
+    (workspace / "Dockerfile").write_text(
+        "FROM node:20-slim\n"
+        "RUN mkdir -p static/uploads /app/tmp/cache && mkdir -p /var/lib/ignored\n",
+        encoding="utf-8",
+    )
+
+    _ensure_dockerfile_created_dirs(workspace)
+
+    assert (workspace / "static" / "uploads").is_dir()
+    assert (workspace / "tmp" / "cache").is_dir()
+    assert not (workspace / "var" / "lib" / "ignored").exists()
+
+
+def test_dockerfile_declared_generated_app_files_detects_seeded_sqlite_db(tmp_path: Path) -> None:
+    workspace = tmp_path / "workspace"
+    workspace.mkdir()
+    (workspace / "Dockerfile").write_text(
+        "FROM python:3.11-slim\n"
+        "ENV DB_PATH=/app/data/salon_workforce.db\n"
+        "RUN python -m src.seed --init\n",
+        encoding="utf-8",
+    )
+
+    assert _dockerfile_declared_generated_app_files(workspace) == [
+        "/app/data/salon_workforce.db",
+    ]
+    assert _host_path_for_app_file(workspace, "/app/data/salon_workforce.db") == (
+        workspace / "data" / "salon_workforce.db"
+    )
+
+
+def test_dockerfile_declared_generated_app_files_ignores_non_app_or_non_db_paths(
+    tmp_path: Path,
+) -> None:
+    workspace = tmp_path / "workspace"
+    workspace.mkdir()
+    (workspace / "Dockerfile").write_text(
+        "FROM python:3.11-slim\n"
+        "ENV DB_PATH=/var/lib/app.db CACHE_PATH=/app/cache\n"
+        "ENV SOME_DATABASE_PATH=/app/data/app.txt\n",
+        encoding="utf-8",
+    )
+
+    assert _dockerfile_declared_generated_app_files(workspace) == []
+    assert _host_path_for_app_file(workspace, "/tmp/app.db") is None
+
+
 # ─── export: HUD tasks -> Harbor task folders ───────────────────────────
 
 _ENV_PY = """\
diff --git a/pyproject.toml b/pyproject.toml
index 819732290..1bca6168e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,7 +58,6 @@ build-backend = "hatchling.build"
 exclude = [
     "docs/",
     "cookbooks/",
-    "integrations/",
     "**/checkpoints/",
     "**/*.safetensors",
     "**/*.ckpt",
@@ -85,6 +84,7 @@ allow-direct-references = true
 [tool.hatch.build.targets.sdist]
 include = [
     "hud/**",
+    "integrations/**",
     "README.md",
     "LICENSE",
     "pyproject.toml"
@@ -102,7 +102,7 @@ exclude = [
 ]
 
 [tool.hatch.build.targets.wheel]
-packages = ["hud"]
+packages = ["hud", "integrations"]
 
 # Ensure py.typed is included in the package
 [tool.hatch.build.targets.wheel.force-include]

From 4a41ff0283f417549440474461dd7af3bf25f99a Mon Sep 17 00:00:00 2001
From: Nancy <najilau@ucsc.edu>
Date: Tue, 30 Jun 2026 19:01:42 -0700
Subject: [PATCH 2/3] Refine Harbor runtime CLI integration

---
 docs/v6/advanced/harbor-convert.mdx |  11 +-
 docs/v6/reference/cli.mdx           |   6 +-
 hud/cli/eval.py                     |  86 ++++++----
 hud/cli/tests/test_eval_config.py   |  59 ++++++-
 integrations/__init__.py            |  12 +-
 integrations/harbor_runtime.py      | 163 ++++++++++---------
 integrations/tests/test_harbor.py   | 241 ++++++++++++++++++++++++++++
 7 files changed, 452 insertions(+), 126 deletions(-)

diff --git a/docs/v6/advanced/harbor-convert.mdx b/docs/v6/advanced/harbor-convert.mdx
index 1680fab16..21dbd721b 100644
--- a/docs/v6/advanced/harbor-convert.mdx
+++ b/docs/v6/advanced/harbor-convert.mdx
@@ -9,8 +9,9 @@ task dirs - is a *frontend* that loads into the same primitives (`Environment`,
 `Task`, `Taskset`). Integrations are **loaders, not converters**: no codegen
 roundtrip to run foreign tasks. The Harbor integration lives in the SDK repo at
 [`integrations/harbor.py`](https://github.com/hud-evals/hud-python/blob/main/integrations/harbor.py)
-- a recipe built only on the public SDK surface; copy it into your project or
-run it from a checkout.
+— a public-surface loader that maps Harbor folders into SDK primitives. The
+included `HarborRuntime` is maintained with the SDK for local Docker execution;
+copy the loader into your project or run it from a checkout.
 
 ## Prerequisites
 
@@ -44,11 +45,11 @@ from integrations.harbor import HarborRuntime
 job = await taskset.run(agent, runtime=HarborRuntime("./harbor_tasks"))
 ```
 
-The eval CLI also detects local Harbor task directories and datasets when using
-local runtime placement:
+The eval CLI can run local Harbor task directories and datasets when you opt
+into the Harbor source format:
 
 ```bash
-hud eval ./harbor_tasks claude --task-ids cancel-async-tasks --max-steps 30
+hud eval ./harbor_tasks claude --format harbor --task-ids cancel-async-tasks --max-steps 30
 ```
 
 ## Export HUD tasks to Harbor
diff --git a/docs/v6/reference/cli.mdx b/docs/v6/reference/cli.mdx
index 852963982..8b18431b1 100644
--- a/docs/v6/reference/cli.mdx
+++ b/docs/v6/reference/cli.mdx
@@ -105,6 +105,7 @@ For a platform taskset, pass its name or id directly: `hud eval "My Tasks" claud
 | `--config`, `-c` | Agent config `key=value` (repeatable). |
 | `--verbose`, `-v` | Show agent logs (step progress, tool calls) for batch runs too. |
 | `--very-verbose`, `-vv` | Debug-level logs. |
+| `--format` | Task source format: `hud` (default) or `harbor`. |
 | `--runtime` | Placement: `local`, `hud` (HUD runtime tunnel), or `tcp://host:port`. Defaults to `local` for a tasks file; platform tasksets default to remote hosted execution. |
 | `--remote` | Run the whole rollout remotely on the HUD platform. |
 | `--yes`, `-y` | Skip confirmation prompt. |
@@ -133,8 +134,9 @@ hud sync env                   # sync environment metadata
 ```
 
 External benchmark formats (currently Harbor) load directly into the runtime
-as `Taskset`s - no conversion step. Local Harbor directories run with the Harbor
-Docker-backed runtime provider. See [Harbor interop](/v6/advanced/harbor-convert).
+as `Taskset`s - no conversion step. For local Harbor directories, opt in with
+`--format harbor` so the CLI uses the Harbor loader and Docker-backed runtime
+provider. See [Harbor interop](/v6/advanced/harbor-convert).
 
 ## Inspect
 
diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 9978eb402..01bf0883c 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -63,6 +63,7 @@ def _resolve_model_from_catalog(model_id: str) -> tuple[AgentType, str] | None:
 
 _CONFIG_PATH = ".hud_eval.toml"
 _PLACEMENT_CONFLICT_ERROR = "--runtime and --remote are mutually exclusive placement options"
+_SOURCE_FORMATS = ("hud", "harbor")
 
 
 def _resolve_env_vars(obj: Any) -> Any:
@@ -167,6 +168,7 @@ class AgentPreset:
 # very_verbose = true
 # auto_respond = true
 # gateway = false  # Route LLM API calls through HUD Gateway
+# format = "hud"  # hud or harbor
 # runtime = "local"  # local, hud, or tcp://host:port
 # remote = false  # Run the whole rollout remotely on HUD
 
@@ -264,6 +266,7 @@ class EvalConfig(BaseModel):
         "group_size",
         "auto_respond",
         "gateway",
+        "format",
         "runtime",
         "remote",
     }
@@ -279,6 +282,9 @@ class EvalConfig(BaseModel):
     auto_respond: bool | None = None
     group_size: int = 1
     gateway: bool = False
+    #: Source format. ``None``/``hud`` means normal HUD task source loading;
+    #: ``harbor`` opts into the Harbor integration loader/runtime.
+    format: str | None = None
     #: Placement: "local" (spawn each row's env from the source), "hud"
     #: (HUD runtime tunnel), or a tcp:// url of an already-served env.
     #: ``None`` means "infer from the source": a local file runs locally, a
@@ -306,6 +312,20 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None:
                 ) from None
         return v
 
+    @field_validator("format", mode="before")
+    @classmethod
+    def _parse_format(cls, v: Any) -> str | None:
+        if v is None:
+            return None
+        if not isinstance(v, str):
+            return v
+        normalized = v.strip().lower()
+        if normalized in ("", "hud"):
+            return None
+        if normalized in _SOURCE_FORMATS:
+            return normalized
+        raise ValueError(f"Invalid format: {v}. Must be one of: {', '.join(_SOURCE_FORMATS)}")
+
     def source_is_local_file(self) -> bool:
         """Whether ``source`` points at an on-disk taskset (vs. a platform slug/id)."""
         return self.source is not None and Path(self.source).exists()
@@ -319,6 +339,13 @@ def resolve_runtime(self) -> EvalConfig:
         ``--runtime`` is always honored, except ``local`` against a platform
         taskset, which has no env to spawn.
         """
+        if self.format == "harbor":
+            if not self.source_is_local_file():
+                hud_console.error("--format harbor requires a local Harbor task directory")
+                raise typer.Exit(1)
+            if self.remote or (self.runtime is not None and self.runtime != "local"):
+                hud_console.error("--format harbor currently supports only local runtime placement")
+                raise typer.Exit(1)
         if self.runtime is None:
             if self.source_is_local_file():
                 return self.model_copy(update={"runtime": "local"})
@@ -502,6 +529,7 @@ def merge_cli(
         gateway: bool = False,
         config: list[str] | None = None,
         task_ids: str | None = None,
+        format: str | None = None,
         runtime: str | None = None,
         remote: bool = False,
     ) -> EvalConfig:
@@ -517,6 +545,7 @@ def merge_cli(
                 "max_concurrent": max_concurrent,
                 "max_steps": max_steps,
                 "group_size": group_size,
+                "format": format,
                 "runtime": runtime,
             }.items()
             if value is not None
@@ -604,6 +633,8 @@ def display(self) -> None:
         table.add_column("Value", style="green")
 
         table.add_row("source", str(self.source or "-"))
+        if self.format:
+            table.add_row("format", self.format)
         table.add_row("runtime", str(self.runtime or "-"))
         table.add_row("agent", self.agent_type.value if self.agent_type else "-")
         if self.task_ids:
@@ -728,32 +759,29 @@ def _spawn_target(source: Path) -> Path:
     return resolved.parent
 
 
-def _is_harbor_source(source_path: Path | None) -> bool:
-    if source_path is None or not source_path.exists():
-        return False
-    if not source_path.is_dir():
-        return False
-    from integrations.harbor import detect
-
-    return detect(source_path)
-
-
-def _load_local_taskset(source_path: Path) -> tuple[Any, str]:
+def _load_local_taskset(source_path: Path, source_format: str | None) -> Any:
     from hud.eval import Taskset
 
-    if _is_harbor_source(source_path):
+    format_name = source_format or "hud"
+    if format_name == "hud":
+        taskset = Taskset.from_file(source_path)
+        if len(taskset) == 0:
+            from integrations.harbor import detect
+
+            if detect(source_path):
+                hud_console.hint(
+                    f"{source_path} looks like a Harbor task directory; "
+                    "rerun with --format harbor to load it."
+                )
+        return taskset
+    if format_name == "harbor":
         from integrations.harbor import load
 
-        return load(source_path), "harbor"
-    return Taskset.from_file(source_path), "hud"
+        return load(source_path)
+    raise ValueError(f"unsupported task source format: {format_name}")
 
 
-def _resolve_placement(
-    cfg: EvalConfig,
-    source_path: Path | None,
-    *,
-    source_kind: str = "hud",
-) -> Any:
+def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any:
     """Map the config's ``runtime`` onto a placement for ``Taskset.run``.
 
     "local" spawns each row's env from the source next to the tasks file;
@@ -769,7 +797,7 @@ def _resolve_placement(
     if cfg.runtime == "local":
         if source_path is None:
             raise ValueError("local placement requires a local source path")
-        if source_kind == "harbor":
+        if cfg.format == "harbor":
             from integrations.harbor import HarborRuntime
 
             return HarborRuntime(source_path)
@@ -798,11 +826,10 @@ async def _run_evaluation(cfg: EvalConfig) -> Any:
 
     source_path = Path(cfg.source)
     is_local = source_path.exists()
-    source_kind = "api"
     if is_local:
         hud_console.info(f"Loading tasks from: {cfg.source}")
         try:
-            taskset, source_kind = _load_local_taskset(source_path)
+            taskset = _load_local_taskset(source_path, cfg.format)
         except Exception as e:
             hud_console.error(f"Failed to load tasks from {cfg.source}: {e}")
             raise typer.Exit(1) from e
@@ -862,11 +889,7 @@ async def _run_evaluation(cfg: EvalConfig) -> Any:
         )
 
     agent = _build_agent(cfg)
-    placement = _resolve_placement(
-        cfg,
-        source_path if is_local else None,
-        source_kind=source_kind,
-    )
+    placement = _resolve_placement(cfg, source_path if is_local else None)
 
     job = await taskset.run(
         agent,
@@ -922,6 +945,11 @@ def eval_command(
     gateway: bool = typer.Option(
         False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
     ),
+    format: str | None = typer.Option(
+        None,
+        "--format",
+        help="Task source format: hud (default) or harbor.",
+    ),
     runtime: str | None = typer.Option(
         None,
         "--runtime",
@@ -942,6 +970,7 @@ def eval_command(
         hud eval "My Tasks" claude-sonnet-4-6 --full   # Platform taskset, run on the platform
         hud eval tasks.json claude --config max_tokens=32768
         hud eval tasks.json claude --gateway           # Route LLM calls through HUD Gateway
+        hud eval ./harbor_tasks claude --format harbor # Run Harbor task dirs locally
         hud eval tasks.json claude-sonnet-4-6 --runtime hud  # Use HUD runtime tunnel
         hud eval tasks.json claude-sonnet-4-6 --remote       # Execute rollout remotely
     """
@@ -972,6 +1001,7 @@ def eval_command(
             group_size=group_size,
             config=config,
             gateway=gateway,
+            format=format,
             runtime=runtime,
             remote=remote,
         )
diff --git a/hud/cli/tests/test_eval_config.py b/hud/cli/tests/test_eval_config.py
index e1183860c..bbd0d4f96 100644
--- a/hud/cli/tests/test_eval_config.py
+++ b/hud/cli/tests/test_eval_config.py
@@ -153,30 +153,77 @@ def test_resolve_placement_runtime_hud_uses_tunnel(
     assert isinstance(placement, HUDRuntime)
 
 
-def test_load_local_taskset_uses_harbor_loader_for_harbor_dirs(tmp_path: Path) -> None:
+def test_load_local_taskset_uses_hud_loader_by_default(tmp_path: Path) -> None:
     _write_harbor_task(tmp_path)
 
-    taskset, source_kind = eval_mod._load_local_taskset(tmp_path)
+    taskset = eval_mod._load_local_taskset(tmp_path, None)
+
+    assert len(taskset) == 0
+
+
+def test_load_local_taskset_hints_harbor_format_on_zero_task_harbor_dir(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    _write_harbor_task(tmp_path)
+    hints: list[str] = []
+    monkeypatch.setattr(eval_mod.hud_console, "hint", lambda message, **_: hints.append(message))
+
+    taskset = eval_mod._load_local_taskset(tmp_path, None)
+
+    assert len(taskset) == 0
+    assert any("--format harbor" in hint for hint in hints)
+
+
+def test_load_local_taskset_rejects_unknown_format(tmp_path: Path) -> None:
+    with pytest.raises(ValueError, match="unsupported task source format"):
+        eval_mod._load_local_taskset(tmp_path, "unknown")
+
+
+def test_load_local_taskset_uses_harbor_loader_when_format_is_harbor(tmp_path: Path) -> None:
+    _write_harbor_task(tmp_path)
+
+    taskset = eval_mod._load_local_taskset(tmp_path, "harbor")
 
-    assert source_kind == "harbor"
     assert len(taskset) == 1
     assert taskset["demo-task"].id == "demo-task"
 
 
-def test_resolve_placement_local_harbor_uses_harbor_runtime(tmp_path: Path) -> None:
+def test_resolve_placement_local_harbor_format_uses_harbor_runtime(tmp_path: Path) -> None:
     from integrations.harbor import HarborRuntime
 
     _write_harbor_task(tmp_path)
 
     placement = eval_mod._resolve_placement(
-        EvalConfig(runtime="local"),
+        EvalConfig(runtime="local", format="harbor"),
         tmp_path,
-        source_kind="harbor",
     )
 
     assert isinstance(placement, HarborRuntime)
 
 
+def test_resolve_placement_local_hud_format_uses_local_runtime(tmp_path: Path) -> None:
+    from hud.eval import LocalRuntime
+
+    _write_harbor_task(tmp_path)
+
+    placement = eval_mod._resolve_placement(EvalConfig(runtime="local"), tmp_path)
+
+    assert isinstance(placement, LocalRuntime)
+
+
+def test_harbor_format_rejects_nonlocal_source() -> None:
+    with pytest.raises(typer.Exit):
+        EvalConfig(source="platform/taskset", format="harbor").resolve_runtime()
+
+
+def test_harbor_format_rejects_nonlocal_runtime(tmp_path: Path) -> None:
+    _write_harbor_task(tmp_path)
+
+    with pytest.raises(typer.Exit):
+        EvalConfig(source=str(tmp_path), format="harbor", runtime="hud").resolve_runtime()
+
+
 def test_resolve_placement_remote_uses_hosted_runtime(
     tmp_path: Path,
     monkeypatch: pytest.MonkeyPatch,
diff --git a/integrations/__init__.py b/integrations/__init__.py
index e4a817b4e..baa460f4d 100644
--- a/integrations/__init__.py
+++ b/integrations/__init__.py
@@ -5,12 +5,12 @@
 primitives. Integrations are **loaders, not converters**: no codegen roundtrip
 to run foreign tasks.
 
-This package lives outside ``hud`` on purpose: each module is a recipe built
-**only on the public SDK surface** (``Environment``, ``Task``,
-``Taskset``, ``Runtime``) — that constraint is the proof the core is
-flexible. Copy a module into your project or run it from a checkout. The CLI may
-call selected integrations explicitly for polished interop paths, but the
-integration contract itself stays independent of private SDK hooks.
+This package lives outside ``hud`` on purpose: loaders are recipes built on the
+public SDK surface (``Environment``, ``Task``, ``Taskset``, ``Runtime``). Copy a
+loader into your project or run it from a checkout. The CLI may call selected
+integrations explicitly for polished interop paths. A repo-maintained
+integration may also expose a local provider for that explicit CLI path; that
+provider is SDK implementation code, not the portable loader contract.
 
 The contract: an integration module exposes ``detect(path) -> bool`` and
 ``load(path) -> Taskset``. Placement stays an execution-time concern — loaders
diff --git a/integrations/harbor_runtime.py b/integrations/harbor_runtime.py
index 872399bc1..5d644525a 100644
--- a/integrations/harbor_runtime.py
+++ b/integrations/harbor_runtime.py
@@ -2,12 +2,15 @@
 
 from __future__ import annotations
 
+import asyncio
 import contextlib
 import json
 import shlex
 import shutil
 import tempfile
+import tomllib
 import uuid
+from collections.abc import AsyncGenerator  # noqa: TC003 - env.template resolves this at runtime.
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
@@ -16,7 +19,7 @@
 from integrations.harbor_common import _hash_directory, _slugify, _task_dirs
 
 if TYPE_CHECKING:
-    from collections.abc import AsyncGenerator, AsyncIterator
+    from collections.abc import AsyncIterator
 
     import asyncssh
 
@@ -35,7 +38,8 @@ class HarborRuntime:
     The agent receives normal HUD SSH/SFTP access; shell commands execute inside
     the main container via ``docker exec`` while file transfer edits the mounted
     host workspace. Grading runs the Harbor ``tests/test.sh`` inside the same
-    main container and reads ``/logs/verifier/reward.json`` or ``reward.txt``.
+    main container, bounded by the task's ``[verifier] timeout_sec``, and reads
+    ``/logs/verifier/reward.json`` or ``reward.txt``.
     """
 
     def __init__(
@@ -49,7 +53,6 @@ def __init__(
         self._task_dirs = {task_dir.name: task_dir for task_dir in _task_dirs(self.root)}
         if not self._task_dirs:
             raise ValueError(f"no Harbor tasks found in {path}")
-        self._image_cache: dict[Path, str] = {}
 
     @contextlib.asynccontextmanager
     async def __call__(self, task: Task) -> AsyncIterator[Runtime]:
@@ -77,53 +80,26 @@ async def __call__(self, task: Task) -> AsyncIterator[Runtime]:
 
             compose_file = _compose_file(env_dir)
             if compose_file is not None:
-                async with self._compose_container(
-                    task,
-                    compose_file,
-                    workspace,
-                    tests_dir,
-                    logs,
-                    preserved_paths,
-                ) as (
-                    container,
-                    provider,
-                ):
-                    env = self._environment_for(task, task_dir, workspace, logs, container)
-                    async with _local(env) as runtime:
-                        yield Runtime(
-                            runtime.url,
-                            params={
-                                **runtime.params,
-                                "provider": provider,
-                                "container": container,
-                                "ready_timeout": self.ready_timeout,
-                            },
-                            config=runtime.config,
-                        )
+                acquire = self._compose_container(
+                    task, compose_file, workspace, tests_dir, logs, preserved_paths
+                )
             else:
-                async with self._single_container(
-                    task,
-                    task_dir,
-                    workspace,
-                    tests_dir,
-                    logs,
-                    preserved_paths,
-                ) as (
-                    container,
-                    provider,
-                ):
-                    env = self._environment_for(task, task_dir, workspace, logs, container)
-                    async with _local(env) as runtime:
-                        yield Runtime(
-                            runtime.url,
-                            params={
-                                **runtime.params,
-                                "provider": provider,
-                                "container": container,
-                                "ready_timeout": self.ready_timeout,
-                            },
-                            config=runtime.config,
-                        )
+                acquire = self._single_container(
+                    task, task_dir, workspace, tests_dir, logs, preserved_paths
+                )
+            async with acquire as (container, provider):
+                env = self._environment_for(task, task_dir, workspace, logs, container)
+                async with _local(env) as runtime:
+                    yield Runtime(
+                        runtime.url,
+                        params={
+                            **runtime.params,
+                            "provider": provider,
+                            "container": container,
+                            "ready_timeout": self.ready_timeout,
+                        },
+                        config=runtime.config,
+                    )
 
     @contextlib.asynccontextmanager
     async def _single_container(
@@ -137,8 +113,8 @@ async def _single_container(
     ) -> AsyncIterator[tuple[str, str]]:
         from hud.eval.runtime import _docker
 
-        image = await self._image_for(task_dir)
         env_dir = task_dir / "environment"
+        image = await self._build_image(env_dir)
         await _restore_image_generated_files(image, workspace)
         container_name = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}"
         preserved_volume_args = [arg for path in preserved_paths for arg in ("--volume", path)]
@@ -165,10 +141,10 @@ async def _single_container(
         try:
             yield container, "harbor"
         finally:
-            await _release_mount_permissions(container)
+            with contextlib.suppress(Exception):
+                await _release_mount_permissions(container)
             await _docker("rm", "--force", "--volumes", container, check=False)
             await _docker("image", "rm", image, check=False)
-            self._image_cache.pop(env_dir, None)
 
     @contextlib.asynccontextmanager
     async def _compose_container(
@@ -195,15 +171,23 @@ async def _compose_container(
             newline="\n",
         )
         compose_args = ("compose", "-f", str(compose_file), "-f", str(overlay), "-p", project)
-        await _docker(*compose_args, "up", "--detach", "--build")
-        out, _ = await _docker(*compose_args, "ps", "-q", "main")
-        container = out.strip()
-        if not container:
-            raise RuntimeError(f"docker compose project {project} did not create a main service")
+        container = ""
         try:
+            await _docker(*compose_args, "up", "--detach", "--build")
+            out, _ = await _docker(*compose_args, "ps", "-q", "main")
+            container = out.strip()
+            if not container:
+                raise RuntimeError(
+                    f"docker compose project {project} did not create a main service"
+                )
+            if _dockerfile_declared_generated_app_files(workspace):
+                image_out, _ = await _docker("inspect", "--format", "{{.Image}}", container)
+                await _restore_image_generated_files(image_out.strip(), workspace)
             yield container, "harbor-compose"
         finally:
-            await _release_mount_permissions(container)
+            if container:
+                with contextlib.suppress(Exception):
+                    await _release_mount_permissions(container)
             await _docker(
                 *compose_args,
                 "down",
@@ -214,16 +198,11 @@ async def _compose_container(
                 check=False,
             )
 
-    async def _image_for(self, task_dir: Path) -> str:
+    async def _build_image(self, env_dir: Path) -> str:
         from hud.eval.runtime import _docker
 
-        env_dir = task_dir / "environment"
-        cached = self._image_cache.get(env_dir)
-        if cached is not None:
-            return cached
-        tag = f"hud-harbor:{_hash_directory(env_dir)}"
+        tag = f"hud-harbor:{_hash_directory(env_dir)}-{uuid.uuid4().hex[:8]}"
         await _docker("build", "--tag", tag, str(env_dir))
-        self._image_cache[env_dir] = tag
         return tag
 
     def _environment_for(
@@ -236,6 +215,7 @@ def _environment_for(
     ) -> Environment:
         env = Environment(task.env)
         workspace_daemon = _DockerWorkspace(workspace, container=container, guest_path="/app")
+        verifier_timeout = _verifier_timeout(task_dir)
 
         @env.initialize
         async def _up() -> None:
@@ -249,25 +229,42 @@ async def _down() -> None:
         @env.template(id=task.id, description=f"Harbor task {task.id}")
         async def _run_harbor_task() -> AsyncGenerator[Any, Any]:
             answer = yield (task_dir / "instruction.md").read_text(encoding="utf-8")
-            yield await self._grade(container, logs, answer)
+            yield await self._grade(container, logs, answer, verifier_timeout=verifier_timeout)
 
         return env
 
-    async def _grade(self, container: str, logs: Path, answer: Any) -> dict[str, Any]:
-        from hud.eval.runtime import _docker
-
+    async def _grade(
+        self, container: str, logs: Path, answer: Any, *, verifier_timeout: float
+    ) -> dict[str, Any]:
         answer_file = logs / "agent_answer.txt"
         answer_file.parent.mkdir(parents=True, exist_ok=True)
         answer_file.write_text("" if answer is None else str(answer), encoding="utf-8")
-        out, err = await _docker(
+        proc = await asyncio.create_subprocess_exec(
+            "docker",
             "exec",
             "--workdir",
             "/app",
             container,
             "bash",
             "/tests/test.sh",
-            check=False,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
         )
+        try:
+            out_bytes, err_bytes = await asyncio.wait_for(
+                proc.communicate(), timeout=verifier_timeout
+            )
+        except TimeoutError:
+            proc.kill()
+            await proc.wait()
+            return {
+                "score": 0.0,
+                "isError": True,
+                "content": f"Harbor verifier timed out after {verifier_timeout:.0f}s",
+                "info": {"verifier_timeout_sec": verifier_timeout},
+            }
+        out = out_bytes.decode("utf-8", "replace")
+        err = err_bytes.decode("utf-8", "replace")
         reward, info = _read_harbor_reward(logs / "verifier")
         info.update(
             {
@@ -293,8 +290,6 @@ def __init__(self, *args: Any, container: str, **kwargs: Any) -> None:
         self._container = container
 
     async def _handle_process(self, process: asyncssh.SSHServerProcess[bytes]) -> None:
-        import asyncio
-
         command = process.command or "bash -l"
         proc = await asyncio.create_subprocess_exec(
             "docker",
@@ -329,6 +324,22 @@ async def _handle_process(self, process: asyncssh.SSHServerProcess[bytes]) -> No
         process.exit(proc.returncode if proc.returncode is not None else 0)
 
 
+_DEFAULT_VERIFIER_TIMEOUT = 600.0
+
+
+def _verifier_timeout(task_dir: Path) -> float:
+    """The task's ``[verifier] timeout_sec``, or the Harbor default."""
+    try:
+        config: dict[str, Any] = tomllib.loads((task_dir / "task.toml").read_text("utf-8"))
+    except (OSError, tomllib.TOMLDecodeError):
+        return _DEFAULT_VERIFIER_TIMEOUT
+    verifier = config.get("verifier")
+    timeout = verifier.get("timeout_sec") if isinstance(verifier, dict) else None
+    if isinstance(timeout, int | float) and not isinstance(timeout, bool) and timeout > 0:
+        return float(timeout)
+    return _DEFAULT_VERIFIER_TIMEOUT
+
+
 def _read_harbor_reward(verifier_logs: Path) -> tuple[float | None, dict[str, Any]]:
     reward_json = verifier_logs / "reward.json"
     if reward_json.is_file():
@@ -340,12 +351,6 @@ def _read_harbor_reward(verifier_logs: Path) -> tuple[float | None, dict[str, An
                 value = data.get(key)
                 if isinstance(value, int | float):
                     return float(value), {"reward_file": str(reward_json), "reward_json": data}
-            numeric = [float(value) for value in data.values() if isinstance(value, int | float)]
-            if numeric:
-                return sum(numeric) / len(numeric), {
-                    "reward_file": str(reward_json),
-                    "reward_json": data,
-                }
         return None, {"reward_file": str(reward_json), "reward_parse_error": "no numeric reward"}
 
     reward_txt = verifier_logs / "reward.txt"
diff --git a/integrations/tests/test_harbor.py b/integrations/tests/test_harbor.py
index c67b7d924..623f8560b 100644
--- a/integrations/tests/test_harbor.py
+++ b/integrations/tests/test_harbor.py
@@ -2,11 +2,14 @@
 
 from __future__ import annotations
 
+import asyncio
+import json
 import textwrap
 from typing import TYPE_CHECKING
 
 import pytest
 
+from hud.eval import Task
 from integrations.harbor import HarborRuntime, detect, export, load
 from integrations.harbor_runtime import (
     _compose_file,
@@ -16,6 +19,8 @@
     _ensure_start_script,
     _host_path_for_app_file,
     _preserved_image_paths,
+    _read_harbor_reward,
+    _verifier_timeout,
 )
 
 from .conftest import make_harbor_task
@@ -89,6 +94,96 @@ def test_harbor_runtime_accepts_dataset_dirs(single_task: Path) -> None:
     assert single_task.name in runtime._task_dirs
 
 
+async def test_harbor_runtime_builds_unique_images_per_acquisition(
+    single_task: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    calls: list[tuple[tuple[str, ...], bool]] = []
+
+    async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
+        calls.append((args, check))
+        return "", ""
+
+    monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
+    runtime = HarborRuntime(single_task.parent)
+
+    first = await runtime._build_image(single_task / "environment")
+    second = await runtime._build_image(single_task / "environment")
+
+    assert first != second
+    assert first.startswith("hud-harbor:")
+    assert second.startswith("hud-harbor:")
+    assert [args[2] for args, _ in calls] == [first, second]
+
+
+async def test_compose_container_cleans_up_after_failed_up(
+    single_task: Path,
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    compose = single_task / "environment" / "docker-compose.yaml"
+    compose.write_text("services:\n  main:\n    build: .\n", encoding="utf-8")
+    calls: list[tuple[tuple[str, ...], bool]] = []
+
+    async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
+        calls.append((args, check))
+        if args[-3:] == ("up", "--detach", "--build"):
+            raise RuntimeError("compose failed")
+        return "", ""
+
+    monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
+    runtime = HarborRuntime(single_task.parent)
+
+    with pytest.raises(RuntimeError, match="compose failed"):
+        async with runtime._compose_container(
+            Task(env="bench", id=single_task.name),
+            compose,
+            tmp_path / "workspace",
+            single_task / "tests",
+            tmp_path / "logs",
+            [],
+        ):
+            raise AssertionError("compose acquisition should not yield")
+
+    assert any(
+        args[-5:] == ("down", "--volumes", "--remove-orphans", "--rmi", "local") and check is False
+        for args, check in calls
+    )
+
+
+async def test_compose_container_cleans_up_when_main_service_is_missing(
+    single_task: Path,
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    compose = single_task / "environment" / "docker-compose.yaml"
+    compose.write_text("services:\n  api:\n    build: .\n", encoding="utf-8")
+    calls: list[tuple[tuple[str, ...], bool]] = []
+
+    async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
+        calls.append((args, check))
+        return "", ""
+
+    monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
+    runtime = HarborRuntime(single_task.parent)
+
+    with pytest.raises(RuntimeError, match="did not create a main service"):
+        async with runtime._compose_container(
+            Task(env="bench", id=single_task.name),
+            compose,
+            tmp_path / "workspace",
+            single_task / "tests",
+            tmp_path / "logs",
+            [],
+        ):
+            raise AssertionError("compose acquisition should not yield")
+
+    assert any(
+        args[-5:] == ("down", "--volumes", "--remove-orphans", "--rmi", "local") and check is False
+        for args, check in calls
+    )
+
+
 def test_compose_file_detection_prefers_harbor_names(tmp_path: Path) -> None:
     env = tmp_path / "environment"
     env.mkdir()
@@ -247,6 +342,152 @@ def test_dockerfile_declared_generated_app_files_ignores_non_app_or_non_db_paths
     assert _host_path_for_app_file(workspace, "/tmp/app.db") is None
 
 
+async def test_compose_container_restores_image_generated_db_files(
+    single_task: Path,
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    compose = single_task / "environment" / "docker-compose.yaml"
+    compose.write_text("services:\n  main:\n    build: .\n", encoding="utf-8")
+    workspace = tmp_path / "workspace"
+    workspace.mkdir()
+    (workspace / "Dockerfile").write_text(
+        "FROM python:3.11-slim\nENV DB_PATH=/app/data/app.db\n", encoding="utf-8"
+    )
+    calls: list[tuple[str, ...]] = []
+
+    async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
+        calls.append(args)
+        if args[0] == "compose" and args[-3:] == ("ps", "-q", "main"):
+            return "maincontainer\n", ""
+        if args[0] == "inspect":
+            return "sha256:mainimage\n", ""
+        if args[0] == "create":
+            return "tempcontainer\n", ""
+        return "", ""
+
+    monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
+    runtime = HarborRuntime(single_task.parent)
+
+    async with runtime._compose_container(
+        Task(env="bench", id=single_task.name),
+        compose,
+        workspace,
+        single_task / "tests",
+        tmp_path / "logs",
+        [],
+    ):
+        pass
+
+    assert ("inspect", "--format", "{{.Image}}", "maincontainer") in calls
+    assert (
+        "cp",
+        "tempcontainer:/app/data/app.db",
+        str(workspace / "data" / "app.db"),
+    ) in calls
+
+
+def test_read_harbor_reward_prefers_reward_and_score_keys(tmp_path: Path) -> None:
+    verifier = tmp_path / "verifier"
+    verifier.mkdir()
+    (verifier / "reward.json").write_text(json.dumps({"reward": 0.5, "total": 5}), "utf-8")
+
+    reward, info = _read_harbor_reward(verifier)
+
+    assert reward == 0.5
+    assert info["reward_json"] == {"reward": 0.5, "total": 5}
+
+
+def test_read_harbor_reward_rejects_dict_without_reward_or_score(tmp_path: Path) -> None:
+    verifier = tmp_path / "verifier"
+    verifier.mkdir()
+    (verifier / "reward.json").write_text(json.dumps({"passed": 3, "total": 5}), "utf-8")
+
+    reward, info = _read_harbor_reward(verifier)
+
+    assert reward is None
+    assert info["reward_parse_error"] == "no numeric reward"
+
+
+def test_verifier_timeout_reads_task_toml(single_task: Path) -> None:
+    assert _verifier_timeout(single_task) == 120.0
+
+
+def test_verifier_timeout_defaults_when_missing_or_invalid(tmp_path: Path) -> None:
+    no_verifier = tmp_path / "no-verifier"
+    no_verifier.mkdir()
+    (no_verifier / "task.toml").write_text('[metadata]\ncategory = "systems"\n', "utf-8")
+    broken = tmp_path / "broken"
+    broken.mkdir()
+    (broken / "task.toml").write_text("not toml [", "utf-8")
+
+    assert _verifier_timeout(no_verifier) == 600.0
+    assert _verifier_timeout(broken) == 600.0
+    assert _verifier_timeout(tmp_path / "missing") == 600.0
+
+
+async def test_grade_reads_reward_after_verifier_completes(
+    single_task: Path,
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    logs = tmp_path / "logs"
+    (logs / "verifier").mkdir(parents=True)
+    (logs / "verifier" / "reward.txt").write_text("1.0\n", "utf-8")
+
+    class FakeProc:
+        returncode = 0
+
+        async def communicate(self) -> tuple[bytes, bytes]:
+            return b"verifier out", b""
+
+    async def fake_exec(*args: str, **kwargs: object) -> FakeProc:
+        assert args[:2] == ("docker", "exec")
+        return FakeProc()
+
+    monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_exec)
+    runtime = HarborRuntime(single_task.parent)
+
+    result = await runtime._grade("container", logs, "done", verifier_timeout=120.0)
+
+    assert result["score"] == 1.0
+    assert result["info"]["stdout"] == "verifier out"
+    assert (logs / "agent_answer.txt").read_text("utf-8") == "done"
+
+
+async def test_grade_times_out_when_verifier_hangs(
+    single_task: Path,
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    class FakeProc:
+        killed = False
+
+        async def communicate(self) -> tuple[bytes, bytes]:
+            await asyncio.sleep(3600)
+            raise AssertionError("unreachable")
+
+        def kill(self) -> None:
+            self.killed = True
+
+        async def wait(self) -> int:
+            return -9
+
+    proc = FakeProc()
+
+    async def fake_exec(*args: str, **kwargs: object) -> FakeProc:
+        return proc
+
+    monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_exec)
+    runtime = HarborRuntime(single_task.parent)
+
+    result = await runtime._grade("container", tmp_path / "logs", None, verifier_timeout=0.05)
+
+    assert result["isError"] is True
+    assert "timed out" in result["content"]
+    assert proc.killed
+
+
 # ─── export: HUD tasks -> Harbor task folders ───────────────────────────
 
 _ENV_PY = """\

From 455f23c96e21c5039cae063819da4415ac97af9b Mon Sep 17 00:00:00 2001
From: Nancy <najilau@ucsc.edu>
Date: Fri, 3 Jul 2026 11:15:37 +0800
Subject: [PATCH 3/3] Generalize Harbor runtime via image-workdir
 materialization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the Dockerfile-parsing fidelity heuristics (start-script
recreation, mkdir-dir restoration, node_modules/vendor/dist submounts,
seeded-SQLite restoration, and the hardcoded /app workdir) with a single
mechanism: after building the task image, copy its actual working
directory onto the host workspace and bind-mount that back over the same
guest path. The workspace is then the image's real workdir — source plus
every build-generated file, with original mode bits — so nothing the
build produced is shadowed by the editable mount, and the guest path is
derived from the image's WORKDIR instead of assumed to be /app.

This removes ~330 lines of corpus-tuned parsing and makes the runner
faithful to any Harbor/terminal-bench image shape rather than the
NOVATStyle export specifically. Validated on real Docker across all six
artifact classes the heuristics used to cover (compose+postgres,
gunicorn start script, node_modules, mkdir dirs, node dist, seeded
sqlite): all return reward 0.0 with is_error false and clean teardown.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 integrations/harbor_runtime.py    | 430 +++++++-----------------------
 integrations/tests/test_harbor.py | 200 +++-----------
 2 files changed, 129 insertions(+), 501 deletions(-)

diff --git a/integrations/harbor_runtime.py b/integrations/harbor_runtime.py
index 5d644525a..50277726c 100644
--- a/integrations/harbor_runtime.py
+++ b/integrations/harbor_runtime.py
@@ -5,8 +5,7 @@
 import asyncio
 import contextlib
 import json
-import shlex
-import shutil
+import os
 import tempfile
 import tomllib
 import uuid
@@ -30,16 +29,19 @@
 class HarborRuntime:
     """Run Harbor task directories through HUD's local rollout engine.
 
-    The provider builds the Harbor task's ``environment/`` Docker context, runs
-    a fresh container with a writable host workspace mounted at ``/app``, and
-    serves a small HUD control channel from the host process. If the task ships a
-    ``docker-compose.yaml``/``.yml``, the provider starts it with an overlay that
-    keeps the ``main`` service idle while preserving sidecars such as databases.
-    The agent receives normal HUD SSH/SFTP access; shell commands execute inside
-    the main container via ``docker exec`` while file transfer edits the mounted
-    host workspace. Grading runs the Harbor ``tests/test.sh`` inside the same
-    main container, bounded by the task's ``[verifier] timeout_sec``, and reads
-    ``/logs/verifier/reward.json`` or ``reward.txt``.
+    The provider builds the Harbor task's ``environment/`` Docker context, then
+    materializes the built image's working directory onto a writable host
+    workspace and bind-mounts it back over the same guest path. Because the
+    workspace is the image's actual working directory (source *plus* every file
+    the build generated — start scripts, installed dependencies, compiled output,
+    seeded databases — with their original mode bits), the agent sees exactly
+    what the image would run, and edits made over SFTP are visible to the running
+    process. If the task ships a ``docker-compose.yaml``/``.yml``, the provider
+    starts it with an overlay that keeps the ``main`` service idle while
+    preserving sidecars such as databases. Shell commands execute inside the main
+    container via ``docker exec``. Grading runs the Harbor ``tests/test.sh``
+    inside the same main container, bounded by the task's ``[verifier]
+    timeout_sec``, and reads ``/logs/verifier/reward.json`` or ``reward.txt``.
     """
 
     def __init__(
@@ -56,7 +58,7 @@ def __init__(
 
     @contextlib.asynccontextmanager
     async def __call__(self, task: Task) -> AsyncIterator[Runtime]:
-        from hud.eval.runtime import Runtime, _local
+        from hud.eval.runtime import Runtime, _docker, _local
 
         task_dir = self._task_dirs.get(task.id)
         if task_dir is None:
@@ -72,23 +74,23 @@ async def __call__(self, task: Task) -> AsyncIterator[Runtime]:
             tmp_path = Path(tmp)
             workspace = tmp_path / "workspace"
             logs = tmp_path / "logs"
-            shutil.copytree(env_dir, workspace)
-            _ensure_start_script(workspace)
-            _ensure_dockerfile_created_dirs(workspace)
-            preserved_paths = _preserved_image_paths(workspace)
+            workspace.mkdir()
             logs.mkdir(parents=True, exist_ok=True)
 
+            image = await self._build_image(env_dir)
+            workdir = await _image_workdir(image)
+            await _materialize_workspace(image, workspace, workdir)
+
             compose_file = _compose_file(env_dir)
             if compose_file is not None:
+                await _docker("image", "rm", image, check=False)
                 acquire = self._compose_container(
-                    task, compose_file, workspace, tests_dir, logs, preserved_paths
+                    task, compose_file, workspace, workdir, tests_dir, logs
                 )
             else:
-                acquire = self._single_container(
-                    task, task_dir, workspace, tests_dir, logs, preserved_paths
-                )
+                acquire = self._single_container(task, image, workspace, workdir, tests_dir, logs)
             async with acquire as (container, provider):
-                env = self._environment_for(task, task_dir, workspace, logs, container)
+                env = self._environment_for(task, task_dir, workspace, workdir, logs, container)
                 async with _local(env) as runtime:
                     yield Runtime(
                         runtime.url,
@@ -105,35 +107,30 @@ async def __call__(self, task: Task) -> AsyncIterator[Runtime]:
     async def _single_container(
         self,
         task: Task,
-        task_dir: Path,
+        image: str,
         workspace: Path,
+        workdir: str,
         tests_dir: Path,
         logs: Path,
-        preserved_paths: list[str],
     ) -> AsyncIterator[tuple[str, str]]:
         from hud.eval.runtime import _docker
 
-        env_dir = task_dir / "environment"
-        image = await self._build_image(env_dir)
-        await _restore_image_generated_files(image, workspace)
         container_name = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}"
-        preserved_volume_args = [arg for path in preserved_paths for arg in ("--volume", path)]
         out, _ = await _docker(
             "run",
             "--detach",
             "--name",
             container_name,
             "--workdir",
-            "/app",
+            workdir,
             "--entrypoint",
             "sleep",
             "--volume",
-            f"{workspace}:/app",
+            f"{workspace}:{workdir}",
             "--volume",
             f"{tests_dir}:/tests:ro",
             "--volume",
             f"{logs}:/logs",
-            *preserved_volume_args,
             image,
             "infinity",
         )
@@ -142,7 +139,7 @@ async def _single_container(
             yield container, "harbor"
         finally:
             with contextlib.suppress(Exception):
-                await _release_mount_permissions(container)
+                await _release_mount_permissions(container, workdir)
             await _docker("rm", "--force", "--volumes", container, check=False)
             await _docker("image", "rm", image, check=False)
 
@@ -152,21 +149,16 @@ async def _compose_container(
         task: Task,
         compose_file: Path,
         workspace: Path,
+        workdir: str,
         tests_dir: Path,
         logs: Path,
-        preserved_paths: list[str],
     ) -> AsyncIterator[tuple[str, str]]:
         from hud.eval.runtime import _docker
 
         project = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}"
         overlay = workspace.parent / "compose.hud.yaml"
         overlay.write_text(
-            _compose_overlay(
-                workspace=workspace,
-                tests_dir=tests_dir,
-                logs=logs,
-                preserved_paths=preserved_paths,
-            ),
+            _compose_overlay(workspace=workspace, workdir=workdir, tests_dir=tests_dir, logs=logs),
             encoding="utf-8",
             newline="\n",
         )
@@ -180,14 +172,11 @@ async def _compose_container(
                 raise RuntimeError(
                     f"docker compose project {project} did not create a main service"
                 )
-            if _dockerfile_declared_generated_app_files(workspace):
-                image_out, _ = await _docker("inspect", "--format", "{{.Image}}", container)
-                await _restore_image_generated_files(image_out.strip(), workspace)
             yield container, "harbor-compose"
         finally:
             if container:
                 with contextlib.suppress(Exception):
-                    await _release_mount_permissions(container)
+                    await _release_mount_permissions(container, workdir)
             await _docker(
                 *compose_args,
                 "down",
@@ -210,11 +199,12 @@ def _environment_for(
         task: Task,
         task_dir: Path,
         workspace: Path,
+        workdir: str,
         logs: Path,
         container: str,
     ) -> Environment:
         env = Environment(task.env)
-        workspace_daemon = _DockerWorkspace(workspace, container=container, guest_path="/app")
+        workspace_daemon = _DockerWorkspace(workspace, container=container, guest_path=workdir)
         verifier_timeout = _verifier_timeout(task_dir)
 
         @env.initialize
@@ -229,12 +219,14 @@ async def _down() -> None:
         @env.template(id=task.id, description=f"Harbor task {task.id}")
         async def _run_harbor_task() -> AsyncGenerator[Any, Any]:
             answer = yield (task_dir / "instruction.md").read_text(encoding="utf-8")
-            yield await self._grade(container, logs, answer, verifier_timeout=verifier_timeout)
+            yield await self._grade(
+                container, workdir, logs, answer, verifier_timeout=verifier_timeout
+            )
 
         return env
 
     async def _grade(
-        self, container: str, logs: Path, answer: Any, *, verifier_timeout: float
+        self, container: str, workdir: str, logs: Path, answer: Any, *, verifier_timeout: float
     ) -> dict[str, Any]:
         answer_file = logs / "agent_answer.txt"
         answer_file.parent.mkdir(parents=True, exist_ok=True)
@@ -243,7 +235,7 @@ async def _grade(
             "docker",
             "exec",
             "--workdir",
-            "/app",
+            workdir,
             container,
             "bash",
             "/tests/test.sh",
@@ -340,6 +332,48 @@ def _verifier_timeout(task_dir: Path) -> float:
     return _DEFAULT_VERIFIER_TIMEOUT
 
 
+async def _image_workdir(image: str) -> str:
+    """Return the ``WORKDIR`` recorded in ``image``'s config, falling back to ``/app``."""
+    from hud.eval.runtime import _docker
+
+    stdout, _ = await _docker("image", "inspect", "--format", "{{.Config.WorkingDir}}", image)
+    return stdout.strip() or "/app"
+
+
+async def _materialize_workspace(image: str, workspace: Path, workdir: str) -> None:
+    """Copy the built image's ``workdir`` onto the host workspace, then own it.
+
+    The ``workdir`` bind mount would otherwise shadow everything the Docker build
+    generated there (start scripts, installed dependencies, compiled output,
+    seeded databases). Copying the image's actual ``workdir`` out first makes the
+    mounted workspace a faithful, editable copy of what the image runs. Files
+    arrive owned by the container's build user; hand them to the host user so the
+    agent can edit them over SFTP and teardown can remove them.
+
+    The ownership pass is best-effort: it runs with ``check=False`` and is skipped
+    on hosts without ``os.getuid`` (e.g. Windows).
+    """
+    from hud.eval.runtime import _docker
+
+    out, _ = await _docker("create", image, "true")
+    container = out.strip()
+    try:
+        await _docker("cp", f"{container}:{workdir}/.", str(workspace))
+    finally:
+        await _docker("rm", "--force", "--volumes", container, check=False)
+
+    if hasattr(os, "getuid"):
+        # Run ``chown`` as the entrypoint rather than as the command: arguments after
+        # the image are handed to the image's own ENTRYPOINT, which need not exec them
+        # (an entrypoint that boots the app server would block this call forever).
+        # ``--user 0:0`` keeps the chown working when the image sets a non-root
+        # ``USER``, since only root may give files to another owner.
+        mount = f"{workspace}:{workdir}"
+        owner = f"{os.getuid()}:{os.getgid()}"
+        flags = ("--rm", "--user", "0:0", "--entrypoint", "chown", "--volume", mount)
+        await _docker("run", *flags, image, "-R", owner, workdir, check=False)
+
+
 def _read_harbor_reward(verifier_logs: Path) -> tuple[float | None, dict[str, Any]]:
     reward_json = verifier_logs / "reward.json"
     if reward_json.is_file():
@@ -364,7 +398,7 @@ def _read_harbor_reward(verifier_logs: Path) -> tuple[float | None, dict[str, An
     return None, {}
 
 
-async def _release_mount_permissions(container: str) -> None:
+async def _release_mount_permissions(container: str, workdir: str) -> None:
     """Let the host user delete files that container-root created in mounts."""
     from hud.eval.runtime import _docker
 
@@ -373,7 +407,7 @@ async def _release_mount_permissions(container: str) -> None:
         container,
         "sh",
         "-lc",
-        "chmod -R a+rwX /app /logs 2>/dev/null || true",
+        f"chmod -R a+rwX {workdir} /logs 2>/dev/null || true",
         check=False,
     )
 
@@ -386,299 +420,25 @@ def _compose_file(env_dir: Path) -> Path | None:
     return None
 
 
-def _compose_overlay(
-    *,
-    workspace: Path,
-    tests_dir: Path,
-    logs: Path,
-    preserved_paths: list[str] | None = None,
-) -> str:
-    """Compose override that keeps Harbor's main service idle for agent work."""
-    preserved_paths = preserved_paths or []
-    volume_lines = [
-        f"      - {json.dumps(f'{workspace}:/app')}",
-        f"      - {json.dumps(f'{tests_dir}:/tests:ro')}",
-        f"      - {json.dumps(f'{logs}:/logs')}",
-    ]
-    volume_lines.extend(f"      - {json.dumps(path)}" for path in preserved_paths)
+def _compose_overlay(*, workspace: Path, workdir: str, tests_dir: Path, logs: Path) -> str:
+    """Compose override that keeps Harbor's main service idle for agent work.
+
+    Only ``main`` is touched: it is parked on ``sleep infinity`` with ``workspace``
+    mounted over ``workdir`` and ``tests_dir``/``logs`` bound at ``/tests`` (read-only)
+    and ``/logs``; values go through ``json.dumps`` so they are quoted in the YAML.
+    Every other service is inherited from the task's own compose file unchanged.
+    """
     return "\n".join(
         [
             "services:",
             "  main:",
-            "    build:",
-            f"      context: {json.dumps(str(workspace))}",
-            "    working_dir: /app",
+            f"    working_dir: {json.dumps(workdir)}",
             '    entrypoint: ["sleep"]',
             '    command: ["infinity"]',
             "    volumes:",
-            *volume_lines,
+            f"      - {json.dumps(f'{workspace}:{workdir}')}",
+            f"      - {json.dumps(f'{tests_dir}:/tests:ro')}",
+            f"      - {json.dumps(f'{logs}:/logs')}",
             "",
         ],
     )
-
-
-def _preserved_image_paths(workspace: Path) -> list[str]:
-    """Image-populated subpaths that should survive the editable ``/app`` mount."""
-    paths: list[str] = []
-    if (workspace / "package.json").is_file():
-        paths.append("/app/node_modules")
-        if _node_build_output_is_image_populated(workspace, "dist"):
-            paths.append("/app/dist")
-    if (workspace / "composer.json").is_file():
-        paths.append("/app/vendor")
-    return paths
-
-
-def _node_build_output_is_image_populated(workspace: Path, dirname: str) -> bool:
-    if (workspace / dirname).exists():
-        return False
-    dockerfile = workspace / "Dockerfile"
-    if not dockerfile.is_file():
-        return False
-    dockerfile_text = dockerfile.read_text(encoding="utf-8")
-    entrypoint = workspace / "docker-entrypoint.sh"
-    entrypoint_text = entrypoint.read_text(encoding="utf-8") if entrypoint.is_file() else ""
-    return (
-        "npm run build" in dockerfile_text
-        or f"/app/{dirname}" in dockerfile_text
-        or f" {dirname}/" in entrypoint_text
-        or f" {dirname}" in entrypoint_text
-    )
-
-
-def _ensure_start_script(workspace: Path) -> None:
-    """Preserve build-generated /app/start_app.sh hidden by the workspace mount."""
-    start = workspace / "start_app.sh"
-    entrypoint = workspace / "docker-entrypoint.sh"
-    if not entrypoint.is_file():
-        _restore_dockerfile_script(workspace, entrypoint, "/app/docker-entrypoint.sh")
-    if entrypoint.is_file():
-        entrypoint.chmod(entrypoint.stat().st_mode | 0o111)
-    if start.exists():
-        start.chmod(start.stat().st_mode | 0o111)
-        return
-    text = _script_from_dockerfile(workspace, "/app/start_app.sh")
-    if text is None and entrypoint.is_file():
-        text = "#!/usr/bin/env bash\nset -e\ncd /app\nexec sh /app/docker-entrypoint.sh\n"
-    if text is None:
-        return
-    start.write_text(text, encoding="utf-8", newline="\n")
-    start.chmod(0o755)
-
-
-def _ensure_dockerfile_created_dirs(workspace: Path) -> None:
-    """Recreate simple Dockerfile-created ``/app`` dirs hidden by the bind mount."""
-    for path in _dockerfile_created_app_dirs(workspace):
-        path.mkdir(parents=True, exist_ok=True)
-
-
-async def _restore_image_generated_files(image: str, workspace: Path) -> None:
-    """Copy selected build-generated files from the image into the workspace.
-
-    Some Harbor images initialize file-backed databases during ``docker build``.
-    The editable ``/app`` bind mount hides those generated files, so copy them
-    out of the built image before starting the task container.
-    """
-    container_paths = _dockerfile_declared_generated_app_files(workspace)
-    if not container_paths:
-        return
-
-    from hud.eval.runtime import _docker
-
-    out, _ = await _docker("create", image, "true")
-    container = out.strip()
-    try:
-        for container_path in container_paths:
-            host_path = _host_path_for_app_file(workspace, container_path)
-            if host_path is None or host_path.exists():
-                continue
-            host_path.parent.mkdir(parents=True, exist_ok=True)
-            await _docker("cp", f"{container}:{container_path}", str(host_path), check=False)
-    finally:
-        await _docker("rm", "--force", "--volumes", container, check=False)
-
-
-def _dockerfile_declared_generated_app_files(workspace: Path) -> list[str]:
-    """Find Dockerfile-declared file-backed DB paths under ``/app``."""
-    dockerfile = workspace / "Dockerfile"
-    if not dockerfile.is_file():
-        return []
-
-    paths: list[str] = []
-    for instruction in _dockerfile_logical_lines(dockerfile.read_text(encoding="utf-8")):
-        stripped = instruction.strip()
-        if not stripped.startswith("ENV "):
-            continue
-        for key, value in _env_pairs(stripped.removeprefix("ENV ").strip()):
-            if not _is_generated_db_env_key(key):
-                continue
-            if _is_app_database_path(value):
-                paths.append(value)
-    return list(dict.fromkeys(paths))
-
-
-def _env_pairs(body: str) -> list[tuple[str, str]]:
-    try:
-        tokens = shlex.split(body)
-    except ValueError:
-        return []
-    if not tokens:
-        return []
-
-    pairs: list[tuple[str, str]] = []
-    if all("=" in token for token in tokens):
-        for token in tokens:
-            key, value = token.split("=", 1)
-            pairs.append((key, value))
-        return pairs
-
-    if len(tokens) >= 2:
-        pairs.append((tokens[0], tokens[1]))
-    return pairs
-
-
-def _is_generated_db_env_key(key: str) -> bool:
-    normalized = key.upper()
-    return normalized in {
-        "DB_PATH",
-        "DATABASE_PATH",
-        "SQLITE_PATH",
-        "SQLITE_DB_PATH",
-        "SQLITE_DATABASE_PATH",
-    } or normalized.endswith(("_DB_PATH", "_DATABASE_PATH", "_SQLITE_PATH"))
-
-
-def _is_app_database_path(path: str) -> bool:
-    lowered = path.lower()
-    return lowered.startswith("/app/") and lowered.endswith((".db", ".sqlite", ".sqlite3"))
-
-
-def _host_path_for_app_file(workspace: Path, container_path: str) -> Path | None:
-    if not container_path.startswith("/app/"):
-        return None
-    rel = container_path.removeprefix("/app/")
-    if rel.startswith("../") or "/../" in rel or rel == "..":
-        return None
-    return workspace / rel
-
-
-def _dockerfile_created_app_dirs(workspace: Path) -> list[Path]:
-    dockerfile = workspace / "Dockerfile"
-    if not dockerfile.is_file():
-        return []
-    paths: list[Path] = []
-    for instruction in _dockerfile_logical_lines(dockerfile.read_text(encoding="utf-8")):
-        stripped = instruction.strip()
-        if not stripped.startswith("RUN "):
-            continue
-        command = stripped.removeprefix("RUN ").strip()
-        try:
-            tokens = shlex.split(command)
-        except ValueError:
-            continue
-        index = 0
-        while index < len(tokens):
-            if tokens[index] != "mkdir":
-                index += 1
-                continue
-            index += 1
-            while index < len(tokens):
-                token = tokens[index]
-                if token in {"&&", "||", ";"}:
-                    break
-                if token.startswith("-"):
-                    index += 1
-                    continue
-                host_path = _app_dir_from_mkdir_token(workspace, token)
-                if host_path is not None:
-                    paths.append(host_path)
-                index += 1
-    return paths
-
-
-def _app_dir_from_mkdir_token(workspace: Path, token: str) -> Path | None:
-    if not token or any(char in token for char in "$*?["):
-        return None
-    raw = token.rstrip("/")
-    if raw in {"", "."}:
-        return None
-    if raw.startswith("/app/"):
-        rel = raw.removeprefix("/app/")
-    elif raw == "/app":
-        return workspace
-    elif raw.startswith("/"):
-        return None
-    else:
-        rel = raw
-    if rel.startswith("../") or "/../" in rel or rel == "..":
-        return None
-    return workspace / rel
-
-
-def _restore_dockerfile_script(workspace: Path, host_path: Path, container_path: str) -> None:
-    """Restore a Dockerfile-generated script hidden by a bind mount."""
-    text = _script_from_dockerfile(workspace, container_path)
-    if text is None:
-        return
-    host_path.write_text(text, encoding="utf-8", newline="\n")
-    host_path.chmod(0o755)
-
-
-def _script_from_dockerfile(workspace: Path, container_path: str) -> str | None:
-    """Extract a Dockerfile-generated script from a simple ``RUN printf`` command."""
-    dockerfile = workspace / "Dockerfile"
-    if not dockerfile.is_file():
-        return None
-    for instruction in _dockerfile_logical_lines(dockerfile.read_text(encoding="utf-8")):
-        stripped = instruction.strip()
-        if not stripped.startswith("RUN ") or container_path not in stripped:
-            continue
-        command = stripped.removeprefix("RUN ").strip()
-        try:
-            tokens = shlex.split(command)
-        except ValueError:
-            continue
-        redirect = _redirect_index(tokens, container_path)
-        if redirect is None or redirect < 2 or tokens[0] != "printf":
-            continue
-        text = _script_from_printf_args(tokens[1:redirect])
-        if text is not None:
-            return text
-    return None
-
-
-def _redirect_index(tokens: list[str], target: str) -> int | None:
-    for index, token in enumerate(tokens):
-        if token in {">", ">>"} and index + 1 < len(tokens) and tokens[index + 1] == target:
-            return index
-        if token in {f">{target}", f">>{target}"}:
-            return index
-    return None
-
-
-def _script_from_printf_args(args: list[str]) -> str | None:
-    if not args:
-        return None
-    if args[0] in {"%s\\n", "%s\n"}:
-        if len(args) < 2:
-            return None
-        return "\n".join(args[1:]) + "\n"
-    if len(args) == 1:
-        return args[0].replace("\\r", "\r").replace("\\n", "\n").replace("\\t", "\t")
-    return None
-
-
-def _dockerfile_logical_lines(text: str) -> list[str]:
-    """Join backslash-continued Dockerfile lines for simple instruction parsing."""
-    lines: list[str] = []
-    current = ""
-    for raw_line in text.splitlines():
-        line = raw_line.rstrip()
-        if line.endswith("\\"):
-            current += line[:-1] + " "
-            continue
-        lines.append(current + line)
-        current = ""
-    if current:
-        lines.append(current)
-    return lines
diff --git a/integrations/tests/test_harbor.py b/integrations/tests/test_harbor.py
index 623f8560b..c695bee3c 100644
--- a/integrations/tests/test_harbor.py
+++ b/integrations/tests/test_harbor.py
@@ -4,6 +4,7 @@
 
 import asyncio
 import json
+import os
 import textwrap
 from typing import TYPE_CHECKING
 
@@ -14,11 +15,8 @@
 from integrations.harbor_runtime import (
     _compose_file,
     _compose_overlay,
-    _dockerfile_declared_generated_app_files,
-    _ensure_dockerfile_created_dirs,
-    _ensure_start_script,
-    _host_path_for_app_file,
-    _preserved_image_paths,
+    _image_workdir,
+    _materialize_workspace,
     _read_harbor_reward,
     _verifier_timeout,
 )
@@ -139,9 +137,9 @@ async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
             Task(env="bench", id=single_task.name),
             compose,
             tmp_path / "workspace",
+            "/app",
             single_task / "tests",
             tmp_path / "logs",
-            [],
         ):
             raise AssertionError("compose acquisition should not yield")
 
@@ -172,9 +170,9 @@ async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
             Task(env="bench", id=single_task.name),
             compose,
             tmp_path / "workspace",
+            "/app",
             single_task / "tests",
             tmp_path / "logs",
-            [],
         ):
             raise AssertionError("compose acquisition should not yield")
 
@@ -193,198 +191,66 @@ def test_compose_file_detection_prefers_harbor_names(tmp_path: Path) -> None:
     assert _compose_file(env) == compose
 
 
-def test_compose_overlay_mounts_main_workspace_tests_and_logs(tmp_path: Path) -> None:
+def test_compose_overlay_parks_main_and_mounts_workspace_tests_and_logs(tmp_path: Path) -> None:
     overlay = _compose_overlay(
         workspace=tmp_path / "workspace",
+        workdir="/srv/app",
         tests_dir=tmp_path / "tests",
         logs=tmp_path / "logs",
-        preserved_paths=[],
     )
 
     assert "main:" in overlay
     assert 'entrypoint: ["sleep"]' in overlay
-    assert f"{tmp_path / 'workspace'}:/app" in overlay
+    assert 'working_dir: "/srv/app"' in overlay
+    assert f"{tmp_path / 'workspace'}:/srv/app" in overlay
     assert f"{tmp_path / 'tests'}:/tests:ro" in overlay
     assert f"{tmp_path / 'logs'}:/logs" in overlay
 
 
-def test_compose_overlay_preserves_image_dependency_subpaths(tmp_path: Path) -> None:
-    overlay = _compose_overlay(
-        workspace=tmp_path / "workspace",
-        tests_dir=tmp_path / "tests",
-        logs=tmp_path / "logs",
-        preserved_paths=["/app/node_modules"],
-    )
-
-    assert '      - "/app/node_modules"' in overlay
-
-
-def test_preserved_image_paths_detects_node_and_php_dependency_dirs(tmp_path: Path) -> None:
-    (tmp_path / "package.json").write_text("{}", encoding="utf-8")
-    (tmp_path / "composer.json").write_text("{}", encoding="utf-8")
-
-    assert _preserved_image_paths(tmp_path) == ["/app/node_modules", "/app/vendor"]
-
-
-def test_preserved_image_paths_detects_node_build_output(tmp_path: Path) -> None:
-    (tmp_path / "package.json").write_text("{}", encoding="utf-8")
-    (tmp_path / "Dockerfile").write_text(
-        "FROM node:20-slim\nRUN npm ci\nRUN npm run build\n",
-        encoding="utf-8",
-    )
-
-    assert _preserved_image_paths(tmp_path) == ["/app/node_modules", "/app/dist"]
-
-
-def test_ensure_start_script_recreates_build_generated_entrypoint(tmp_path: Path) -> None:
-    workspace = tmp_path / "workspace"
-    workspace.mkdir()
-    (workspace / "docker-entrypoint.sh").write_text("echo start\n", encoding="utf-8")
-
-    _ensure_start_script(workspace)
-
-    start = workspace / "start_app.sh"
-    assert start.exists()
-    text = start.read_text(encoding="utf-8")
-    assert "exec sh /app/docker-entrypoint.sh" in text
-
-
-def test_ensure_start_script_preserves_dockerfile_generated_command(tmp_path: Path) -> None:
-    workspace = tmp_path / "workspace"
-    workspace.mkdir()
-    (workspace / "docker-entrypoint.sh").write_text('exec "$@"\n', encoding="utf-8")
-    (workspace / "Dockerfile").write_text(
-        "FROM python:3.11-slim\n"
-        "RUN printf '%s\\n' '#!/usr/bin/env bash' 'set -e' 'cd /app' "
-        "'exec /app/docker-entrypoint.sh gunicorn --bind 0.0.0.0:8000 src.main:app' "
-        "> /app/start_app.sh && chmod +x /app/start_app.sh\n",
-        encoding="utf-8",
-    )
-
-    _ensure_start_script(workspace)
-
-    text = (workspace / "start_app.sh").read_text(encoding="utf-8")
-    assert "exec /app/docker-entrypoint.sh gunicorn --bind 0.0.0.0:8000 src.main:app" in text
-    assert (workspace / "docker-entrypoint.sh").stat().st_mode & 0o111
-
-
-def test_ensure_start_script_restores_generated_entrypoint(tmp_path: Path) -> None:
-    workspace = tmp_path / "workspace"
-    workspace.mkdir()
-    (workspace / "Dockerfile").write_text(
-        "FROM python:3.11-slim\n"
-        "RUN printf '#!/bin/sh\\npython -m src.seed --init\\n"
-        "exec uvicorn src.main:app --host 0.0.0.0 --port 8000\\n' "
-        "> /app/docker-entrypoint.sh && chmod +x /app/docker-entrypoint.sh\n"
-        "RUN printf '%s\\n' '#!/usr/bin/env bash' 'set -e' 'cd /app' "
-        "'exec /app/docker-entrypoint.sh' > /app/start_app.sh && chmod +x /app/start_app.sh\n",
-        encoding="utf-8",
-    )
-
-    _ensure_start_script(workspace)
-
-    entrypoint = workspace / "docker-entrypoint.sh"
-    assert entrypoint.exists()
-    assert entrypoint.stat().st_mode & 0o111
-    assert "python -m src.seed --init" in entrypoint.read_text(encoding="utf-8")
-    assert "exec /app/docker-entrypoint.sh" in (workspace / "start_app.sh").read_text(
-        encoding="utf-8",
-    )
-
-
-def test_ensure_dockerfile_created_dirs_restores_app_dirs(tmp_path: Path) -> None:
-    workspace = tmp_path / "workspace"
-    workspace.mkdir()
-    (workspace / "Dockerfile").write_text(
-        "FROM node:20-slim\n"
-        "RUN mkdir -p static/uploads /app/tmp/cache && mkdir -p /var/lib/ignored\n",
-        encoding="utf-8",
-    )
-
-    _ensure_dockerfile_created_dirs(workspace)
-
-    assert (workspace / "static" / "uploads").is_dir()
-    assert (workspace / "tmp" / "cache").is_dir()
-    assert not (workspace / "var" / "lib" / "ignored").exists()
+async def test_image_workdir_reads_config_working_dir(monkeypatch: pytest.MonkeyPatch) -> None:
+    async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
+        assert args == ("image", "inspect", "--format", "{{.Config.WorkingDir}}", "img")
+        return "/srv/app\n", ""
 
+    monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
 
-def test_dockerfile_declared_generated_app_files_detects_seeded_sqlite_db(tmp_path: Path) -> None:
-    workspace = tmp_path / "workspace"
-    workspace.mkdir()
-    (workspace / "Dockerfile").write_text(
-        "FROM python:3.11-slim\n"
-        "ENV DB_PATH=/app/data/salon_workforce.db\n"
-        "RUN python -m src.seed --init\n",
-        encoding="utf-8",
-    )
+    assert await _image_workdir("img") == "/srv/app"
 
-    assert _dockerfile_declared_generated_app_files(workspace) == [
-        "/app/data/salon_workforce.db",
-    ]
-    assert _host_path_for_app_file(workspace, "/app/data/salon_workforce.db") == (
-        workspace / "data" / "salon_workforce.db"
-    )
 
+async def test_image_workdir_defaults_to_app_when_unset(monkeypatch: pytest.MonkeyPatch) -> None:
+    async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
+        return "\n", ""
 
-def test_dockerfile_declared_generated_app_files_ignores_non_app_or_non_db_paths(
-    tmp_path: Path,
-) -> None:
-    workspace = tmp_path / "workspace"
-    workspace.mkdir()
-    (workspace / "Dockerfile").write_text(
-        "FROM python:3.11-slim\n"
-        "ENV DB_PATH=/var/lib/app.db CACHE_PATH=/app/cache\n"
-        "ENV SOME_DATABASE_PATH=/app/data/app.txt\n",
-        encoding="utf-8",
-    )
+    monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
 
-    assert _dockerfile_declared_generated_app_files(workspace) == []
-    assert _host_path_for_app_file(workspace, "/tmp/app.db") is None
+    assert await _image_workdir("img") == "/app"
 
 
-async def test_compose_container_restores_image_generated_db_files(
-    single_task: Path,
+async def test_materialize_workspace_copies_image_workdir_and_owns_it(
     tmp_path: Path,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    compose = single_task / "environment" / "docker-compose.yaml"
-    compose.write_text("services:\n  main:\n    build: .\n", encoding="utf-8")
     workspace = tmp_path / "workspace"
     workspace.mkdir()
-    (workspace / "Dockerfile").write_text(
-        "FROM python:3.11-slim\nENV DB_PATH=/app/data/app.db\n", encoding="utf-8"
-    )
     calls: list[tuple[str, ...]] = []
 
     async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
         calls.append(args)
-        if args[0] == "compose" and args[-3:] == ("ps", "-q", "main"):
-            return "maincontainer\n", ""
-        if args[0] == "inspect":
-            return "sha256:mainimage\n", ""
         if args[0] == "create":
-            return "tempcontainer\n", ""
+            return "tempcid\n", ""
         return "", ""
 
     monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
-    runtime = HarborRuntime(single_task.parent)
 
-    async with runtime._compose_container(
-        Task(env="bench", id=single_task.name),
-        compose,
-        workspace,
-        single_task / "tests",
-        tmp_path / "logs",
-        [],
-    ):
-        pass
+    await _materialize_workspace("img", workspace, "/app")
 
-    assert ("inspect", "--format", "{{.Image}}", "maincontainer") in calls
-    assert (
-        "cp",
-        "tempcontainer:/app/data/app.db",
-        str(workspace / "data" / "app.db"),
-    ) in calls
+    # Contents of the image's workdir are copied out into the host workspace.
+    assert ("cp", "tempcid:/app/.", str(workspace)) in calls
+    # The throwaway container is removed.
+    assert any(a[0] == "rm" for a in calls)
+    # On POSIX hosts, ownership is handed to the host user via a chown pass.
+    if hasattr(os, "getuid"):
+        assert any(a[0] == "run" and "chown" in a and a[-1] == "/app" for a in calls)
 
 
 def test_read_harbor_reward_prefers_reward_and_score_keys(tmp_path: Path) -> None:
@@ -448,7 +314,7 @@ async def fake_exec(*args: str, **kwargs: object) -> FakeProc:
     monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_exec)
     runtime = HarborRuntime(single_task.parent)
 
-    result = await runtime._grade("container", logs, "done", verifier_timeout=120.0)
+    result = await runtime._grade("container", "/app", logs, "done", verifier_timeout=120.0)
 
     assert result["score"] == 1.0
     assert result["info"]["stdout"] == "verifier out"
@@ -481,7 +347,9 @@ async def fake_exec(*args: str, **kwargs: object) -> FakeProc:
     monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_exec)
     runtime = HarborRuntime(single_task.parent)
 
-    result = await runtime._grade("container", tmp_path / "logs", None, verifier_timeout=0.05)
+    result = await runtime._grade(
+        "container", "/app", tmp_path / "logs", None, verifier_timeout=0.05
+    )
 
     assert result["isError"] is True
     assert "timed out" in result["content"]