diff --git a/docs/v6/advanced/harbor-convert.mdx b/docs/v6/advanced/harbor-convert.mdx
index 5ddb9f4f..21dbd721 100644
--- a/docs/v6/advanced/harbor-convert.mdx
+++ b/docs/v6/advanced/harbor-convert.mdx
@@ -9,8 +9,9 @@ task dirs - is a *frontend* that loads into the same primitives (`Environment`,
 `Task`, `Taskset`). Integrations are **loaders, not converters**: no codegen
 roundtrip to run foreign tasks. The Harbor integration lives in the SDK repo at
 [`integrations/harbor.py`](https://github.com/hud-evals/hud-python/blob/main/integrations/harbor.py)
-- a recipe built only on the public SDK surface; copy it into your project or
-run it from a checkout.
+\- a public-surface loader that maps Harbor folders into SDK primitives. The
+included `HarborRuntime` is maintained with the SDK for local Docker execution;
+copy the loader into your project or run it from a checkout.
 
 ## Prerequisites
 
@@ -26,22 +27,29 @@ directly - one row per task dir (`id` = the dir name), sharing one declarative
 ```python
 from integrations.harbor import detect, load
 
-assert detect("./terminal-bench")
-taskset = load("./terminal-bench")
+assert detect("./harbor_tasks")
+taskset = load("./harbor_tasks")
 
 for task in taskset:
     print(task.env, task.id)
 ```
 
-Like every task row, the result carries no placement. Run it by supplying one -
-today that means a substrate already serving the control channel
-(`runtime=Runtime(url)`); a docker provider that builds and runs each task's
-`environment/` image is the planned follow-up:
+Like every task row, the result carries no placement. Run it by supplying one.
+For local Docker-backed Harbor execution, use `HarborRuntime`; it builds the
+task's `environment/` image, runs a fresh container, exposes the workspace
+through HUD's normal shell capability, and grades by running `tests/test.sh`:
 
 ```python
-from hud import Runtime
+from integrations.harbor import HarborRuntime
 
-job = await taskset.run(agent, runtime=Runtime("tcp://127.0.0.1:8765"))
+job = await taskset.run(agent, runtime=HarborRuntime("./harbor_tasks"))
+```
+
+The eval CLI can run local Harbor task directories and datasets when you opt
+into the Harbor source format:
+
+```bash
+hud eval ./harbor_tasks claude --format harbor --task-ids cancel-async-tasks --max-steps 30
 ```
 
 ## Export HUD tasks to Harbor
diff --git a/docs/v6/reference/cli.mdx b/docs/v6/reference/cli.mdx
index 6a5f51bb..8b18431b 100644
--- a/docs/v6/reference/cli.mdx
+++ b/docs/v6/reference/cli.mdx
@@ -105,6 +105,7 @@ For a platform taskset, pass its name or id directly: `hud eval "My Tasks" claud
 | `--config`, `-c` | Agent config `key=value` (repeatable). |
 | `--verbose`, `-v` | Show agent logs (step progress, tool calls) for batch runs too. |
 | `--very-verbose`, `-vv` | Debug-level logs. |
+| `--format` | Task source format: `hud` (default) or `harbor`. |
 | `--runtime` | Placement: `local`, `hud` (HUD runtime tunnel), or `tcp://host:port`. Defaults to `local` for a tasks file; platform tasksets default to remote hosted execution. |
 | `--remote` | Run the whole rollout remotely on the HUD platform. |
 | `--yes`, `-y` | Skip confirmation prompt. |
@@ -133,7 +134,9 @@ hud sync env                   # sync environment metadata
 ```
 
 External benchmark formats (currently Harbor) load directly into the runtime
-as `Taskset`s - no conversion step. See [Harbor interop](/v6/advanced/harbor-convert).
+as `Taskset`s - no conversion step. For local Harbor directories, opt in with
+`--format harbor` so the CLI uses the Harbor loader and Docker-backed runtime
+provider. See [Harbor interop](/v6/advanced/harbor-convert).
 
 ## Inspect
 
diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 39afd6ed..01bf0883 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -63,6 +63,7 @@ def _resolve_model_from_catalog(model_id: str) -> tuple[AgentType, str] | None:
 
 _CONFIG_PATH = ".hud_eval.toml"
 _PLACEMENT_CONFLICT_ERROR = "--runtime and --remote are mutually exclusive placement options"
+_SOURCE_FORMATS = ("hud", "harbor")
 
 
 def _resolve_env_vars(obj: Any) -> Any:
@@ -167,6 +168,7 @@ class AgentPreset:
 # very_verbose = true
 # auto_respond = true
 # gateway = false  # Route LLM API calls through HUD Gateway
+# format = "hud"  # hud or harbor
 # runtime = "local"  # local, hud, or tcp://host:port
 # remote = false  # Run the whole rollout remotely on HUD
 
@@ -264,6 +266,7 @@ class EvalConfig(BaseModel):
         "group_size",
         "auto_respond",
         "gateway",
+        "format",
         "runtime",
         "remote",
     }
@@ -279,6 +282,9 @@ class EvalConfig(BaseModel):
     auto_respond: bool | None = None
     group_size: int = 1
     gateway: bool = False
+    #: Source format. ``None``/``hud`` means normal HUD task source loading;
+    #: ``harbor`` opts into the Harbor integration loader/runtime.
+    format: str | None = None
     #: Placement: "local" (spawn each row's env from the source), "hud"
     #: (HUD runtime tunnel), or a tcp:// url of an already-served env.
     #: ``None`` means "infer from the source": a local file runs locally, a
@@ -306,6 +312,20 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None:
                 ) from None
         return v
 
+    @field_validator("format", mode="before")
+    @classmethod
+    def _parse_format(cls, v: Any) -> str | None:
+        """Normalize ``format``: blank or ``hud`` becomes ``None``; unknown names raise."""
+        if not isinstance(v, str):
+            # ``None`` and other non-strings pass through untouched.
+            return v
+        name = v.strip().lower()
+        if name in ("", "hud"):
+            return None
+        if name not in _SOURCE_FORMATS:
+            raise ValueError(f"Invalid format: {v}. Must be one of: {', '.join(_SOURCE_FORMATS)}")
+        return name
+
     def source_is_local_file(self) -> bool:
         """Whether ``source`` points at an on-disk taskset (vs. a platform slug/id)."""
         return self.source is not None and Path(self.source).exists()
@@ -319,6 +339,13 @@ def resolve_runtime(self) -> EvalConfig:
         ``--runtime`` is always honored, except ``local`` against a platform
         taskset, which has no env to spawn.
         """
+        if self.format == "harbor":
+            if not self.source_is_local_file():
+                hud_console.error("--format harbor requires a local Harbor task directory")
+                raise typer.Exit(1)
+            if self.remote or (self.runtime is not None and self.runtime != "local"):
+                hud_console.error("--format harbor currently supports only local runtime placement")
+                raise typer.Exit(1)
         if self.runtime is None:
             if self.source_is_local_file():
                 return self.model_copy(update={"runtime": "local"})
@@ -502,6 +529,7 @@ def merge_cli(
         gateway: bool = False,
         config: list[str] | None = None,
         task_ids: str | None = None,
+        format: str | None = None,
         runtime: str | None = None,
         remote: bool = False,
     ) -> EvalConfig:
@@ -517,6 +545,7 @@ def merge_cli(
                 "max_concurrent": max_concurrent,
                 "max_steps": max_steps,
                 "group_size": group_size,
+                "format": format,
                 "runtime": runtime,
             }.items()
             if value is not None
@@ -604,6 +633,8 @@ def display(self) -> None:
         table.add_column("Value", style="green")
 
         table.add_row("source", str(self.source or "-"))
+        if self.format:
+            table.add_row("format", self.format)
         table.add_row("runtime", str(self.runtime or "-"))
         table.add_row("agent", self.agent_type.value if self.agent_type else "-")
         if self.task_ids:
@@ -728,6 +759,28 @@ def _spawn_target(source: Path) -> Path:
     return resolved.parent
 
 
+def _load_local_taskset(source_path: Path, source_format: str | None) -> Any:
+    """Load a local source as ``hud`` (default) or ``harbor``; else ``ValueError``."""
+    from hud.eval import Taskset
+
+    format_name = source_format or "hud"
+    if format_name == "harbor":
+        from integrations.harbor import load
+        return load(source_path)
+    if format_name != "hud":
+        raise ValueError(f"unsupported task source format: {format_name}")
+    taskset = Taskset.from_file(source_path)
+    if len(taskset) == 0:
+        try:  # best-effort hint: ``integrations`` lives outside the ``hud`` package
+            from integrations.harbor import detect
+        except ImportError:
+            return taskset
+        if detect(source_path):
+            hint = f"{source_path} looks like a Harbor task directory; "
+            hud_console.hint(hint + "rerun with --format harbor to load it.")
+    return taskset
+
+
 def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any:
     """Map the config's ``runtime`` onto a placement for ``Taskset.run``.
 
@@ -744,6 +797,10 @@ def _resolve_placement(cfg: EvalConfig, source_path: Path | None) -> Any:
     if cfg.runtime == "local":
         if source_path is None:
             raise ValueError("local placement requires a local source path")
+        if cfg.format == "harbor":
+            from integrations.harbor import HarborRuntime
+
+            return HarborRuntime(source_path)
         return LocalRuntime(_spawn_target(source_path))
     if cfg.runtime == "hud":
         require_api_key("run HUD runtime tunnel evals")
@@ -767,18 +824,18 @@ async def _run_evaluation(cfg: EvalConfig) -> Any:
     if cfg.source is None or cfg.agent_type is None:
         raise ValueError("source and agent_type must be set")
 
-    from hud.eval import Taskset
-
     source_path = Path(cfg.source)
     is_local = source_path.exists()
     if is_local:
         hud_console.info(f"Loading tasks from: {cfg.source}")
         try:
-            taskset = Taskset.from_file(source_path)
+            taskset = _load_local_taskset(source_path, cfg.format)
         except Exception as e:
             hud_console.error(f"Failed to load tasks from {cfg.source}: {e}")
             raise typer.Exit(1) from e
     else:
+        from hud.eval import Taskset
+
         hud_console.info(f"Loading platform taskset: {cfg.source}")
         try:
             taskset = Taskset.from_api(cfg.source)
@@ -888,6 +945,11 @@ def eval_command(
     gateway: bool = typer.Option(
         False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
     ),
+    format: str | None = typer.Option(
+        None,
+        "--format",
+        help="Task source format: hud (default) or harbor.",
+    ),
     runtime: str | None = typer.Option(
         None,
         "--runtime",
@@ -908,6 +970,7 @@ def eval_command(
         hud eval "My Tasks" claude-sonnet-4-6 --full   # Platform taskset, run on the platform
         hud eval tasks.json claude --config max_tokens=32768
         hud eval tasks.json claude --gateway           # Route LLM calls through HUD Gateway
+        hud eval ./harbor_tasks claude --format harbor # Run Harbor task dirs locally
         hud eval tasks.json claude-sonnet-4-6 --runtime hud  # Use HUD runtime tunnel
         hud eval tasks.json claude-sonnet-4-6 --remote       # Execute rollout remotely
     """
@@ -938,6 +1001,7 @@ def eval_command(
             group_size=group_size,
             config=config,
             gateway=gateway,
+            format=format,
             runtime=runtime,
             remote=remote,
         )
diff --git a/hud/cli/tests/test_eval_config.py b/hud/cli/tests/test_eval_config.py
index 6b94f0b2..bbd0d4f9 100644
--- a/hud/cli/tests/test_eval_config.py
+++ b/hud/cli/tests/test_eval_config.py
@@ -20,6 +20,23 @@
 _ARN = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/anthropic.claude"
 
 
+def _write_harbor_task(root: Path, name: str = "demo-task") -> Path:  # Harbor task fixture
+    task = root / name
+    (task / "environment").mkdir(parents=True)  # also creates the task dir itself
+    (task / "tests").mkdir()
+    (task / "instruction.md").write_text("Fix the demo task.\n", encoding="utf-8")
+    (task / "task.toml").write_text(
+        'schema_version = "1.3"\n\n[task]\nname = "demo/demo-task"\n',
+        encoding="utf-8",
+    )
+    (task / "environment" / "Dockerfile").write_text("FROM python:3.12-slim\n", encoding="utf-8")
+    (task / "tests" / "test.sh").write_text(  # verifier script that writes reward 1
+        "#!/usr/bin/env bash\nmkdir -p /logs/verifier\necho 1 > /logs/verifier/reward.txt\n",
+        encoding="utf-8",
+    )
+    return task
+
+
 def test_is_bedrock_arn() -> None:
     assert _is_bedrock_arn(_ARN) is True
     assert _is_bedrock_arn("claude-sonnet-4-6") is False
@@ -136,6 +153,77 @@ def test_resolve_placement_runtime_hud_uses_tunnel(
     assert isinstance(placement, HUDRuntime)
 
 
+def test_load_local_taskset_uses_hud_loader_by_default(tmp_path: Path) -> None:
+    _write_harbor_task(tmp_path)
+    # No format given: the default HUD loader runs and yields no tasks for this dir.
+    taskset = eval_mod._load_local_taskset(tmp_path, None)
+
+    assert len(taskset) == 0
+
+
+def test_load_local_taskset_hints_harbor_format_on_zero_task_harbor_dir(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    _write_harbor_task(tmp_path)
+    hints: list[str] = []
+    monkeypatch.setattr(eval_mod.hud_console, "hint", lambda message, **_: hints.append(message))
+    # Console hints are captured; loading a Harbor dir as ``hud`` should suggest the flag.
+    taskset = eval_mod._load_local_taskset(tmp_path, None)
+
+    assert len(taskset) == 0
+    assert any("--format harbor" in hint for hint in hints)
+
+
+def test_load_local_taskset_rejects_unknown_format(tmp_path: Path) -> None:
+    with pytest.raises(ValueError, match="unsupported task source format"):
+        eval_mod._load_local_taskset(tmp_path, "unknown")  # neither hud nor harbor
+
+
+def test_load_local_taskset_uses_harbor_loader_when_format_is_harbor(tmp_path: Path) -> None:
+    _write_harbor_task(tmp_path)
+    # The fixture wrote one task dir named "demo-task"; it should load as one row.
+    taskset = eval_mod._load_local_taskset(tmp_path, "harbor")
+
+    assert len(taskset) == 1
+    assert taskset["demo-task"].id == "demo-task"
+
+
+def test_resolve_placement_local_harbor_format_uses_harbor_runtime(tmp_path: Path) -> None:
+    from integrations.harbor import HarborRuntime
+
+    _write_harbor_task(tmp_path)
+    # HarborRuntime rejects a path with no task dirs, hence the fixture above.
+    placement = eval_mod._resolve_placement(
+        EvalConfig(runtime="local", format="harbor"),
+        tmp_path,
+    )
+
+    assert isinstance(placement, HarborRuntime)
+
+
+def test_resolve_placement_local_hud_format_uses_local_runtime(tmp_path: Path) -> None:
+    from hud.eval import LocalRuntime
+
+    _write_harbor_task(tmp_path)
+    # Without ``format="harbor"`` local placement must stay the plain LocalRuntime.
+    placement = eval_mod._resolve_placement(EvalConfig(runtime="local"), tmp_path)
+
+    assert isinstance(placement, LocalRuntime)
+
+
+def test_harbor_format_rejects_nonlocal_source() -> None:
+    with pytest.raises(typer.Exit):  # assumes "platform/taskset" does not exist on disk
+        EvalConfig(source="platform/taskset", format="harbor").resolve_runtime()
+
+
+def test_harbor_format_rejects_nonlocal_runtime(tmp_path: Path) -> None:
+    _write_harbor_task(tmp_path)
+    # The source is local, so only the non-local ``runtime="hud"`` can trigger the exit.
+    with pytest.raises(typer.Exit):
+        EvalConfig(source=str(tmp_path), format="harbor", runtime="hud").resolve_runtime()
+
+
 def test_resolve_placement_remote_uses_hosted_runtime(
     tmp_path: Path,
     monkeypatch: pytest.MonkeyPatch,
diff --git a/integrations/__init__.py b/integrations/__init__.py
index c8549e0f..baa460f4 100644
--- a/integrations/__init__.py
+++ b/integrations/__init__.py
@@ -5,11 +5,12 @@
 primitives. Integrations are **loaders, not converters**: no codegen roundtrip
 to run foreign tasks.
 
-This package lives outside ``hud`` on purpose: each module is a recipe built
-**only on the public SDK surface** (``Environment``, ``Task``,
-``Taskset``, ``Runtime``) — that constraint is the proof the core is
-flexible. Copy a module into your project or run it from a checkout; nothing
-in the SDK or CLI imports it.
+This package lives outside ``hud`` on purpose: loaders are recipes built on the
+public SDK surface (``Environment``, ``Task``, ``Taskset``, ``Runtime``). Copy a
+loader into your project or run it from a checkout. The CLI may import selected
+integrations directly (for example ``hud eval --format harbor``). A
+repo-maintained integration may also expose a local provider for that CLI path;
+that provider is SDK implementation code, not the portable loader contract.
 
 The contract: an integration module exposes ``detect(path) -> bool`` and
 ``load(path) -> Taskset``. Placement stays an execution-time concern — loaders
diff --git a/integrations/harbor.py b/integrations/harbor.py
index 497711e3..90342625 100644
--- a/integrations/harbor.py
+++ b/integrations/harbor.py
@@ -11,11 +11,9 @@
 
 :func:`load` parses a task dir (or a dataset of them) into rows sharing one
 env name per distinct ``environment/`` build context — no codegen, no
-roundtrip. Like every row, the result is runnable
-once a placement is supplied (``runtime=Runtime(url)`` against a served substrate
-today). Providers receive the row being placed, so a docker provider that
-builds and runs each row's ``environment/`` image is the named follow-up —
-expressible without engine changes.
+roundtrip. Like every row, the result is runnable once a placement is supplied.
+Use :class:`HarborRuntime` for local Docker-backed execution of Harbor tasks, or
+``runtime=Runtime(url)`` to attach to a substrate served elsewhere.
 
 :func:`export` is the reverse direction: turn a HUD task source into
 self-contained Harbor task folders (``task.toml`` + ``instruction.md`` +
@@ -40,19 +38,23 @@
 
 from __future__ import annotations
 
-import hashlib
 import json
 import logging
-import re
 import shutil
-import tomllib
-from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 from hud.environment import Environment
 from hud.environment.server import TaskRunner
 from hud.eval import Task, Taskset
+from integrations.harbor_common import (
+    _HarborTask,
+    _is_harbor_task,
+    _parse_task,
+    _slugify,
+    _task_dirs,
+)
+from integrations.harbor_runtime import HarborRuntime
 
 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -74,18 +76,12 @@
     "__pycache__", "*.pyc", ".git", ".venv", "venv", "*.egg-info", ".pytest_cache"
 )
 
-
 # ─── load: Harbor dirs -> Taskset ──────────────────────────────────────
 
 
 def detect(path: str | Path) -> bool:
     """True when *path* is a Harbor task dir or a dataset of them."""
-    root = Path(path)
-    if _is_harbor_task(root):
-        return True
-    if root.is_dir():
-        return any(_is_harbor_task(d) for d in root.iterdir() if d.is_dir())
-    return False
+    return bool(_task_dirs(path))
 
 
 def load(path: str | Path) -> Taskset:
@@ -96,12 +92,8 @@ def load(path: str | Path) -> Taskset:
     context (content-hashed), derived from the dataset name.
     """
     root = Path(path).resolve()
-    if _is_harbor_task(root):
-        task_dirs = [root]
-        dataset_name = root.parent.name
-    else:
-        task_dirs = sorted(d for d in root.iterdir() if d.is_dir() and _is_harbor_task(d))
-        dataset_name = root.name
+    task_dirs = _task_dirs(root)
+    dataset_name = root.parent.name if _is_harbor_task(root) else root.name
     if not task_dirs:
         raise ValueError(f"no Harbor tasks found in {path}")
 
@@ -126,54 +118,6 @@ def load(path: str | Path) -> Taskset:
     return Taskset(base_name, tasks)
 
 
-def _slugify(name: str) -> str:
-    """A valid env name (lowercase ``[a-z0-9-]``) from a dataset dir name."""
-    normalized = re.sub(r"[^a-z0-9-]", "", name.strip().lower().replace(" ", "-").replace("_", "-"))
-    return re.sub(r"-+", "-", normalized).strip("-") or "harbor"
-
-
-def _is_harbor_task(path: Path) -> bool:
-    return path.is_dir() and (path / "task.toml").exists() and (path / "instruction.md").exists()
-
-
-def _hash_directory(path: Path) -> str:
-    """Content-hash a directory for grouping tasks by identical environments."""
-    hasher = hashlib.sha256()
-    if not path.exists():
-        return "empty"
-    for file_path in sorted(path.rglob("*")):
-        if file_path.is_file():
-            hasher.update(str(file_path.relative_to(path)).encode())
-            hasher.update(file_path.read_bytes())
-    return hasher.hexdigest()[:16]
-
-
-@dataclass(frozen=True, slots=True)
-class _HarborTask:
-    """One parsed Harbor task dir."""
-
-    task_id: str
-    config: dict[str, Any]
-    env_hash: str
-
-
-def _parse_task(task_dir: Path) -> _HarborTask | None:
-    if not (task_dir / "instruction.md").is_file():
-        LOGGER.warning("failed to read instruction.md in %s", task_dir)
-        return None
-    try:
-        config: dict[str, Any] = tomllib.loads((task_dir / "task.toml").read_text("utf-8"))
-    except (OSError, tomllib.TOMLDecodeError):
-        LOGGER.warning("failed to parse task.toml in %s", task_dir)
-        config = {}
-    env_dir = task_dir / "environment"
-    return _HarborTask(
-        task_id=task_dir.name,
-        config=config,
-        env_hash=_hash_directory(env_dir) if env_dir.exists() else "no-env",
-    )
-
-
 # ─── export: HUD tasks -> Harbor task folders ───────────────────────────
 
 
@@ -443,6 +387,7 @@ async def export(
     "ALLOWED_PROTOCOLS",
     "CONTROL_PORT",
     "DEFAULT_ANSWER_FILE",
+    "HarborRuntime",
     "detect",
     "export",
     "load",
diff --git a/integrations/harbor_common.py b/integrations/harbor_common.py
new file mode 100644
index 00000000..53294e09
--- /dev/null
+++ b/integrations/harbor_common.py
@@ -0,0 +1,70 @@
+"""Shared helpers for Harbor task integration."""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import re
+import tomllib
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+LOGGER = logging.getLogger(__name__)
+
+
+def _slugify(name: str) -> str:
+    """Reduce *name* to a lowercase ``[a-z0-9-]`` env name, or ``harbor`` if nothing is left."""
+    dashed = name.strip().lower().replace(" ", "-").replace("_", "-")
+    return re.sub(r"-+", "-", re.sub(r"[^a-z0-9-]", "", dashed)).strip("-") or "harbor"
+
+
+def _is_harbor_task(path: Path) -> bool:  # a dir holding task.toml and instruction.md
+    return path.is_dir() and (path / "task.toml").exists() and (path / "instruction.md").exists()
+
+
+def _task_dirs(path: str | Path) -> list[Path]:
+    """List Harbor task dirs at *path*: the dir itself, else its sorted task children."""
+    root = Path(path)
+    if _is_harbor_task(root):
+        return [root]
+    children = root.iterdir() if root.is_dir() else ()
+    return sorted(child for child in children if child.is_dir() and _is_harbor_task(child))
+
+
+def _hash_directory(path: Path) -> str:
+    """Content-hash the files under *path* (16 hex chars); ``"empty"`` when it is missing."""
+    if not path.exists():
+        return "empty"
+    digest = hashlib.sha256()
+    for entry in sorted(path.rglob("*")):
+        if entry.is_file():
+            # Each file contributes its relative path followed by its bytes.
+            digest.update(str(entry.relative_to(path)).encode() + entry.read_bytes())
+    return digest.hexdigest()[:16]
+
+
+@dataclass(frozen=True, slots=True)
+class _HarborTask:
+    """One parsed Harbor task dir."""
+
+    task_id: str  # the task directory's name
+    config: dict[str, Any]  # parsed ``task.toml``; empty when it could not be parsed
+    env_hash: str  # content hash of ``environment/``, or "no-env" when it is absent
+
+
+def _parse_task(task_dir: Path) -> _HarborTask | None:
+    """Parse one task dir; ``None`` when ``instruction.md`` is not a file."""
+    if not (task_dir / "instruction.md").is_file():
+        LOGGER.warning("failed to read instruction.md in %s", task_dir)
+        return None
+    # A missing, non-UTF-8 or malformed task.toml is logged and read as an empty config.
+    config: dict[str, Any] = {}
+    try:
+        config = tomllib.loads((task_dir / "task.toml").read_text("utf-8"))
+    except (OSError, UnicodeDecodeError, tomllib.TOMLDecodeError):
+        LOGGER.warning("failed to parse task.toml in %s", task_dir)
+    env_dir = task_dir / "environment"
+    # "no-env" marks a task without an ``environment/`` directory.
+    env_hash = _hash_directory(env_dir) if env_dir.exists() else "no-env"
+    return _HarborTask(task_id=task_dir.name, config=config, env_hash=env_hash)
diff --git a/integrations/harbor_runtime.py b/integrations/harbor_runtime.py
new file mode 100644
index 00000000..50277726
--- /dev/null
+++ b/integrations/harbor_runtime.py
@@ -0,0 +1,444 @@
+"""Local Docker-backed runtime for Harbor task directories."""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import json
+import os
+import tempfile
+import tomllib
+import uuid
+from collections.abc import AsyncGenerator  # noqa: TC003 - env.template resolves this at runtime.
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from hud.environment import Environment
+from hud.environment.workspace import Workspace
+from integrations.harbor_common import _hash_directory, _slugify, _task_dirs
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    import asyncssh
+
+    from hud.eval import Task
+    from hud.eval.runtime import Runtime
+
+
+class HarborRuntime:
+    """Run Harbor task directories through HUD's local rollout engine.
+
+    The provider builds the Harbor task's ``environment/`` Docker context, then
+    materializes the built image's working directory onto a writable host
+    workspace and bind-mounts it back over the same guest path. Because the
+    workspace is the image's actual working directory (source *plus* every file
+    the build generated — start scripts, installed dependencies, compiled output,
+    seeded databases — with their original mode bits), the agent sees exactly
+    what the image would run, and edits made over SFTP are visible to the running
+    process. If the task ships a ``docker-compose.yaml``/``.yml``, the provider
+    starts it with an overlay that keeps the ``main`` service idle while
+    preserving sidecars such as databases. Shell commands execute inside the main
+    container via ``docker exec``. Grading runs the Harbor ``tests/test.sh``
+    inside the same main container, bounded by the task's ``[verifier]
+    timeout_sec``, and reads ``/logs/verifier/reward.json`` or ``reward.txt``.
+    """
+
+    def __init__(self, path: str | Path, *, ready_timeout: float = 120.0) -> None:
+        """Index the Harbor task dirs found at *path* by directory name.
+
+        ``ready_timeout`` is stored and later passed along in the runtime params;
+        ``ValueError`` is raised when *path* holds no Harbor tasks.
+        """
+        self.root = Path(path).resolve()
+        self.ready_timeout = ready_timeout
+        self._task_dirs = {found.name: found for found in _task_dirs(self.root)}
+        if not self._task_dirs:
+            raise ValueError(f"no Harbor tasks found in {path}")
+
+    @contextlib.asynccontextmanager
+    async def __call__(self, task: Task) -> AsyncIterator[Runtime]:  # place one task
+        from hud.eval.runtime import Runtime, _docker, _local
+
+        task_dir = self._task_dirs.get(task.id)
+        if task_dir is None:
+            raise KeyError(f"HarborRuntime has no task directory for {task.id!r}")
+        env_dir = task_dir / "environment"
+        tests_dir = task_dir / "tests"
+        if not (env_dir / "Dockerfile").is_file():
+            raise FileNotFoundError(f"Harbor task {task.id!r} has no environment/Dockerfile")
+        if not (tests_dir / "test.sh").is_file():
+            raise FileNotFoundError(f"Harbor task {task.id!r} has no tests/test.sh")
+        # Host-side workspace and logs live in a temp dir that is removed on exit.
+        with tempfile.TemporaryDirectory(prefix=f"hud-harbor-{_slugify(task.id)}-") as tmp:
+            tmp_path = Path(tmp)
+            workspace = tmp_path / "workspace"
+            logs = tmp_path / "logs"
+            workspace.mkdir()
+            logs.mkdir(parents=True, exist_ok=True)
+            # Build the image, then seed the host workspace from its working directory.
+            image = await self._build_image(env_dir)
+            workdir = await _image_workdir(image)
+            await _materialize_workspace(image, workspace, workdir)
+            # NOTE(review): if either call above raises, the built image is never removed.
+            compose_file = _compose_file(env_dir)
+            if compose_file is not None:
+                await _docker("image", "rm", image, check=False)  # compose rebuilds (--build)
+                acquire = self._compose_container(
+                    task, compose_file, workspace, workdir, tests_dir, logs
+                )
+            else:
+                acquire = self._single_container(task, image, workspace, workdir, tests_dir, logs)
+            async with acquire as (container, provider):
+                env = self._environment_for(task, task_dir, workspace, workdir, logs, container)
+                async with _local(env) as runtime:
+                    yield Runtime(
+                        runtime.url,
+                        params={
+                            **runtime.params,
+                            "provider": provider,
+                            "container": container,
+                            "ready_timeout": self.ready_timeout,
+                        },
+                        config=runtime.config,
+                    )
+
+    @contextlib.asynccontextmanager
+    async def _single_container(
+        self,
+        task: Task,
+        image: str,
+        workspace: Path,
+        workdir: str,
+        tests_dir: Path,
+        logs: Path,
+    ) -> AsyncIterator[tuple[str, str]]:
+        """Run *image* as an idle container; always remove container and image on exit."""
+        from hud.eval.runtime import _docker
+        # The try starts before ``docker run`` so a failed start still removes the image.
+        container_name = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}"
+        try:
+            out, _ = await _docker(
+                "run",
+                "--detach",
+                "--name",
+                container_name,
+                "--workdir",
+                workdir,
+                "--entrypoint",
+                "sleep",
+                "--volume",
+                f"{workspace}:{workdir}",
+                "--volume",
+                f"{tests_dir}:/tests:ro",
+                "--volume",
+                f"{logs}:/logs",
+                image,
+                "infinity",
+            )
+            yield out.strip(), "harbor"
+        finally:
+            with contextlib.suppress(Exception):
+                await _release_mount_permissions(container_name, workdir)
+            await _docker("rm", "--force", "--volumes", container_name, check=False)
+            await _docker("image", "rm", image, check=False)
+
+    @contextlib.asynccontextmanager
+    async def _compose_container(
+        self,
+        task: Task,
+        compose_file: Path,
+        workspace: Path,
+        workdir: str,
+        tests_dir: Path,
+        logs: Path,
+    ) -> AsyncIterator[tuple[str, str]]:
+        from hud.eval.runtime import _docker
+        # Start the task's compose project with a HUD overlay; yield the ``main`` container.
+        project = f"hud-harbor-{_slugify(task.id)}-{uuid.uuid4().hex[:8]}"
+        overlay = workspace.parent / "compose.hud.yaml"  # written next to the workspace dir
+        overlay.write_text(
+            _compose_overlay(workspace=workspace, workdir=workdir, tests_dir=tests_dir, logs=logs),
+            encoding="utf-8",
+            newline="\n",
+        )
+        compose_args = ("compose", "-f", str(compose_file), "-f", str(overlay), "-p", project)
+        container = ""  # set once the main service's container id is known
+        try:
+            await _docker(*compose_args, "up", "--detach", "--build")
+            out, _ = await _docker(*compose_args, "ps", "-q", "main")
+            container = out.strip()
+            if not container:
+                raise RuntimeError(
+                    f"docker compose project {project} did not create a main service"
+                )
+            yield container, "harbor-compose"
+        finally:  # the project is torn down even when ``up`` itself failed
+            if container:
+                with contextlib.suppress(Exception):
+                    await _release_mount_permissions(container, workdir)
+            await _docker(
+                *compose_args,
+                "down",
+                "--volumes",
+                "--remove-orphans",
+                "--rmi",
+                "local",
+                check=False,
+            )
+
+    async def _build_image(self, env_dir: Path) -> str:  # returns the tag it built
+        from hud.eval.runtime import _docker
+        # Tag: content hash of the build context plus a random 8-hex suffix per call.
+        tag = f"hud-harbor:{_hash_directory(env_dir)}-{uuid.uuid4().hex[:8]}"
+        await _docker("build", "--tag", tag, str(env_dir))
+        return tag
+
+    def _environment_for(
+        self,
+        task: Task,
+        task_dir: Path,
+        workspace: Path,
+        workdir: str,
+        logs: Path,
+        container: str,
+    ) -> Environment:  # builds the per-task Environment with shell access and grading
+        env = Environment(task.env)
+        workspace_daemon = _DockerWorkspace(workspace, container=container, guest_path=workdir)
+        verifier_timeout = _verifier_timeout(task_dir)
+        # On initialize: start the workspace daemon and register its shell capability.
+        @env.initialize
+        async def _up() -> None:
+            await workspace_daemon.start()
+            env.add_capability(workspace_daemon.capability("shell"))
+
+        @env.shutdown
+        async def _down() -> None:
+            await workspace_daemon.stop()
+        # Template: yield the instruction text, receive the answer, then yield the grade.
+        @env.template(id=task.id, description=f"Harbor task {task.id}")
+        async def _run_harbor_task() -> AsyncGenerator[Any, Any]:
+            answer = yield (task_dir / "instruction.md").read_text(encoding="utf-8")
+            yield await self._grade(
+                container, workdir, logs, answer, verifier_timeout=verifier_timeout
+            )
+
+        return env
+
+    async def _grade(
+        self, container: str, workdir: str, logs: Path, answer: Any, *, verifier_timeout: float
+    ) -> dict[str, Any]:
+        """Run ``/tests/test.sh`` in ``container`` and convert its reward into a result.
+
+        The answer is first written to ``logs/agent_answer.txt``. The ``docker exec``
+        client is killed and reaped on timeout and on cancellation. A timeout or an
+        absent/unusable reward yields ``score`` 0.0 with ``isError`` set.
+        """
+        answer_file = logs / "agent_answer.txt"
+        answer_file.parent.mkdir(parents=True, exist_ok=True)
+        answer_file.write_text("" if answer is None else str(answer), encoding="utf-8")
+        argv = ("docker", "exec", "--workdir", workdir, container, "bash", "/tests/test.sh")
+        proc = await asyncio.create_subprocess_exec(
+            *argv, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+        )
+        try:
+            out_bytes, err_bytes = await asyncio.wait_for(
+                proc.communicate(), timeout=verifier_timeout
+            )
+        except TimeoutError:
+            proc.kill()
+            await proc.wait()
+            return {
+                "score": 0.0,
+                "isError": True,
+                "content": f"Harbor verifier timed out after {verifier_timeout:.0f}s",
+                "info": {"verifier_timeout_sec": verifier_timeout},
+            }
+        except asyncio.CancelledError:
+            # Cancellation must not orphan the docker client; reap it, then propagate.
+            proc.kill()
+            await proc.wait()
+            raise
+        out = out_bytes.decode("utf-8", "replace")
+        err = err_bytes.decode("utf-8", "replace")
+        reward, info = _read_harbor_reward(logs / "verifier")
+        # Keep only the tail of each stream so the result stays bounded.
+        info.update({"stdout": out[-4000:], "stderr": err[-4000:]})
+        if reward is None:
+            return {
+                "score": 0.0,
+                "isError": True,
+                "content": "Harbor verifier did not write reward.json or reward.txt",
+                "info": info,
+            }
+        return {"score": reward, "info": info}
+
+
+class _DockerWorkspace(Workspace):
+    """Workspace SFTP over a host bind mount, shell commands via docker exec."""
+
+    def __init__(self, *args: Any, container: str, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        self._container = container
+
+    async def _handle_process(self, process: asyncssh.SSHServerProcess[bytes]) -> None:
+        """Run the session's command in the container; relay its output and exit code."""
+        command = process.command or "bash -l"
+        argv = ("docker", "exec", "-i", "--workdir", self._guest_path, self._container)
+        proc = await asyncio.create_subprocess_exec(
+            *argv,
+            "bash",
+            "-lc",
+            command,
+            # With -i an inherited stdin would feed the host process's own stdin
+            # (e.g. the operator's terminal) to the command; give it EOF instead.
+            stdin=asyncio.subprocess.DEVNULL,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        try:
+            stdout_data, stderr_data = await asyncio.wait_for(proc.communicate(), timeout=3600.0)
+        except TimeoutError:
+            proc.kill()
+            await proc.wait()
+            process.stderr.write(b"workspace: command timed out after 3600s\n")
+            process.exit(1)
+            return
+        except asyncio.CancelledError:
+            proc.kill()
+            await proc.wait()
+            raise
+
+        if stdout_data:
+            process.stdout.write(stdout_data)
+        if stderr_data:
+            process.stderr.write(stderr_data)
+        process.exit(proc.returncode if proc.returncode is not None else 0)
+
+
+_DEFAULT_VERIFIER_TIMEOUT = 600.0  # seconds; used when task.toml gives no usable value
+
+
+def _verifier_timeout(task_dir: Path) -> float:
+    """The task's ``[verifier] timeout_sec``, or the default if absent or unusable."""
+    try:
+        config: dict[str, Any] = tomllib.loads((task_dir / "task.toml").read_text("utf-8"))
+    except (OSError, ValueError):  # unreadable, not UTF-8, or not TOML (all non-fatal)
+        return _DEFAULT_VERIFIER_TIMEOUT
+    verifier = config.get("verifier")
+    timeout = verifier.get("timeout_sec") if isinstance(verifier, dict) else None
+    if isinstance(timeout, int | float) and not isinstance(timeout, bool) and timeout > 0:
+        return float(timeout)
+    return _DEFAULT_VERIFIER_TIMEOUT
+
+
+async def _image_workdir(image: str) -> str:
+    """Return ``Config.WorkingDir`` from ``docker image inspect``; ``/app`` if it is blank."""
+    from hud.eval.runtime import _docker
+    stdout, _ = await _docker("image", "inspect", "--format", "{{.Config.WorkingDir}}", image)
+    configured = stdout.strip()
+    return configured if configured else "/app"
+
+
+async def _materialize_workspace(image: str, workspace: Path, workdir: str) -> None:
+    """Copy the built image's ``workdir`` onto the host workspace, then own it.
+
+    The ``workdir`` bind mount would otherwise shadow everything the Docker build
+    generated there (start scripts, installed dependencies, compiled output,
+    seeded databases). Copying the image's actual ``workdir`` out first makes the
+    mounted workspace a faithful, editable copy of what the image runs.
+
+    A best-effort ``chown`` pass then hands the tree to the host user so the
+    agent can edit it over SFTP and teardown can remove it. A failing pass is
+    ignored, and hosts without ``os.getuid`` skip it altogether.
+    """
+    from hud.eval.runtime import _docker
+
+    # ``docker cp`` reads from a created-but-never-started container, so nothing
+    # from the image has to run just to extract its files.
+    out, _ = await _docker("create", image, "true")
+    container = out.strip()
+    try:
+        await _docker("cp", f"{container}:{workdir}/.", str(workspace))
+    finally:
+        # Best effort: the throwaway container is removed even if the copy failed.
+        await _docker("rm", "--force", "--volumes", container, check=False)
+
+    if hasattr(os, "getuid"):
+        owner = f"{os.getuid()}:{os.getgid()}"
+        mount = f"{workspace}:{workdir}"
+        # --entrypoint: an image ENTRYPOINT would otherwise receive "chown ..." as
+        # its arguments, so chown itself would never run. --user: changing the
+        # owner needs root regardless of the USER the image declares.
+        run = ("run", "--rm", "--entrypoint", "chown", "--user", "0:0", "--volume", mount, image)
+        await _docker(*run, "-R", owner, workdir, check=False)
+
+
+def _read_harbor_reward(verifier_logs: Path) -> tuple[float | None, dict[str, Any]]:
+    """Return ``(reward, info)`` from reward.json / reward.txt; ``reward`` is None if unusable."""
+    reward_json = verifier_logs / "reward.json"
+    if reward_json.is_file():
+        try:
+            data = json.loads(reward_json.read_text(encoding="utf-8"))
+        except ValueError as exc:  # malformed JSON or undecodable bytes: report, don't raise
+            return None, {"reward_file": str(reward_json), "reward_parse_error": str(exc)}
+        values = [data.get("reward"), data.get("score")] if isinstance(data, dict) else [data]
+        for value in values:  # bool subclasses int; JSON true/false is not a reward
+            if isinstance(value, int | float) and not isinstance(value, bool):
+                extra = {"reward_json": data} if isinstance(data, dict) else {}
+                return float(value), {"reward_file": str(reward_json), **extra}
+        return None, {"reward_file": str(reward_json), "reward_parse_error": "no numeric reward"}
+    reward_txt = verifier_logs / "reward.txt"
+    if not reward_txt.is_file():
+        return None, {}
+    text = reward_txt.read_text(encoding="utf-8").strip()
+    try:
+        return float(text), {"reward_file": str(reward_txt)}
+    except ValueError:
+        return None, {"reward_file": str(reward_txt), "reward_parse_error": text}
+
+
+async def _release_mount_permissions(container: str, workdir: str) -> None:
+    """Let the host user delete files that container-root created in mounts.
+
+    Best effort: failures are ignored. ``workdir`` and ``/logs`` reach ``chmod``
+    as positional parameters, so paths with spaces or shell metacharacters are
+    never re-parsed by the shell.
+    """
+    from hud.eval.runtime import _docker
+
+    # The "sh" after the script fills $0; the paths that follow become "$@".
+    script = 'chmod -R a+rwX "$@" 2>/dev/null || true'
+    await _docker("exec", container, "sh", "-lc", script, "sh", workdir, "/logs", check=False)
+
+
+def _compose_file(env_dir: Path) -> Path | None:
+    """Return the first compose file present in ``env_dir``, or ``None`` if there is none."""
+    # Checked in this order; the first name that exists as a regular file wins.
+    names = ("docker-compose.yaml", "docker-compose.yml", "compose.yaml", "compose.yml")
+    candidates = (env_dir / name for name in names)
+    return next((path for path in candidates if path.is_file()), None)
+
+
+def _compose_overlay(*, workspace: Path, workdir: str, tests_dir: Path, logs: Path) -> str:
+    """Render the compose override that parks ``main`` on ``sleep infinity``.
+
+    Only the ``main`` service is overridden: its working directory becomes
+    ``workdir``, with ``workspace`` mounted there, ``tests_dir`` bound read-only
+    at ``/tests`` and ``logs`` bound at ``/logs``. Services the task's own
+    compose file defines besides ``main`` are not mentioned, so they are
+    inherited unchanged.
+    """
+    # json.dumps double-quotes each value; the result is used as a YAML scalar.
+    mounts = (f"{workspace}:{workdir}", f"{tests_dir}:/tests:ro", f"{logs}:/logs")
+    document = [
+        "services:",
+        "  main:",
+        f"    working_dir: {json.dumps(workdir)}",
+        '    entrypoint: ["sleep"]',
+        '    command: ["infinity"]',
+        "    volumes:",
+    ]
+    document.extend(f"      - {json.dumps(mount)}" for mount in mounts)
+    # The output ends with a newline, exactly as joining a final "" entry would.
+    return "\n".join(document) + "\n"
diff --git a/integrations/tests/test_harbor.py b/integrations/tests/test_harbor.py
index b7343b51..c695bee3 100644
--- a/integrations/tests/test_harbor.py
+++ b/integrations/tests/test_harbor.py
@@ -2,12 +2,24 @@
 
 from __future__ import annotations
 
+import asyncio
+import json
+import os
 import textwrap
 from typing import TYPE_CHECKING
 
 import pytest
 
-from integrations.harbor import detect, export, load
+from hud.eval import Task
+from integrations.harbor import HarborRuntime, detect, export, load
+from integrations.harbor_runtime import (
+    _compose_file,
+    _compose_overlay,
+    _image_workdir,
+    _materialize_workspace,
+    _read_harbor_reward,
+    _verifier_timeout,
+)
 
 from .conftest import make_harbor_task
 
@@ -74,6 +86,276 @@ def test_load_skips_unparseable_toml_but_keeps_the_rest(tmp_path: Path) -> None:
     assert {task.id for task in taskset} == {"good", "broken"}
 
 
+def test_harbor_runtime_accepts_dataset_dirs(single_task: Path) -> None:
+    runtime = HarborRuntime(single_task.parent)
+
+    assert single_task.name in runtime._task_dirs
+
+
+async def test_harbor_runtime_builds_unique_images_per_acquisition(
+    single_task: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    calls: list[tuple[tuple[str, ...], bool]] = []
+
+    async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
+        calls.append((args, check))
+        return "", ""
+
+    monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
+    runtime = HarborRuntime(single_task.parent)
+
+    first = await runtime._build_image(single_task / "environment")
+    second = await runtime._build_image(single_task / "environment")
+
+    assert first != second
+    assert first.startswith("hud-harbor:")
+    assert second.startswith("hud-harbor:")
+    assert [args[2] for args, _ in calls] == [first, second]
+
+
+async def test_compose_container_cleans_up_after_failed_up(
+    single_task: Path,
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    compose = single_task / "environment" / "docker-compose.yaml"
+    compose.write_text("services:\n  main:\n    build: .\n", encoding="utf-8")
+    calls: list[tuple[tuple[str, ...], bool]] = []
+
+    async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
+        calls.append((args, check))
+        if args[-3:] == ("up", "--detach", "--build"):
+            raise RuntimeError("compose failed")
+        return "", ""
+
+    monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
+    runtime = HarborRuntime(single_task.parent)
+
+    with pytest.raises(RuntimeError, match="compose failed"):
+        async with runtime._compose_container(
+            Task(env="bench", id=single_task.name),
+            compose,
+            tmp_path / "workspace",
+            "/app",
+            single_task / "tests",
+            tmp_path / "logs",
+        ):
+            raise AssertionError("compose acquisition should not yield")
+
+    assert any(
+        args[-5:] == ("down", "--volumes", "--remove-orphans", "--rmi", "local") and check is False
+        for args, check in calls
+    )
+
+
+async def test_compose_container_cleans_up_when_main_service_is_missing(
+    single_task: Path,
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    compose = single_task / "environment" / "docker-compose.yaml"
+    compose.write_text("services:\n  api:\n    build: .\n", encoding="utf-8")
+    calls: list[tuple[tuple[str, ...], bool]] = []
+
+    async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
+        calls.append((args, check))
+        return "", ""
+
+    monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
+    runtime = HarborRuntime(single_task.parent)
+
+    with pytest.raises(RuntimeError, match="did not create a main service"):
+        async with runtime._compose_container(
+            Task(env="bench", id=single_task.name),
+            compose,
+            tmp_path / "workspace",
+            "/app",
+            single_task / "tests",
+            tmp_path / "logs",
+        ):
+            raise AssertionError("compose acquisition should not yield")
+
+    assert any(
+        args[-5:] == ("down", "--volumes", "--remove-orphans", "--rmi", "local") and check is False
+        for args, check in calls
+    )
+
+
+def test_compose_file_detection_prefers_harbor_names(tmp_path: Path) -> None:
+    env = tmp_path / "environment"
+    env.mkdir()
+    compose = env / "docker-compose.yaml"
+    compose.write_text("services: {}\n", encoding="utf-8")
+
+    assert _compose_file(env) == compose
+
+
+def test_compose_overlay_parks_main_and_mounts_workspace_tests_and_logs(tmp_path: Path) -> None:
+    overlay = _compose_overlay(
+        workspace=tmp_path / "workspace",
+        workdir="/srv/app",
+        tests_dir=tmp_path / "tests",
+        logs=tmp_path / "logs",
+    )
+
+    assert "main:" in overlay
+    assert 'entrypoint: ["sleep"]' in overlay
+    assert 'working_dir: "/srv/app"' in overlay
+    assert f"{tmp_path / 'workspace'}:/srv/app" in overlay
+    assert f"{tmp_path / 'tests'}:/tests:ro" in overlay
+    assert f"{tmp_path / 'logs'}:/logs" in overlay
+
+
+async def test_image_workdir_reads_config_working_dir(monkeypatch: pytest.MonkeyPatch) -> None:
+    async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
+        assert args == ("image", "inspect", "--format", "{{.Config.WorkingDir}}", "img")
+        return "/srv/app\n", ""
+
+    monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
+
+    assert await _image_workdir("img") == "/srv/app"
+
+
+async def test_image_workdir_defaults_to_app_when_unset(monkeypatch: pytest.MonkeyPatch) -> None:
+    async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
+        return "\n", ""
+
+    monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
+
+    assert await _image_workdir("img") == "/app"
+
+
+async def test_materialize_workspace_copies_image_workdir_and_owns_it(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    workspace = tmp_path / "workspace"
+    workspace.mkdir()
+    calls: list[tuple[str, ...]] = []
+
+    async def fake_docker(*args: str, check: bool = True) -> tuple[str, str]:
+        calls.append(args)
+        if args[0] == "create":
+            return "tempcid\n", ""
+        return "", ""
+
+    monkeypatch.setattr("hud.eval.runtime._docker", fake_docker)
+
+    await _materialize_workspace("img", workspace, "/app")
+
+    # Contents of the image's workdir are copied out into the host workspace.
+    assert ("cp", "tempcid:/app/.", str(workspace)) in calls
+    # The throwaway container is removed.
+    assert any(a[0] == "rm" for a in calls)
+    # On POSIX hosts, ownership is handed to the host user via a chown pass.
+    if hasattr(os, "getuid"):
+        assert any(a[0] == "run" and "chown" in a and a[-1] == "/app" for a in calls)
+
+
+def test_read_harbor_reward_prefers_reward_and_score_keys(tmp_path: Path) -> None:
+    verifier = tmp_path / "verifier"
+    verifier.mkdir()
+    (verifier / "reward.json").write_text(json.dumps({"reward": 0.5, "total": 5}), "utf-8")
+
+    reward, info = _read_harbor_reward(verifier)
+
+    assert reward == 0.5
+    assert info["reward_json"] == {"reward": 0.5, "total": 5}
+
+
+def test_read_harbor_reward_rejects_dict_without_reward_or_score(tmp_path: Path) -> None:
+    verifier = tmp_path / "verifier"
+    verifier.mkdir()
+    (verifier / "reward.json").write_text(json.dumps({"passed": 3, "total": 5}), "utf-8")
+
+    reward, info = _read_harbor_reward(verifier)
+
+    assert reward is None
+    assert info["reward_parse_error"] == "no numeric reward"
+
+
+def test_verifier_timeout_reads_task_toml(single_task: Path) -> None:
+    assert _verifier_timeout(single_task) == 120.0
+
+
+def test_verifier_timeout_defaults_when_missing_or_invalid(tmp_path: Path) -> None:
+    no_verifier = tmp_path / "no-verifier"
+    no_verifier.mkdir()
+    (no_verifier / "task.toml").write_text('[metadata]\ncategory = "systems"\n', "utf-8")
+    broken = tmp_path / "broken"
+    broken.mkdir()
+    (broken / "task.toml").write_text("not toml [", "utf-8")
+
+    assert _verifier_timeout(no_verifier) == 600.0
+    assert _verifier_timeout(broken) == 600.0
+    assert _verifier_timeout(tmp_path / "missing") == 600.0
+
+
+async def test_grade_reads_reward_after_verifier_completes(
+    single_task: Path,
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    logs = tmp_path / "logs"
+    (logs / "verifier").mkdir(parents=True)
+    (logs / "verifier" / "reward.txt").write_text("1.0\n", "utf-8")
+
+    class FakeProc:
+        returncode = 0
+
+        async def communicate(self) -> tuple[bytes, bytes]:
+            return b"verifier out", b""
+
+    async def fake_exec(*args: str, **kwargs: object) -> FakeProc:
+        assert args[:2] == ("docker", "exec")
+        return FakeProc()
+
+    monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_exec)
+    runtime = HarborRuntime(single_task.parent)
+
+    result = await runtime._grade("container", "/app", logs, "done", verifier_timeout=120.0)
+
+    assert result["score"] == 1.0
+    assert result["info"]["stdout"] == "verifier out"
+    assert (logs / "agent_answer.txt").read_text("utf-8") == "done"
+
+
+async def test_grade_times_out_when_verifier_hangs(
+    single_task: Path,
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    class FakeProc:
+        killed = False
+
+        async def communicate(self) -> tuple[bytes, bytes]:
+            await asyncio.sleep(3600)
+            raise AssertionError("unreachable")
+
+        def kill(self) -> None:
+            self.killed = True
+
+        async def wait(self) -> int:
+            return -9
+
+    proc = FakeProc()
+
+    async def fake_exec(*args: str, **kwargs: object) -> FakeProc:
+        return proc
+
+    monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_exec)
+    runtime = HarborRuntime(single_task.parent)
+
+    result = await runtime._grade(
+        "container", "/app", tmp_path / "logs", None, verifier_timeout=0.05
+    )
+
+    assert result["isError"] is True
+    assert "timed out" in result["content"]
+    assert proc.killed
+
+
 # ─── export: HUD tasks -> Harbor task folders ───────────────────────────
 
 _ENV_PY = """\
diff --git a/pyproject.toml b/pyproject.toml
index 81973229..1bca6168 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,7 +58,6 @@ build-backend = "hatchling.build"
 exclude = [
     "docs/",
     "cookbooks/",
-    "integrations/",
     "**/checkpoints/",
     "**/*.safetensors",
     "**/*.ckpt",
@@ -85,6 +84,7 @@ allow-direct-references = true
 [tool.hatch.build.targets.sdist]
 include = [
     "hud/**",
+    "integrations/**",
     "README.md",
     "LICENSE",
     "pyproject.toml"
@@ -102,7 +102,7 @@ exclude = [
 ]
 
 [tool.hatch.build.targets.wheel]
-packages = ["hud"]
+packages = ["hud", "integrations"]
 
 # Ensure py.typed is included in the package
 [tool.hatch.build.targets.wheel.force-include]