diff --git a/src/tether/finetune/hooks/libero_drop_gate.py b/src/tether/finetune/hooks/libero_drop_gate.py index 4ed4796..1dfad9b 100644 --- a/src/tether/finetune/hooks/libero_drop_gate.py +++ b/src/tether/finetune/hooks/libero_drop_gate.py @@ -29,9 +29,22 @@ benchmark command. - Runs LOCALLY with whatever hardware the orchestrator has. No Modal here — if Modal is needed, wire it into the CLI not the hook. -- If LIBERO infra is unavailable (no env, no GPU), logs a warning and - skips the gate (doesn't abort). This is intentional: we don't want - a missing optional dep to kill an otherwise-successful distill run. +- If LIBERO infra is unavailable, logs a warning and skips the gate + (doesn't abort). This is intentional: we don't want a missing optional + dep to kill an otherwise-successful distill run. + +## Harness availability (read this) + +The rollout harness (`tether.libero_harness`) is NOT shipped in the OSS +package today (LIBERO was archived 2026-04-17), so in a stock install this +gate ALWAYS takes the "unavailable" path and cannot verify task success. +To make that honest rather than silent: + + - the outcome is recorded on `report.libero_gate_status` (`passed`, + `failed`, `skipped_unavailable`, …) so a SKIP is never mistaken for a + PASS, and + - `libero_gate_require=True` makes an unavailable harness ABORT the ship + (fail-closed) instead of silently skipping. ## Configuration via ctx.config.extra_lerobot_args @@ -39,6 +52,8 @@ - `libero_gate_tasks: int = 8` — how many tasks to eval - `libero_gate_rollouts_per_task: int = 3` — episodes per task - `libero_gate_skip: bool = False` — disable the gate entirely +- `libero_gate_require: bool = False` — abort if the harness can't run + (fail-closed) instead of skipping unverified """ from __future__ import annotations @@ -69,20 +84,31 @@ def libero_drop_gate(ctx, **payload) -> None: """ cfg = ctx.config extra = cfg.extra_lerobot_args or {} + report = payload.get("report") + + def _set_status(status: str) -> None: + if report is not None: + report.libero_gate_status = status if extra.get("libero_gate_skip"): logger.info("[libero_gate] skipped via extra_lerobot_args.libero_gate_skip") + _set_status("skipped_disabled") return # Only meaningful for distill runs — guard against accidental attach # to a fine-tune run where there's no teacher. if getattr(cfg, "phase", "train") != "distill": logger.debug("[libero_gate] phase != 'distill'; skipping") + _set_status("skipped_phase") return threshold = float(extra.get("libero_gate_threshold_pp", DEFAULT_GATE_THRESHOLD_PP)) num_tasks = int(extra.get("libero_gate_tasks", DEFAULT_NUM_TASKS)) rollouts = int(extra.get("libero_gate_rollouts_per_task", DEFAULT_ROLLOUTS_PER_TASK)) + # Fail-closed option: when set, an unavailable LIBERO harness ABORTS the + # ship instead of silently skipping. Lets a production distill demand real + # task-success verification rather than trusting an inert gate. + require = bool(extra.get("libero_gate_require", False)) teacher_export = cfg.teacher_export student_ckpt = payload.get("final_checkpoint_path") @@ -92,6 +118,7 @@ def libero_drop_gate(ctx, **payload) -> None: "skipping (teacher=%r, student=%r)", teacher_export, student_ckpt, ) + _set_status("skipped_missing_inputs") return try: @@ -102,13 +129,33 @@ def libero_drop_gate(ctx, **payload) -> None: rollouts_per_task=rollouts, ) except _LiberoUnavailable as e: - logger.warning( - "[libero_gate] LIBERO infra unavailable (%s); skipping gate. " - "Distill will ship without task-success verification.", e, - ) + # IMPORTANT: this is the path every build hits today — the LIBERO + # harness module is not present, so the gate cannot verify task + # success. We record skipped_unavailable so a SKIP is never read as a + # PASS, and honor libero_gate_require for callers who want fail-closed. + _set_status("skipped_unavailable") + if require: + logger.error( + "[libero_gate] LIBERO harness unavailable (%s) and " + "libero_gate_require=True — aborting ship.", e, + ) + ctx.extra["force_abort"] = True + ctx.extra["abort_reason"] = ( + f"LIBERO task-success gate REQUIRED but harness unavailable: {e}. " + f"Set extra_lerobot_args.libero_gate_require=False to ship " + f"without task-success verification." + ) + else: + logger.warning( + "[libero_gate] LIBERO harness unavailable (%s); gate did NOT " + "verify task success. Distill ships UNVERIFIED (report " + "libero_gate_status=skipped_unavailable). Set " + "libero_gate_require=True to fail closed.", e, + ) return except Exception as e: logger.exception("[libero_gate] rollouts crashed: %s", e) + _set_status("crashed") ctx.extra["force_abort"] = True ctx.extra["abort_reason"] = f"libero_gate crashed: {type(e).__name__}: {e}" return @@ -120,11 +167,11 @@ def libero_drop_gate(ctx, **payload) -> None: ) # Surface the number into the PostprocessReport regardless of pass/fail. - report = payload.get("report") if report is not None: report.libero_drop_pp = drop_pp if drop_pp > threshold: + _set_status("failed") ctx.extra["force_abort"] = True ctx.extra["abort_reason"] = ( f"LIBERO drop {drop_pp:.2f}pp exceeds gate threshold {threshold:.2f}pp. " @@ -134,6 +181,7 @@ def libero_drop_gate(ctx, **payload) -> None: f"export artifact is NOT shipped." ) else: + _set_status("passed") logger.info( "[libero_gate] PASS: drop %.2fpp <= threshold %.2fpp", drop_pp, threshold, diff --git a/src/tether/finetune/postprocess.py b/src/tether/finetune/postprocess.py index fc97952..20a4394 100644 --- a/src/tether/finetune/postprocess.py +++ b/src/tether/finetune/postprocess.py @@ -44,6 +44,11 @@ class PostprocessReport: libero_drop_pp: float | None = None """Task-success delta (student − teacher) in percentage points. Set by the libero_drop_gate hook when phase='distill'; None otherwise.""" + libero_gate_status: str | None = None + """Outcome of the LIBERO drop-gate, so a SKIP is never mistaken for a + PASS: one of 'passed', 'failed', 'crashed', 'skipped_disabled', + 'skipped_phase', 'skipped_missing_inputs', 'skipped_unavailable'. None + means the gate never attached (non-distill run).""" errors: list[str] = field(default_factory=list) diff --git a/tests/test_libero_drop_gate.py b/tests/test_libero_drop_gate.py new file mode 100644 index 0000000..7fb5aee --- /dev/null +++ b/tests/test_libero_drop_gate.py @@ -0,0 +1,89 @@ +"""LIBERO drop-gate must not silently green-light an unverified distill. + +The rollout harness (tether.libero_harness) isn't shipped today, so the gate +always hits the "unavailable" path. These tests pin that the outcome is +*recorded* (skipped_unavailable, never mistaken for a pass) and that +libero_gate_require=True makes it fail closed. +""" +from __future__ import annotations + +from types import SimpleNamespace + +import pytest + +from tether.finetune.hooks import libero_drop_gate as gate_mod +from tether.finetune.hooks.libero_drop_gate import _LiberoUnavailable, libero_drop_gate +from tether.finetune.postprocess import PostprocessReport + + +def _ctx(**extra_args): + cfg = SimpleNamespace( + extra_lerobot_args=dict(extra_args), + phase="distill", + teacher_export="teacher.onnx", + ) + return SimpleNamespace(config=cfg, extra={}) + + +def _payload(): + return {"final_checkpoint_path": "student.ckpt", "report": PostprocessReport()} + + +def test_unavailable_harness_is_recorded_not_silent(monkeypatch): + monkeypatch.setattr( + gate_mod, "_run_teacher_student_rollouts", + lambda **k: (_ for _ in ()).throw(_LiberoUnavailable("no harness")), + ) + ctx, payload = _ctx(), _payload() + libero_drop_gate(ctx, **payload) + assert payload["report"].libero_gate_status == "skipped_unavailable" + assert "force_abort" not in ctx.extra # default: permissive, ships + + +def test_require_makes_unavailable_fail_closed(monkeypatch): + monkeypatch.setattr( + gate_mod, "_run_teacher_student_rollouts", + lambda **k: (_ for _ in ()).throw(_LiberoUnavailable("no harness")), + ) + ctx, payload = _ctx(libero_gate_require=True), _payload() + libero_drop_gate(ctx, **payload) + assert ctx.extra.get("force_abort") is True + assert "harness unavailable" in ctx.extra["abort_reason"] + assert payload["report"].libero_gate_status == "skipped_unavailable" + + +def test_pass_records_status_and_drop(monkeypatch): + monkeypatch.setattr( + gate_mod, "_run_teacher_student_rollouts", + lambda **k: (0.90, 0.89), # 1pp drop, under 5pp threshold + ) + ctx, payload = _ctx(), _payload() + libero_drop_gate(ctx, **payload) + assert payload["report"].libero_gate_status == "passed" + assert payload["report"].libero_drop_pp == pytest.approx(1.0, abs=1e-6) + assert "force_abort" not in ctx.extra + + +def test_fail_aborts(monkeypatch): + monkeypatch.setattr( + gate_mod, "_run_teacher_student_rollouts", + lambda **k: (0.90, 0.80), # 10pp drop, over 5pp threshold + ) + ctx, payload = _ctx(), _payload() + libero_drop_gate(ctx, **payload) + assert payload["report"].libero_gate_status == "failed" + assert ctx.extra.get("force_abort") is True + + +def test_skip_flag_recorded(monkeypatch): + ctx, payload = _ctx(libero_gate_skip=True), _payload() + libero_drop_gate(ctx, **payload) + assert payload["report"].libero_gate_status == "skipped_disabled" + assert "force_abort" not in ctx.extra + + +def test_non_distill_phase_recorded(): + ctx, payload = _ctx(), _payload() + ctx.config.phase = "train" + libero_drop_gate(ctx, **payload) + assert payload["report"].libero_gate_status == "skipped_phase"