From 970d08e066321690c3df6b0fce872bdd3178e11f Mon Sep 17 00:00:00 2001
From: shauryr <shauryr@gmail.com>
Date: Fri, 29 May 2026 21:44:02 +0000
Subject: [PATCH] Add eval.sh wrapper to preserve /opt/env PATH for agent
 self-evals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The codex CLI runs every shell command via `bash -lc "..."` (login
shell), which sources /etc/profile + ~/.bashrc and overwrites PATH —
stripping the apptainer-injected /opt/env/local/bin entry where the
bind-mounted `vllm` CLI lives. As a result agents see
`vllm: command not found` and inspect_ai can't spawn a local server.

Observed in both V1 (6h) and V2 (12h clean-slate) data-eng pilots:
agents rediscover the issue and manually prefix commands with
`PATH=/opt/env/local/bin:$PATH ...`, burning ~5 min of exploration
each time.

This adds a small `eval.sh` wrapper that prepends /opt/env/local/bin and
/opt/env/bin to PATH and then execs
`python3 /home/ben/task/evaluate.py "$@"`. The wrapper is copied into the
task workspace only when POST_TRAIN_BENCH_PROMPT=data_eng_prompt, so
default-prompt runs are unchanged. The data_eng_prompt.txt update to
actually USE the wrapper lives in feature/v2-discipline (separate PR).
---
 src/eval/general/eval.sh | 17 +++++++++++++++++
 src/run_task.sh          |  7 +++++++
 2 files changed, 24 insertions(+)
 create mode 100755 src/eval/general/eval.sh

diff --git a/src/eval/general/eval.sh b/src/eval/general/eval.sh
new file mode 100755
index 0000000..a1164bd
--- /dev/null
+++ b/src/eval/general/eval.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Wrapper around evaluate.py for data-engineering agent runs.
+#
+# Why this exists: the bind-mounted Python env at /opt/env contains the
+# `vllm` CLI binary at /opt/env/local/bin/vllm, and run_task.sh injects
+# that directory into PATH via `apptainer exec --env PATH=...`. However,
+# the codex CLI runs every shell command through `bash -lc "..."` (login
+# shell), which sources /etc/profile + ~/.bashrc and *overwrites* PATH
+# with the container's defaults — stripping out /opt/env/local/bin. As a
+# result the agent sees `vllm: command not found` and inspect_ai cannot
+# spawn its local vLLM server.
+#
+# This wrapper re-asserts the bind-mounted env on PATH and forwards all
+# arguments to evaluate.py. Agents should call `bash eval.sh ...` instead
+# of `python3 evaluate.py ...` for self-evals.
+export PATH="/opt/env/local/bin:/opt/env/bin:${PATH}"
+exec python3 /home/ben/task/evaluate.py "$@"
diff --git a/src/run_task.sh b/src/run_task.sh
index 7fd8108..d48f28d 100644
--- a/src/run_task.sh
+++ b/src/run_task.sh
@@ -73,6 +73,13 @@ if [ "$POST_TRAIN_BENCH_PROMPT" = "data_eng_prompt" ]; then
     cp src/eval/general/train_sft.py "${JOB_DIR}/task/"
     cp src/eval/general/dataset_audit.py "${JOB_DIR}/task/"
     cp src/eval/general/publish_experiment.py "${JOB_DIR}/task/"
+    # eval.sh wrapper: codex's `bash -lc` overwrites PATH and strips
+    # /opt/env/local/bin, so calling `python3 evaluate.py` directly fails
+    # to find the bind-mounted `vllm` CLI. This wrapper re-asserts PATH
+    # before exec'ing evaluate.py. Agents should `bash eval.sh ...` for
+    # self-evals.
+    cp src/eval/general/eval.sh "${JOB_DIR}/task/"
+    chmod +x "${JOB_DIR}/task/eval.sh"
     mkdir -p "${JOB_DIR}/task/experiments"
 fi