Skip to content

Commit 2af2bf8

Browse files
Add docker workspace support to GAIA evaluation
Follow the same pattern as SWE-bench to support both docker and remote workspace types. This allows local testing and debugging without requiring a remote runtime.

- Add DockerWorkspace import
- Support workspace_type='docker' with optional building
- Keep remote workspace as default for kube workflow
- Use universal gaia-with-mcp image for all instances

Co-authored-by: openhands <openhands@all-hands.dev>
1 parent bc65fbc commit 2af2bf8

File tree

1 file changed

+59
-30
lines changed

1 file changed

+59
-30
lines changed

benchmarks/gaia/run_infer.py

Lines changed: 59 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
)
3838
from openhands.sdk.workspace import RemoteWorkspace
3939
from openhands.tools.preset.default import get_default_tools
40-
from openhands.workspace import APIRemoteWorkspace
40+
from openhands.workspace import APIRemoteWorkspace, DockerWorkspace
4141

4242

4343
logger = get_logger(__name__)
@@ -119,44 +119,73 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
119119
"""Create workspace and copy necessary files."""
120120
logger.info(f"Preparing workspace for instance {instance.id}")
121121

122-
# GAIA uses remote workspace with a universal agent server image
123-
# (unlike SWE-bench which needs per-instance images)
124-
if self.metadata.workspace_type != "remote":
125-
raise ValueError(
126-
f"GAIA only supports workspace_type='remote', got '{self.metadata.workspace_type}'"
127-
)
128-
129-
runtime_api_key = os.getenv("RUNTIME_API_KEY")
130-
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
131-
if not runtime_api_key:
132-
raise ValueError(
133-
"RUNTIME_API_KEY environment variable is not set for remote workspace"
134-
)
135-
136122
# GAIA uses a universal agent server image (one image for all instances)
137123
# Built from nikolaik/python-nodejs:python3.12-nodejs22 base
138124
# Using MCP-enabled image to avoid 1-18 minute startup delays
139125
# Using binary target (not binary-minimal) to include Chromium for browser operations
140126
# Note: binary target doesn't add target suffix to tag, so it's just gaia-with-mcp
127+
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
141128
agent_server_image = f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-with-mcp"
142129

143-
if not image_exists(agent_server_image):
144-
raise RuntimeError(
145-
f"Agent server image {agent_server_image} does not exist in container registry. "
146-
f"Run 'benchmarks/gaia/build_images.py --push' to build and push it first."
130+
if self.metadata.workspace_type == "docker":
131+
# For local testing, use DockerWorkspace with pre-built GAIA image
132+
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
133+
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
134+
if not SKIP_BUILD:
135+
from benchmarks.utils.build_utils import build_image
136+
137+
logger.info(
138+
"Building GAIA workspace image. This may take a while...\n"
139+
"You can run benchmarks/gaia/build_images.py and set "
140+
"SKIP_BUILD=1 to skip building and use pre-built image."
141+
)
142+
output = build_image(
143+
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
144+
target_image=EVAL_AGENT_SERVER_IMAGE,
145+
custom_tag="gaia-with-mcp",
146+
target="binary",
147+
push=False,
148+
)
149+
logger.info(f"Image build output: {output}")
150+
assert output.error is None, f"Image build failed: {output.error}"
151+
if agent_server_image not in output.tags:
152+
raise RuntimeError(
153+
f"Built image tags {output.tags} do not include expected tag "
154+
f"{agent_server_image}"
155+
)
156+
157+
workspace = DockerWorkspace(
158+
server_image=agent_server_image,
159+
working_dir="/workspace",
147160
)
161+
elif self.metadata.workspace_type == "remote":
162+
runtime_api_key = os.getenv("RUNTIME_API_KEY")
163+
if not runtime_api_key:
164+
raise ValueError(
165+
"RUNTIME_API_KEY environment variable is not set for remote workspace"
166+
)
148167

149-
logger.info(
150-
f"Using remote workspace with GAIA image {agent_server_image} (sdk sha: {sdk_short_sha})"
151-
)
152-
workspace = APIRemoteWorkspace(
153-
runtime_api_url=os.getenv(
154-
"RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
155-
),
156-
runtime_api_key=runtime_api_key,
157-
server_image=agent_server_image,
158-
target_type="binary", # GAIA images use binary target
159-
)
168+
if not image_exists(agent_server_image):
169+
raise RuntimeError(
170+
f"Agent server image {agent_server_image} does not exist in container registry. "
171+
f"Run 'benchmarks/gaia/build_images.py --push' to build and push it first."
172+
)
173+
174+
logger.info(
175+
f"Using remote workspace with GAIA image {agent_server_image} (sdk sha: {sdk_short_sha})"
176+
)
177+
workspace = APIRemoteWorkspace(
178+
runtime_api_url=os.getenv(
179+
"RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
180+
),
181+
runtime_api_key=runtime_api_key,
182+
server_image=agent_server_image,
183+
target_type="binary", # GAIA images use binary target
184+
)
185+
else:
186+
raise ValueError(
187+
f"Unsupported workspace_type: {self.metadata.workspace_type}"
188+
)
160189

161190
# Create workspace directory
162191
workspace.execute_command("mkdir -p /workspace")

0 commit comments

Comments
 (0)