1919 type : choice
2020 options :
2121 - ' 1'
22- - ' 2'
23- - ' 10'
2422 - ' 50'
25- - ' 100'
23+ - ' 200'
24+ - ' 500'
2625 model_stubs :
2726 description : Comma-separated model stubs to evaluate (must be allowlisted)
2827 required : false
4544 EVAL_AGENT_IMAGE : ghcr.io/openhands/eval-agent-server
4645 EVAL_AGENT_TARGET : source-minimal
4746 # Polling configuration for workflow status checks
48- MAX_POLL_ATTEMPTS : ' 80 ' # 80 attempts × 60s = 80 minutes max wait
47+ MAX_POLL_ATTEMPTS : ' 600 ' # 600 attempts × 60s = 10 hours max wait
4948 POLL_INTERVAL_SECONDS : ' 60'
5049
5150jobs :
@@ -55,10 +54,10 @@ jobs:
5554 github.event_name == 'workflow_dispatch' ||
5655 (github.event_name == 'pull_request_target' &&
5756 (github.event.label.name == 'run-eval-1' ||
58- github.event.label.name == 'run-eval-2' ||
5957 github.event.label.name == 'run-eval-50' ||
60- github.event.label.name == 'run-eval-100'))
61- runs-on : blacksmith-32vcpu-ubuntu-2204
58+ github.event.label.name == 'run-eval-200' ||
59+ github.event.label.name == 'run-eval-500'))
60+ runs-on : ubuntu-latest
6261 permissions :
6362 contents : read
6463 packages : write
7170 uses : actions/checkout@v4
7271 with :
7372 ref : ${{ github.event_name == 'pull_request_target' && github.event.pull_request.base.sha || (github.event_name ==
74- ' workflow_dispatch' && github.event.inputs.branch ) || github.ref }}
73+ ' workflow_dispatch' && github.event.inputs.sdk_ref ) || github.ref }}
7574 fetch-depth : 0
7675
7776 - name : Load allowlists
@@ -117,9 +116,9 @@ jobs:
117116 LABEL="${{ github.event.label.name }}"
118117 case "$LABEL" in
119118 run-eval-1) EVAL_LIMIT=1 ;;
120- run-eval-2) EVAL_LIMIT=2 ;;
121119 run-eval-50) EVAL_LIMIT=50 ;;
122- run-eval-100) EVAL_LIMIT=100 ;;
120+ run-eval-200) EVAL_LIMIT=200 ;;
121+ run-eval-500) EVAL_LIMIT=500 ;;
123122 *) echo "Unsupported label $LABEL" >&2; exit 1 ;;
124123 esac
125124 SDK_REF="${{ github.event.pull_request.head.ref }}"
0 commit comments