From b6305c276d63045d0c8db98f3ac9b32b8590da47 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Fri, 10 Apr 2026 08:43:38 +0000
Subject: [PATCH 01/24] feat: add Dockerfile and Cloud Build pipeline for
 automated Evalbench testing of Cloud SQL PostgreSQL extension

---
 Dockerfile              | 36 +++++++++++++++++++++++
 cloudbuild.yaml         | 64 +++++++++++++++++++++++++++++++++++++++++
 evals/dataset.json      | 15 ++++++++++
 evals/model_config.yaml | 18 ++++++++++++
 evals/run_config.yaml   | 12 ++++++++
 5 files changed, 145 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 cloudbuild.yaml
 create mode 100644 evals/dataset.json
 create mode 100644 evals/model_config.yaml
 create mode 100644 evals/run_config.yaml

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..39e5b3c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,36 @@
+# --- Stage 1: Build the binary from source (Latest Nightly) ---
+FROM golang:1.25 AS builder
+
+WORKDIR /build
+
+# Clone the official genai-toolbox source code (always latest main branch)
+RUN git clone --depth 1 https://github.com/googleapis/genai-toolbox.git .
+
+# Compile the binary with CGO ENABLED to support all upstream database drivers (Oracle, etc.)
+RUN CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o toolbox .
+
+# --- Stage 2: Final Lightweight Runtime Image ---
+# Using the exact same image (golang:1.25) for runtime to perfectly match GLIBC versions
+FROM golang:1.25
+
+
+# Install necessary runtime certificates and standard C libraries for CGO binary
+RUN apt-get update && apt-get install -y ca-certificates libc6 && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy the freshly compiled binary from the builder stage
+COPY --from=builder /build/toolbox /app/toolbox
+RUN chmod +x /app/toolbox
+
+# Copy the extension's skills and configuration into the container
+COPY skills/ ./skills/
+COPY gemini-extension.json .
+
+# Add required tools.yaml placeholder to satisfy binary startup checks
+RUN touch tools.yaml
+
+# Expose HTTP API and UI endpoints to successfully pass Cloud Run health checks
+ENTRYPOINT ["/app/toolbox", "--prebuilt", "cloud-sql-postgres", "--address=0.0.0.0", "--port=8080", "--enable-api", "--ui"]
+
+
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
new file mode 100644
index 0000000..44bb5ff
--- /dev/null
+++ b/cloudbuild.yaml
@@ -0,0 +1,64 @@
+steps:
+
+  # --- STEP 1: Build and Push Docker Image ---
+  - name: 'gcr.io/cloud-builders/docker'
+    args:
+      - 'build'
+      - '-t'
+      - 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/cloud-sql-postgresql:latest'
+      - '.'
+
+  - name: 'gcr.io/cloud-builders/docker'
+    args:
+      - 'push'
+      - 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/cloud-sql-postgresql:latest'
+
+  # --- STEP 2: Deploy to Cloud Run ---
+  - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
+    entrypoint: gcloud
+    args:
+      - 'run'
+      - 'deploy'
+      - 'cloud-sql-postgresql-server'
+      - '--image=us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/cloud-sql-postgresql:latest'
+      - '--region=us-central1'
+      - '--allow-unauthenticated'
+      - '--port=8080'
+      - '--timeout=300'
+      - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=omkar-playground,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=7`[EP^`U"_frcD;q,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
+
+  # --- STEP 3: Run Eval Server in Background ---
+  - name: 'gcr.io/cloud-builders/docker'
+    args:
+      - 'run'
+      - '-d'
+      - '--network=cloudbuild'
+      - '--name=eval_server'
+      - 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/eval_server:latest'
+
+  # --- STEP 4: Run Evalbench Evaluation Client ---
+  # - name: 'python:3.10'
+  #   entrypoint: 'bash'
+  #   args:
+  #     - '-c'
+  #     - |
+  #       # Clone Evalbench
+  #       git clone https://github.com/GoogleCloudPlatform/evalbench.git
+  #       cd evalbench
+        
+  #       # Install Dependencies
+  #       pip install -r requirements.txt
+        
+  #       # Setup Environment Variables
+  #       export EVAL_GCP_PROJECT_ID=omkar-playground
+  #       export EVAL_GCP_PROJECT_REGION=us-central1
+  #       export EVAL_CONFIG=../evals/run_config.yaml
+        
+  #       # Compile required protobuf modules and Run Evaluation Client against the eval_server container
+  #       make proto
+  #       ./run_client.sh --endpoint=eval_server:50051
+
+
+options:
+  env:
+    - 'DOCKER_BUILDKIT=1'
diff --git a/evals/dataset.json b/evals/dataset.json
new file mode 100644
index 0000000..42af644
--- /dev/null
+++ b/evals/dataset.json
@@ -0,0 +1,15 @@
+{
+  "scenarios": [
+    {
+      "id": "cloud-sql-debug-01",
+      "starting_prompt": "I need to debug the database.",
+      "conversation_plan": "Ask the agent to list instances in project omkar-playground. Once listed, ask it to check the CPU usage of the first instance. Finally, ask if that usage is considered high.",
+      "expected_trajectory": [
+        "list_instances",
+        "get_metrics"
+      ],
+      "kind": "tool",
+      "max_turns": 15
+    }
+  ]
+}
\ No newline at end of file
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
new file mode 100644
index 0000000..dbb2dc5
--- /dev/null
+++ b/evals/model_config.yaml
@@ -0,0 +1,18 @@
+gemini_cli_version: "@google/gemini-cli@0.26.0"
+generator: gemini_cli
+env:
+  GOOGLE_CLOUD_PROJECT: "omkar-playground"
+  GOOGLE_CLOUD_LOCATION: "us-central1"
+  GOOGLE_GENAI_USE_VERTEXAI: "true"
+  GEMINI_API_MODEL: "gemini-2.5-pro"
+setup:
+  extensions:
+    "https://github.com/gemini-cli-extensions/cloud-sql-postgresql":
+      settings:
+        CLOUD_SQL_POSTGRES_PROJECT: "omkar-playground"
+        CLOUD_SQL_POSTGRES_INSTANCE: "omkar-demo-postgres-1"
+        CLOUD_SQL_POSTGRES_REGION: "us-central1"
+        CLOUD_SQL_POSTGRES_DATABASE: "postgres"
+        CLOUD_SQL_POSTGRES_USER: "postgres"
+        CLOUD_SQL_POSTGRES_PASSWORD: '7`[EP^`U"_frcD;q'
+        CLOUD_SQL_POSTGRES_IP_TYPE: "PUBLIC"
diff --git a/evals/run_config.yaml b/evals/run_config.yaml
new file mode 100644
index 0000000..a631de9
--- /dev/null
+++ b/evals/run_config.yaml
@@ -0,0 +1,12 @@
+dataset_config: /workspace/evals/dataset.json
+dataset_format: gemini-cli-format
+
+orchestrator: geminicli
+model_config: /workspace/evals/model_config.yaml
+# You can reference default simulated user models provided by the evalbench repo:
+simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
+
+scorers:
+  trajectory_matcher: {}
+  goal_completion:
+    model_config: datasets/model_configs/gemini_2.5_pro_model.yaml

From 01bc25bebc1245fc2a0b8f8f82c77a39be1d1c00 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Mon, 13 Apr 2026 10:22:54 +0000
Subject: [PATCH 02/24] feat: integrate full evaluation pipeline in cloudbuild
 and update model configurations

---
 cloudbuild.yaml                 | 96 ++++++++++++++++++++++-----------
 evals/dataset.json              |  3 +-
 evals/gemini_2.5_pro_model.yaml |  4 ++
 evals/model_config.yaml         |  2 +-
 evals/run_config.yaml           |  4 +-
 5 files changed, 73 insertions(+), 36 deletions(-)
 create mode 100644 evals/gemini_2.5_pro_model.yaml

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 44bb5ff..90bbcd8 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -25,40 +25,74 @@ steps:
       - '--allow-unauthenticated'
       - '--port=8080'
       - '--timeout=300'
-      - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=omkar-playground,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=7`[EP^`U"_frcD;q,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
+      - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=omkar-playground,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=[PASSWORD],CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
 
-  # --- STEP 3: Run Eval Server in Background ---
-  - name: 'gcr.io/cloud-builders/docker'
+  # --- STEP 3: Fully Integrated Evaluation to Persist Results ---
+  - name: 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/eval_server:latest'
+    entrypoint: 'bash'
     args:
-      - 'run'
-      - '-d'
-      - '--network=cloudbuild'
-      - '--name=eval_server'
-      - 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/eval_server:latest'
+      - '-c'
+      - |
+        set -e
+        cd /evalbench
+
+        export EVAL_GCP_PROJECT_ID=omkar-playground
+        export EVAL_GCP_PROJECT_REGION=us-central1
+
+        echo "Compiling protobuf files..."
+        python3 -m grpc_tools.protoc --proto_path=evalbench/evalproto --python_out=evalbench/evalproto --grpc_python_out=evalbench/evalproto evalbench/evalproto/*.proto
+
+        echo "Patching client to use insecure credentials..."
+        # sed -i 's/"localhost:50051"/"127.0.0.1:50051"/g' evalbench/client/eval_client.py
+        sed -i 's/grpc.alts_channel_credentials()/None/g' evalbench/client/eval_client.py
+        sed -i 's/grpc.aio.secure_channel(address, channel_creds)/grpc.aio.insecure_channel(address)/g' evalbench/client/eval_client.py
+
+        echo "Patching server to listen on all IPv4 interfaces (0.0.0.0)..."
+        sed -i 's/"\[::\]:%s"/"0.0.0.0:%s"/g' /evalbench/evalbench/eval_server.py
+        echo "Checking bind success in server (writing to stderr)..."
+        sed -i 's|server.add_insecure_port("0.0.0.0:%s" % PORT)|bound_port = server.add_insecure_port("0.0.0.0:%s" % PORT)\n        import sys\n        sys.stderr.write(f"BOUND_PORT: {bound_port}\\n")\n        if bound_port == 0: raise RuntimeError("Failed to bind to port!")|' /evalbench/evalbench/eval_server.py
 
-  # --- STEP 4: Run Evalbench Evaluation Client ---
-  # - name: 'python:3.10'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       # Clone Evalbench
-  #       git clone https://github.com/GoogleCloudPlatform/evalbench.git
-  #       cd evalbench
+        echo "Patching eval_service.py to fix TypeError in get_reporters..."
+        sed -i 's|reporters = get_reporters(config.get("reporting"), job_id, run_time)|reporters = get_reporters(config.get("reporting") or {}, job_id, run_time)|' /evalbench/evalbench/eval_service.py
+
+        echo "Patching util/session.py to make ADK import lazy..."
+        sed -i 's|from google.adk.sessions import VertexAiSessionService||' /evalbench/evalbench/util/session.py
+        sed -i 's|    def __init__(self, config):|    def __init__(self, config):\n        from google.adk.sessions import VertexAiSessionService|' /evalbench/evalbench/util/session.py
+        echo "Patching databases/util.py to make SecretManagerClient lazy..."
+        sed -i 's|CLIENT = secretmanager_v1.SecretManagerServiceClient()|CLIENT = None\ndef get_client():\n    global CLIENT\n    if CLIENT is None:\n        CLIENT = secretmanager_v1.SecretManagerServiceClient()\n    return CLIENT|' /evalbench/evalbench/databases/util.py || echo "Failed to patch databases/util.py"
+        sed -i 's|CLIENT.access_secret_version|get_client().access_secret_version|' /evalbench/evalbench/databases/util.py || echo "Failed to patch databases/util.py usage"
+        cd evalbench
+        export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+        export PYTHONPATH=./evalproto:.
+        export CLOUD_RUN=True
+        export PORT=50051
+
+
+
+        echo "Starting Evaluation Server in background..."
+        # Detach stdin (read from /dev/null) so the background server can never block waiting for input
+        python3 -u ./eval_server.py --localhost </dev/null &
+        SERVER_PID=$$!
         
-  #       # Install Dependencies
-  #       pip install -r requirements.txt
+        echo "Waiting for port 50051 to open..."
+        python3 -c "
+        import socket
+        import time
+        for i in range(20):
+            try:
+                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                s.connect(('127.0.0.1', 50051))
+                print('Port is open!')
+                exit(0)
+            except Exception as e:
+                print(f'Port not open yet: {e}')
+                time.sleep(1)
+        print('Port failed to open')
+        exit(1)
+        " || { echo "Server failed to bind port. Check logs above."; exit 1; }
         
-  #       # Setup Environment Variables
-  #       export EVAL_GCP_PROJECT_ID=omkar-playground
-  #       export EVAL_GCP_PROJECT_REGION=us-central1
-  #       export EVAL_CONFIG=../evals/run_config.yaml
+        echo "Server is running. Launching Evaluation Client..."
+        cd /evalbench
+        export PYTHONPATH=./evalbench:./evalbench/evalproto
         
-  #       # Compile required protobuf modules and Run Evaluation Client against the eval_server container
-  #       make proto
-  #       ./run_client.sh --endpoint=eval_server:50051
-
-
-options:
-  env:
-    - 'DOCKER_BUILDKIT=1'
+        python3 evalbench/client/eval_client.py --experiment=/workspace/evals/run_config.yaml --endpoint=local || { echo "Client failed! Server logs:"; cat /evalbench/evalbench/server.log; exit 1; }
diff --git a/evals/dataset.json b/evals/dataset.json
index 42af644..65c48fe 100644
--- a/evals/dataset.json
+++ b/evals/dataset.json
@@ -5,8 +5,7 @@
       "starting_prompt": "I need to debug the database.",
       "conversation_plan": "Ask the agent to list instances in project omkar-playground. Once listed, ask it to check the CPU usage of the first instance. Finally, ask if that usage is considered high.",
       "expected_trajectory": [
-        "list_instances",
-        "get_metrics"
+        "list_instances"
       ],
       "kind": "tool",
       "max_turns": 15
diff --git a/evals/gemini_2.5_pro_model.yaml b/evals/gemini_2.5_pro_model.yaml
new file mode 100644
index 0000000..26eb0b7
--- /dev/null
+++ b/evals/gemini_2.5_pro_model.yaml
@@ -0,0 +1,4 @@
+generator: gcp_vertex_gemini
+vertex_model: gemini-2.5-pro
+base_prompt: ""
+execs_per_minute: 5
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index dbb2dc5..ff6eb2f 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -14,5 +14,5 @@ setup:
         CLOUD_SQL_POSTGRES_REGION: "us-central1"
         CLOUD_SQL_POSTGRES_DATABASE: "postgres"
         CLOUD_SQL_POSTGRES_USER: "postgres"
-        CLOUD_SQL_POSTGRES_PASSWORD: '7`[EP^`U"_frcD;q'
+        CLOUD_SQL_POSTGRES_PASSWORD: ${CLOUD_SQL_POSTGRES_PASSWORD}
         CLOUD_SQL_POSTGRES_IP_TYPE: "PUBLIC"
diff --git a/evals/run_config.yaml b/evals/run_config.yaml
index a631de9..ce09cfd 100644
--- a/evals/run_config.yaml
+++ b/evals/run_config.yaml
@@ -4,9 +4,9 @@ dataset_format: gemini-cli-format
 orchestrator: geminicli
 model_config: /workspace/evals/model_config.yaml
 # You can reference default simulated user models provided by the evalbench repo:
-simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
+simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
 
 scorers:
   trajectory_matcher: {}
   goal_completion:
-    model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml

From 663d2aa8fc5e2685eaab4b483a413514091f1109 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Mon, 13 Apr 2026 11:32:38 +0000
Subject: [PATCH 03/24] chore: update Dockerfile to fetch binary, and
 parameterize Cloud Build variables with $PROJECT_ID

---
 Dockerfile      | 30 +++++++++---------------------
 cloudbuild.yaml | 12 ++++++------
 2 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 39e5b3c..47d3a31 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,26 +1,16 @@
-# --- Stage 1: Build the binary from source (Latest Nightly) ---
-FROM golang:1.25 AS builder
+# --- Final Runtime Image ---
+# Using python:3.11 as the base image to support evaluations that require Python,
+# while still running the pre-compiled Go binary for the toolbox server.
+FROM python:3.11
 
-WORKDIR /build
-
-# Clone the official genai-toolbox source code (always latest main branch)
-RUN git clone --depth 1 https://github.com/googleapis/genai-toolbox.git .
-
-# Compile the binary with CGO ENABLED to support all upstream database drivers (Oracle, etc.)
-RUN CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o toolbox .
-
-# --- Stage 2: Final Lightweight Runtime Image ---
-# Using the exact same image (golang:1.25) for runtime to perfectly match GLIBC versions
-FROM golang:1.25
-
-
-# Install necessary runtime certificates and standard C libraries for CGO binary
-RUN apt-get update && apt-get install -y ca-certificates libc6 && rm -rf /var/lib/apt/lists/*
+# Install runtime certificates, the C library, and curl (needed below to download the toolbox binary); recommended packages are skipped to keep the layer small
+RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates libc6 curl && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
 
-# Copy the freshly compiled binary from the builder stage
-COPY --from=builder /build/toolbox /app/toolbox
+# Resolve the latest release tag and download the binary; -f plus the non-empty tag check fail the build instead of saving an HTTP error page as /app/toolbox
+RUN LATEST_VERSION=$(curl -fsSL https://api.github.com/repos/googleapis/mcp-toolbox/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') && test -n "${LATEST_VERSION}" && \
+    curl -fsSL "https://storage.googleapis.com/mcp-toolbox-for-databases/${LATEST_VERSION}/linux/amd64/toolbox" -o /app/toolbox
 RUN chmod +x /app/toolbox
 
 # Copy the extension's skills and configuration into the container
@@ -32,5 +22,3 @@ RUN touch tools.yaml
 
 # Expose HTTP API and UI endpoints to successfully pass Cloud Run health checks
 ENTRYPOINT ["/app/toolbox", "--prebuilt", "cloud-sql-postgres", "--address=0.0.0.0", "--port=8080", "--enable-api", "--ui"]
-
-
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 90bbcd8..68c06d8 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -5,13 +5,13 @@ steps:
     args:
       - 'build'
       - '-t'
-      - 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/cloud-sql-postgresql:latest'
+      - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest'
       - '.'
 
   - name: 'gcr.io/cloud-builders/docker'
     args:
       - 'push'
-      - 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/cloud-sql-postgresql:latest'
+      - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest'
 
   # --- STEP 2: Deploy to Cloud Run ---
   - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
@@ -20,15 +20,15 @@ steps:
       - 'run'
       - 'deploy'
       - 'cloud-sql-postgresql-server'
-      - '--image=us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/cloud-sql-postgresql:latest'
+      - '--image=us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest'
       - '--region=us-central1'
       - '--allow-unauthenticated'
       - '--port=8080'
       - '--timeout=300'
-      - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=omkar-playground,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=[PASSWORD],CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
+      - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=[PASSWORD],CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
 
   # --- STEP 3: Fully Integrated Evaluation to Persist Results ---
-  - name: 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/eval_server:latest'
+  - name: 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/eval_server:latest'
     entrypoint: 'bash'
     args:
       - '-c'
@@ -36,7 +36,7 @@ steps:
         set -e
         cd /evalbench
 
-        export EVAL_GCP_PROJECT_ID=omkar-playground
+        export EVAL_GCP_PROJECT_ID=$PROJECT_ID
         export EVAL_GCP_PROJECT_REGION=us-central1
 
         echo "Compiling protobuf files..."

From a83c79e2970638eadfd727401c60c951aafcce9e Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Mon, 13 Apr 2026 12:16:42 +0000
Subject: [PATCH 04/24] chore: enable BigQuery reporting in eval configuration

---
 evals/run_config.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/evals/run_config.yaml b/evals/run_config.yaml
index ce09cfd..d2ba9cd 100644
--- a/evals/run_config.yaml
+++ b/evals/run_config.yaml
@@ -10,3 +10,6 @@ scorers:
   trajectory_matcher: {}
   goal_completion:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+
+reporting:
+  bigquery: {}

From c221c21658ded50ac24bb95c3637270b784b319f Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Wed, 15 Apr 2026 10:30:30 +0000
Subject: [PATCH 05/24] chore: update model config path, disable
 unauthenticated cloud run access, and remove evalbench patching scripts

---
 cloudbuild.yaml         | 20 +++++++++-----------
 evals/model_config.yaml | 10 +++++++++-
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 68c06d8..a449e9e 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -6,7 +6,7 @@ steps:
       - 'build'
       - '-t'
       - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest'
-      - '.'
+      - '.'   # The "." evaluates Dockerfile
 
   - name: 'gcr.io/cloud-builders/docker'
     args:
@@ -22,7 +22,7 @@ steps:
       - 'cloud-sql-postgresql-server'
       - '--image=us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest'
       - '--region=us-central1'
-      - '--allow-unauthenticated'
+      - '--no-allow-unauthenticated'
       - '--port=8080'
       - '--timeout=300'
       - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=[PASSWORD],CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
@@ -34,6 +34,13 @@ steps:
       - '-c'
       - |
         set -e
+        # ----------------------------
+        echo "=== INSIDE /workspace ==="
+        ls -la /workspace
+        
+        echo "=== INSIDE /evalbench ==="
+        ls -la /evalbench
+        # ----------------------------
         cd /evalbench
 
         export EVAL_GCP_PROJECT_ID=$PROJECT_ID
@@ -52,15 +59,6 @@ steps:
         echo "Checking bind success in server (writing to stderr)..."
         sed -i 's|server.add_insecure_port("0.0.0.0:%s" % PORT)|bound_port = server.add_insecure_port("0.0.0.0:%s" % PORT)\n        import sys\n        sys.stderr.write(f"BOUND_PORT: {bound_port}\\n")\n        if bound_port == 0: raise RuntimeError("Failed to bind to port!")|' /evalbench/evalbench/eval_server.py
 
-        echo "Patching eval_service.py to fix TypeError in get_reporters..."
-        sed -i 's|reporters = get_reporters(config.get("reporting"), job_id, run_time)|reporters = get_reporters(config.get("reporting") or {}, job_id, run_time)|' /evalbench/evalbench/eval_service.py
-
-        echo "Patching util/session.py to make ADK import lazy..."
-        sed -i 's|from google.adk.sessions import VertexAiSessionService||' /evalbench/evalbench/util/session.py
-        sed -i 's|    def __init__(self, config):|    def __init__(self, config):\n        from google.adk.sessions import VertexAiSessionService|' /evalbench/evalbench/util/session.py
-        echo "Patching databases/util.py to make SecretManagerClient lazy..."
-        sed -i 's|CLIENT = secretmanager_v1.SecretManagerServiceClient()|CLIENT = None\ndef get_client():\n    global CLIENT\n    if CLIENT is None:\n        CLIENT = secretmanager_v1.SecretManagerServiceClient()\n    return CLIENT|' /evalbench/evalbench/databases/util.py || echo "Failed to patch databases/util.py"
-        sed -i 's|CLIENT.access_secret_version|get_client().access_secret_version|' /evalbench/evalbench/databases/util.py || echo "Failed to patch databases/util.py usage"
         cd evalbench
         export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
         export PYTHONPATH=./evalproto:.
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index ff6eb2f..c63808a 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -7,7 +7,7 @@ env:
   GEMINI_API_MODEL: "gemini-2.5-pro"
 setup:
   extensions:
-    "https://github.com/gemini-cli-extensions/cloud-sql-postgresql":
+    "/workspace":
       settings:
         CLOUD_SQL_POSTGRES_PROJECT: "omkar-playground"
         CLOUD_SQL_POSTGRES_INSTANCE: "omkar-demo-postgres-1"
@@ -16,3 +16,11 @@ setup:
         CLOUD_SQL_POSTGRES_USER: "postgres"
         CLOUD_SQL_POSTGRES_PASSWORD: ${CLOUD_SQL_POSTGRES_PASSWORD}
         CLOUD_SQL_POSTGRES_IP_TYPE: "PUBLIC"
+
+  # mcp_servers:
+  #   "cloud-sql-postgresql":
+  #     httpUrl: "CLOUD_RUN_URL_PLACEHOLDER"
+  #     authProviderType: google_credentials
+  #     oauth:
+  #       scopes:
+  #       - https://www.googleapis.com/auth/cloud-platform

From a8401a2accebc5f41c4080aeffa216a7257f1539 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Thu, 16 Apr 2026 05:39:40 +0000
Subject: [PATCH 06/24] chore: remove debug logging from cloudbuild
 configuration

---
 cloudbuild.yaml | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index a449e9e..03f96ae 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -34,13 +34,6 @@ steps:
       - '-c'
       - |
         set -e
-        # ----------------------------
-        echo "=== INSIDE /workspace ==="
-        ls -la /workspace
-        
-        echo "=== INSIDE /evalbench ==="
-        ls -la /evalbench
-        # ----------------------------
         cd /evalbench
 
         export EVAL_GCP_PROJECT_ID=$PROJECT_ID

From e36074080c072e10c3196827be3a474a5e163339 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Thu, 16 Apr 2026 10:27:23 +0000
Subject: [PATCH 07/24] feat: migrate database password from environment
 variable to Secret Manager for improved security

---
 cloudbuild.yaml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 03f96ae..0288079 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -25,11 +25,13 @@ steps:
       - '--no-allow-unauthenticated'
       - '--port=8080'
       - '--timeout=300'
-      - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=[PASSWORD],CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
+      - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
+      - '--set-secrets=CLOUD_SQL_POSTGRES_PASSWORD=cloud-sql-postgres-password:latest'
 
   # --- STEP 3: Fully Integrated Evaluation to Persist Results ---
   - name: 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/eval_server:latest'
     entrypoint: 'bash'
+    secretEnv: ['DB_PASSWORD']
     args:
       - '-c'
       - |
@@ -87,3 +89,8 @@ steps:
         export PYTHONPATH=./evalbench:./evalbench/evalproto
         
         python3 evalbench/client/eval_client.py --experiment=/workspace/evals/run_config.yaml --endpoint=local || { echo "Client failed! Server logs:"; cat /evalbench/evalbench/server.log; exit 1; }
+
+availableSecrets:
+  secretManager:
+  - versionName: projects/$PROJECT_ID/secrets/cloud-sql-postgres-password/versions/latest
+    env: 'DB_PASSWORD'

From 73bb2025d50dc723079ab22763df922dd21f3408 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Thu, 16 Apr 2026 18:31:32 +0000
Subject: [PATCH 08/24] refactor: remove unnecessary comment from docker build
 step in cloudbuild.yaml

---
 cloudbuild.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 0288079..1373f52 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -6,7 +6,7 @@ steps:
       - 'build'
       - '-t'
       - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest'
-      - '.'   # The "." evaluates Dockerfile
+      - '.'
 
   - name: 'gcr.io/cloud-builders/docker'
     args:

From c88e6f7d518190e7a471fd3a9f1960f81ccee298 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Fri, 17 Apr 2026 05:30:09 +0000
Subject: [PATCH 09/24] chore: add Apache 2.0 license headers to configuration
 and build files

---
 cloudbuild.yaml                 | 14 ++++++++++++++
 evals/gemini_2.5_pro_model.yaml | 14 ++++++++++++++
 evals/model_config.yaml         | 14 ++++++++++++++
 evals/run_config.yaml           | 14 ++++++++++++++
 4 files changed, 56 insertions(+)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 1373f52..c94ad19 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 steps:
 
   # --- STEP 1: Build and Push Docker Image ---
diff --git a/evals/gemini_2.5_pro_model.yaml b/evals/gemini_2.5_pro_model.yaml
index 26eb0b7..7154ec3 100644
--- a/evals/gemini_2.5_pro_model.yaml
+++ b/evals/gemini_2.5_pro_model.yaml
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 generator: gcp_vertex_gemini
 vertex_model: gemini-2.5-pro
 base_prompt: ""
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index c63808a..5addfd5 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 gemini_cli_version: "@google/gemini-cli@0.26.0"
 generator: gemini_cli
 env:
diff --git a/evals/run_config.yaml b/evals/run_config.yaml
index d2ba9cd..8aaaabb 100644
--- a/evals/run_config.yaml
+++ b/evals/run_config.yaml
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 dataset_config: /workspace/evals/dataset.json
 dataset_format: gemini-cli-format
 

From d7168ea89bcc8d24caca685cc49599bfcccb809c Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Fri, 17 Apr 2026 07:50:08 +0000
Subject: [PATCH 10/24] chore: update cloudbuild configuration and model
 environment settings to use daily CI evaluation database

---
 cloudbuild.yaml         | 11 +++++++----
 evals/model_config.yaml |  6 +++---
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index c94ad19..ce091af 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+options:
+  logging: CLOUD_LOGGING_ONLY
+
 steps:
 
   # --- STEP 1: Build and Push Docker Image ---
@@ -39,11 +42,11 @@ steps:
       - '--no-allow-unauthenticated'
       - '--port=8080'
       - '--timeout=300'
-      - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
-      - '--set-secrets=CLOUD_SQL_POSTGRES_PASSWORD=cloud-sql-postgres-password:latest'
+      - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=daily-ci-evals-db,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
+      - '--set-secrets=CLOUD_SQL_POSTGRES_PASSWORD=daily-ci-evals-db-password:latest'
 
   # --- STEP 3: Fully Integrated Evaluation to Persist Results ---
-  - name: 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/eval_server:latest'
+  - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
     entrypoint: 'bash'
     secretEnv: ['DB_PASSWORD']
     args:
@@ -106,5 +109,5 @@ steps:
 
 availableSecrets:
   secretManager:
-  - versionName: projects/$PROJECT_ID/secrets/cloud-sql-postgres-password/versions/latest
+  - versionName: projects/$PROJECT_ID/secrets/daily-ci-evals-db-password/versions/latest
     env: 'DB_PASSWORD'
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index 5addfd5..8456724 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -15,7 +15,7 @@
 gemini_cli_version: "@google/gemini-cli@0.26.0"
 generator: gemini_cli
 env:
-  GOOGLE_CLOUD_PROJECT: "omkar-playground"
+  GOOGLE_CLOUD_PROJECT: "ext-test-cloud-sql-postgres"
   GOOGLE_CLOUD_LOCATION: "us-central1"
   GOOGLE_GENAI_USE_VERTEXAI: "true"
   GEMINI_API_MODEL: "gemini-2.5-pro"
@@ -23,8 +23,8 @@ setup:
   extensions:
     "/workspace":
       settings:
-        CLOUD_SQL_POSTGRES_PROJECT: "omkar-playground"
-        CLOUD_SQL_POSTGRES_INSTANCE: "omkar-demo-postgres-1"
+        CLOUD_SQL_POSTGRES_PROJECT: "ext-test-cloud-sql-postgres"
+        CLOUD_SQL_POSTGRES_INSTANCE: "daily-ci-evals-db"
         CLOUD_SQL_POSTGRES_REGION: "us-central1"
         CLOUD_SQL_POSTGRES_DATABASE: "postgres"
         CLOUD_SQL_POSTGRES_USER: "postgres"

From d38f261a92e66ac1575d86e31c3e4ed900fd5e6d Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Fri, 17 Apr 2026 10:15:44 +0000
Subject: [PATCH 11/24] chore: update evaluation project ID and gemini-cli
 dependency version

---
 evals/dataset.json      | 2 +-
 evals/model_config.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/evals/dataset.json b/evals/dataset.json
index 65c48fe..e8eadb4 100644
--- a/evals/dataset.json
+++ b/evals/dataset.json
@@ -3,7 +3,7 @@
     {
       "id": "cloud-sql-debug-01",
       "starting_prompt": "I need to debug the database.",
-      "conversation_plan": "Ask the agent to list instances in project omkar-playground. Once listed, ask it to check the CPU usage of the first instance. Finally, ask if that usage is considered high.",
+      "conversation_plan": "Ask the agent to list instances in project ext-test-cloud-sql-postgres. Once listed, ask it to check the CPU usage of the first instance. Finally, ask if that usage is considered high.",
       "expected_trajectory": [
         "list_instances"
       ],
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index 8456724..2e7e953 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-gemini_cli_version: "@google/gemini-cli@0.26.0"
+gemini_cli_version: "@google/gemini-cli@0.38.1"
 generator: gemini_cli
 env:
   GOOGLE_CLOUD_PROJECT: "ext-test-cloud-sql-postgres"

From aa6ab94479175b577ed9bad5c58b17351828fddf Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Fri, 17 Apr 2026 18:38:57 +0000
Subject: [PATCH 12/24] chore: configure service account for Cloud Run step and
 simplify evaluation server execution script

---
 cloudbuild.yaml | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index ce091af..6bdb7b8 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -44,6 +44,7 @@ steps:
       - '--timeout=300'
       - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=daily-ci-evals-db,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
       - '--set-secrets=CLOUD_SQL_POSTGRES_PASSWORD=daily-ci-evals-db-password:latest'
+      - '--service-account=evals-ci-runner@ext-test-cloud-sql-postgres.iam.gserviceaccount.com'
 
   # --- STEP 3: Fully Integrated Evaluation to Persist Results ---
   - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
@@ -58,11 +59,8 @@ steps:
         export EVAL_GCP_PROJECT_ID=$PROJECT_ID
         export EVAL_GCP_PROJECT_REGION=us-central1
 
-        echo "Compiling protobuf files..."
-        python3 -m grpc_tools.protoc --proto_path=evalbench/evalproto --python_out=evalbench/evalproto --grpc_python_out=evalbench/evalproto evalbench/evalproto/*.proto
-
         echo "Patching client to use insecure credentials..."
-        # sed -i 's/"localhost:50051"/"127.0.0.1:50051"/g' evalbench/client/eval_client.py
+
         sed -i 's/grpc.alts_channel_credentials()/None/g' evalbench/client/eval_client.py
         sed -i 's/grpc.aio.secure_channel(address, channel_creds)/grpc.aio.insecure_channel(address)/g' evalbench/client/eval_client.py
 
@@ -77,12 +75,8 @@ steps:
         export CLOUD_RUN=True
         export PORT=50051
 
-
-
         echo "Starting Evaluation Server in background..."
-        # NEW: Added </dev/null in case it was waiting for input
-        python3 -u ./eval_server.py --localhost </dev/null &
-        SERVER_PID=$$!
+        python3 -u ./eval_server.py --localhost </dev/null 2>&1 | tee server.log &
         
         echo "Waiting for port 50051 to open..."
         python3 -c "

From 6b1d22be087d51605c22fbfc3afffacab506d482 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Mon, 20 Apr 2026 06:01:02 +0000
Subject: [PATCH 13/24] feat: add GEMINI_MODEL configuration to
 model_config.yaml, otherwise it was attempting to use gemini-3-flash-preview
 and failing due to a 404 permissions error.

---
 evals/model_config.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index 2e7e953..6823dce 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -19,6 +19,7 @@ env:
   GOOGLE_CLOUD_LOCATION: "us-central1"
   GOOGLE_GENAI_USE_VERTEXAI: "true"
   GEMINI_API_MODEL: "gemini-2.5-pro"
+  GEMINI_MODEL: "gemini-2.5-pro"
 setup:
   extensions:
     "/workspace":

From 8b9462c24944b05a6145df15f0e8336acb2e1acb Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Mon, 20 Apr 2026 08:18:42 +0000
Subject: [PATCH 14/24] chore: standardize model evaluation configuration
 settings

---
 evals/model_config.yaml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index 6823dce..0d34647 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -16,14 +16,12 @@ gemini_cli_version: "@google/gemini-cli@0.38.1"
 generator: gemini_cli
 env:
   GOOGLE_CLOUD_PROJECT: "ext-test-cloud-sql-postgres"
-  GOOGLE_CLOUD_LOCATION: "us-central1"
+  GOOGLE_CLOUD_LOCATION: "global"
   GOOGLE_GENAI_USE_VERTEXAI: "true"
-  GEMINI_API_MODEL: "gemini-2.5-pro"
-  GEMINI_MODEL: "gemini-2.5-pro"
 setup:
   extensions:
     "/workspace":
-      settings:
+      settings: 
         CLOUD_SQL_POSTGRES_PROJECT: "ext-test-cloud-sql-postgres"
         CLOUD_SQL_POSTGRES_INSTANCE: "daily-ci-evals-db"
         CLOUD_SQL_POSTGRES_REGION: "us-central1"

From ae11bdfb21506df2c13a661b3cfa8cd743ab908e Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Mon, 20 Apr 2026 11:49:22 +0000
Subject: [PATCH 15/24] feat: inject CLOUD_SQL_POSTGRES_PASSWORD into build
 step and update evaluation dataset scenarios

---
 cloudbuild.yaml         |  5 ++++-
 evals/dataset.json      | 50 +++++++++++++++++++++++++++++++++++++----
 evals/model_config.yaml |  2 +-
 3 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 6bdb7b8..1e78174 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -49,7 +49,8 @@ steps:
   # --- STEP 3: Fully Integrated Evaluation to Persist Results ---
   - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
     entrypoint: 'bash'
-    secretEnv: ['DB_PASSWORD']
+    # Decrypts the secret from Secret Manager into the DB_PASSWORD environment variable
+    secretEnv: ['DB_PASSWORD'] 
     args:
       - '-c'
       - |
@@ -58,6 +59,8 @@ steps:
 
         export EVAL_GCP_PROJECT_ID=$PROJECT_ID
         export EVAL_GCP_PROJECT_REGION=us-central1
+        # Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills
+        export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD
 
         echo "Patching client to use insecure credentials..."
 
diff --git a/evals/dataset.json b/evals/dataset.json
index e8eadb4..a442e03 100644
--- a/evals/dataset.json
+++ b/evals/dataset.json
@@ -1,14 +1,56 @@
 {
   "scenarios": [
     {
-      "id": "cloud-sql-debug-01",
-      "starting_prompt": "I need to debug the database.",
-      "conversation_plan": "Ask the agent to list instances in project ext-test-cloud-sql-postgres. Once listed, ask it to check the CPU usage of the first instance. Finally, ask if that usage is considered high.",
+      "id": "cloud-sql-list-instances",
+      "starting_prompt": "Show me all the Cloud SQL instances in this project.",
+      "conversation_plan": "Ask the agent to list the Cloud SQL instances in the current project.",
       "expected_trajectory": [
         "list_instances"
       ],
       "kind": "tool",
-      "max_turns": 15
+      "max_turns": 5
+    },
+    {
+      "id": "cloud-sql-data-explore",
+      "starting_prompt": "What schemas and tables do we have in this database? Please list them.",
+      "conversation_plan": "Ask the agent to list the schemas in the database. Then ask to list the tables.",
+      "expected_trajectory": [
+        "list_schemas",
+        "list_tables"
+      ],
+      "kind": "tool",
+      "max_turns": 5
+    },
+    {
+      "id": "cloud-sql-perf-troubleshoot",
+      "starting_prompt": "The database is running slow. Are there any active queries running for more than 10 seconds or any locks?",
+      "conversation_plan": "Ask the agent to check for active queries running longer than 10 seconds. Then ask to check for locks.",
+      "expected_trajectory": [
+        "list_active_queries",
+        "list_locks"
+      ],
+      "kind": "tool",
+      "max_turns": 5
+    },
+    {
+      "id": "cloud-sql-metrics-cpu",
+      "starting_prompt": "Can you show me the CPU utilization for instance 'daily-ci-evals-db' in project 'ext-test-cloud-sql-postgres' for the last 5 minutes?",
+      "conversation_plan": "Ask the agent to query the CPU utilization metric for the specified instance and project using PromQL.",
+      "expected_trajectory": [
+        "get_system_metrics"
+      ],
+      "kind": "tool",
+      "max_turns": 4
+    },
+    {
+      "id": "cloud-sql-unused-indexes",
+      "starting_prompt": "Are there any unused indexes in the database that we can clean up?",
+      "conversation_plan": "Ask the agent to list unused indexes in the database.",
+      "expected_trajectory": [
+        "list_indexes"
+      ],
+      "kind": "tool",
+      "max_turns": 4
     }
   ]
 }
\ No newline at end of file
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index 0d34647..a461eb5 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -21,7 +21,7 @@ env:
 setup:
   extensions:
     "/workspace":
-      settings: 
+      settings:
         CLOUD_SQL_POSTGRES_PROJECT: "ext-test-cloud-sql-postgres"
         CLOUD_SQL_POSTGRES_INSTANCE: "daily-ci-evals-db"
         CLOUD_SQL_POSTGRES_REGION: "us-central1"

From 5a6d8650daed2e2ccd316828658870c37add1176 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 21 Apr 2026 06:07:50 +0000
Subject: [PATCH 16/24] remove Dockerfile, cloudbuild deployment steps, and
 configurations

---
 Dockerfile              | 24 ---------------------
 cloudbuild.yaml         | 46 ++++-------------------------------------
 evals/model_config.yaml |  8 -------
 3 files changed, 4 insertions(+), 74 deletions(-)
 delete mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index 47d3a31..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,24 +0,0 @@
-# --- Final Runtime Image ---
-# Using python:3.11 as the base image to support evaluations that require Python,
-# while still running the pre-compiled Go binary for the toolbox server.
-FROM python:3.11
-
-# Install necessary runtime certificates, standard C libraries, and curl
-RUN apt-get update && apt-get install -y ca-certificates libc6 curl && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /app
-
-# Dynamically fetch the latest version and download the binary
-RUN LATEST_VERSION=$(curl -s https://api.github.com/repos/googleapis/mcp-toolbox/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') && \
-    curl -L https://storage.googleapis.com/mcp-toolbox-for-databases/${LATEST_VERSION}/linux/amd64/toolbox -o /app/toolbox
-RUN chmod +x /app/toolbox
-
-# Copy the extension's skills and configuration into the container
-COPY skills/ ./skills/
-COPY gemini-extension.json .
-
-# Add required tools.yaml placeholder to satisfy binary startup checks
-RUN touch tools.yaml
-
-# Expose HTTP API and UI endpoints to successfully pass Cloud Run health checks
-ENTRYPOINT ["/app/toolbox", "--prebuilt", "cloud-sql-postgres", "--address=0.0.0.0", "--port=8080", "--enable-api", "--ui"]
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 1e78174..7710bea 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -17,37 +17,8 @@ options:
 
 steps:
 
-  # --- STEP 1: Build and Push Docker Image ---
-  - name: 'gcr.io/cloud-builders/docker'
-    args:
-      - 'build'
-      - '-t'
-      - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest'
-      - '.'
-
-  - name: 'gcr.io/cloud-builders/docker'
-    args:
-      - 'push'
-      - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest'
-
-  # --- STEP 2: Deploy to Cloud Run ---
-  - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
-    entrypoint: gcloud
-    args:
-      - 'run'
-      - 'deploy'
-      - 'cloud-sql-postgresql-server'
-      - '--image=us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest'
-      - '--region=us-central1'
-      - '--no-allow-unauthenticated'
-      - '--port=8080'
-      - '--timeout=300'
-      - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=daily-ci-evals-db,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
-      - '--set-secrets=CLOUD_SQL_POSTGRES_PASSWORD=daily-ci-evals-db-password:latest'
-      - '--service-account=evals-ci-runner@ext-test-cloud-sql-postgres.iam.gserviceaccount.com'
-
-  # --- STEP 3: Fully Integrated Evaluation to Persist Results ---
-  - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
+  # --- Evaluation Step ---
+  - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:89aa9fefd4b247610a95ef0896ba55d468563f50'
     entrypoint: 'bash'
     # Decrypts the secret from Secret Manager into the DB_PASSWORD environment variable
     secretEnv: ['DB_PASSWORD'] 
@@ -61,17 +32,8 @@ steps:
         export EVAL_GCP_PROJECT_REGION=us-central1
         # Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills
         export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD
-
-        echo "Patching client to use insecure credentials..."
-
-        sed -i 's/grpc.alts_channel_credentials()/None/g' evalbench/client/eval_client.py
-        sed -i 's/grpc.aio.secure_channel(address, channel_creds)/grpc.aio.insecure_channel(address)/g' evalbench/client/eval_client.py
-
-        echo "Patching server to listen on all IPv4 interfaces (0.0.0.0)..."
-        sed -i 's/"\[::\]:%s"/"0.0.0.0:%s"/g' /evalbench/evalbench/eval_server.py
-        echo "Checking bind success in server (writing to stderr)..."
-        sed -i 's|server.add_insecure_port("0.0.0.0:%s" % PORT)|bound_port = server.add_insecure_port("0.0.0.0:%s" % PORT)\n        import sys\n        sys.stderr.write(f"BOUND_PORT: {bound_port}\\n")\n        if bound_port == 0: raise RuntimeError("Failed to bind to port!")|' /evalbench/evalbench/eval_server.py
-
+        export EVALBENCH_INSECURE=True
+        export EVALBENCH_HOST=0.0.0.0
         cd evalbench
         export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
         export PYTHONPATH=./evalproto:.
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index a461eb5..6b2bfd2 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -29,11 +29,3 @@ setup:
         CLOUD_SQL_POSTGRES_USER: "postgres"
         CLOUD_SQL_POSTGRES_PASSWORD: ${CLOUD_SQL_POSTGRES_PASSWORD}
         CLOUD_SQL_POSTGRES_IP_TYPE: "PUBLIC"
-
-  # mcp_servers:
-  #   "cloud-sql-postgresql":
-  #     httpUrl: "CLOUD_RUN_URL_PLACEHOLDER"
-  #     authProviderType: google_credentials
-  #     oauth:
-  #       scopes:
-  #       - https://www.googleapis.com/auth/cloud-platform

From 35f1c787c9266bb7cab747a2ac3a57b998eb3caa Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 21 Apr 2026 10:02:34 +0000
Subject: [PATCH 17/24] feat: integrate environment substitution, add port
 polling utility, and update Cloud Build symlinks

---
 cloudbuild.yaml         | 28 ++++++++++------------------
 evals/model_config.yaml |  5 +++--
 evals/substitute_env.py | 17 +++++++++++++++++
 evals/wait_for_port.py  | 20 ++++++++++++++++++++
 4 files changed, 50 insertions(+), 20 deletions(-)
 create mode 100644 evals/substitute_env.py
 create mode 100644 evals/wait_for_port.py

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 7710bea..73ccaa1 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -26,12 +26,18 @@ steps:
       - '-c'
       - |
         set -e
+        # Workaround for evalbench bug: settings are only applied if path basename matches extension ID
+        ln -s /workspace /workspace/cloud-sql-postgresql
         cd /evalbench
 
         export EVAL_GCP_PROJECT_ID=$PROJECT_ID
         export EVAL_GCP_PROJECT_REGION=us-central1
         # Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills
         export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD
+
+        # Substitute environment variables in model_config.yaml
+        python3 /workspace/evals/substitute_env.py
+
         export EVALBENCH_INSECURE=True
         export EVALBENCH_HOST=0.0.0.0
         cd evalbench
@@ -42,28 +48,14 @@ steps:
 
         echo "Starting Evaluation Server in background..."
         python3 -u ./eval_server.py --localhost </dev/null 2>&1 | tee server.log &
-        
+
         echo "Waiting for port 50051 to open..."
-        python3 -c "
-        import socket
-        import time
-        for i in range(20):
-            try:
-                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-                s.connect(('127.0.0.1', 50051))
-                print('Port is open!')
-                exit(0)
-            except Exception as e:
-                print(f'Port not open yet: {e}')
-                time.sleep(1)
-        print('Port failed to open')
-        exit(1)
-        " || { echo "Server failed to bind port. Check logs above."; exit 1; }
-        
+        python3 /workspace/evals/wait_for_port.py || { echo "Server failed to bind port."; exit 1; }
+
         echo "Server is running. Launching Evaluation Client..."
         cd /evalbench
         export PYTHONPATH=./evalbench:./evalbench/evalproto
-        
+
         python3 evalbench/client/eval_client.py --experiment=/workspace/evals/run_config.yaml --endpoint=local || { echo "Client failed! Server logs:"; cat /evalbench/evalbench/server.log; exit 1; }
 
 availableSecrets:
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index 6b2bfd2..7dba6a5 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -20,12 +20,13 @@ env:
   GOOGLE_GENAI_USE_VERTEXAI: "true"
 setup:
   extensions:
-    "/workspace":
+    # Points to the symlink created in cloudbuild.yaml to match the extension ID
+    "/workspace/cloud-sql-postgresql":
       settings:
         CLOUD_SQL_POSTGRES_PROJECT: "ext-test-cloud-sql-postgres"
         CLOUD_SQL_POSTGRES_INSTANCE: "daily-ci-evals-db"
         CLOUD_SQL_POSTGRES_REGION: "us-central1"
         CLOUD_SQL_POSTGRES_DATABASE: "postgres"
         CLOUD_SQL_POSTGRES_USER: "postgres"
-        CLOUD_SQL_POSTGRES_PASSWORD: ${CLOUD_SQL_POSTGRES_PASSWORD}
+        CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
         CLOUD_SQL_POSTGRES_IP_TYPE: "PUBLIC"
diff --git a/evals/substitute_env.py b/evals/substitute_env.py
new file mode 100644
index 0000000..3ef2295
--- /dev/null
+++ b/evals/substitute_env.py
@@ -0,0 +1,17 @@
+import os
+import re
+
+def main():
+    yaml_path = '/workspace/evals/model_config.yaml'
+    if os.path.exists(yaml_path):
+        with open(yaml_path, 'r') as f:
+            content = f.read()
+        content = re.sub(r'\${(\w+)}', lambda m: os.environ.get(m.group(1), m.group(0)), content)
+        with open(yaml_path, 'w') as f:
+            f.write(content)
+        print(f"Successfully substituted environment variables in {yaml_path}")
+    else:
+        print(f"File not found: {yaml_path}")
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/evals/wait_for_port.py b/evals/wait_for_port.py
new file mode 100644
index 0000000..8b11c44
--- /dev/null
+++ b/evals/wait_for_port.py
@@ -0,0 +1,20 @@
+import socket
+import time
+import sys
+
+def main():
+    for i in range(20):
+        try:
+            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            s.settimeout(1)
+            s.connect(('127.0.0.1', 50051))
+            print('Port 50051 is open!')
+            sys.exit(0)
+        except Exception:
+            print('Port not open yet, retrying...')
+            time.sleep(1)
+    print('Port failed to open')
+    sys.exit(1)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file

From d4f796a758d286c513016dd0e1ea1633f6788d2a Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 21 Apr 2026 11:33:21 +0000
Subject: [PATCH 18/24] chore: update configuration settings; add the
 extension_id: cloud-sql-postgresql key to identify which evaluation results
 belong to this extension

---
 evals/run_config.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/evals/run_config.yaml b/evals/run_config.yaml
index 8aaaabb..6a2a657 100644
--- a/evals/run_config.yaml
+++ b/evals/run_config.yaml
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+extension_id: cloud-sql-postgresql
+
 dataset_config: /workspace/evals/dataset.json
 dataset_format: gemini-cli-format
 
@@ -26,4 +28,5 @@ scorers:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
 
 reporting:
-  bigquery: {}
+  bigquery:
+    gcp_project_id: cloud-db-nl2sql

From 1722276c7adcd1487caf62a0603803fbc716f72e Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 21 Apr 2026 14:41:52 +0000
Subject: [PATCH 19/24] feat: update evaluation scenarios and add performance
 and qualitative scorers to run configuration

---
 evals/dataset.json    | 72 +++++++++++++++++++++++++++----------------
 evals/run_config.yaml | 13 ++++++++
 2 files changed, 58 insertions(+), 27 deletions(-)

diff --git a/evals/dataset.json b/evals/dataset.json
index a442e03..6716b0c 100644
--- a/evals/dataset.json
+++ b/evals/dataset.json
@@ -1,55 +1,73 @@
 {
   "scenarios": [
     {
-      "id": "cloud-sql-list-instances",
-      "starting_prompt": "Show me all the Cloud SQL instances in this project.",
-      "conversation_plan": "Ask the agent to list the Cloud SQL instances in the current project.",
+      "id": "cloud-sql-debug-instance",
+      "starting_prompt": "Check on my databases in project ext-test-cloud-sql-postgres.",
+      "conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if 'daily-ci-evals-db' exists, get its details and validate it is RUNNABLE.",
       "expected_trajectory": [
-        "list_instances"
+        "list_instances",
+        "get_instance"
       ],
-      "kind": "tool",
-      "max_turns": 5
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
+      },
+      "kind": "tools",
+      "max_turns": 4
     },
     {
-      "id": "cloud-sql-data-explore",
-      "starting_prompt": "What schemas and tables do we have in this database? Please list them.",
-      "conversation_plan": "Ask the agent to list the schemas in the database. Then ask to list the tables.",
+      "id": "cloud-sql-schema-tables-explore",
+      "starting_prompt": "I want to understand the structure of my database.",
+      "conversation_plan": "First, ask the agent to list the schemas in the database. After the agent provides the schemas, ask it to list the tables specifically for the 'public' schema.",
       "expected_trajectory": [
         "list_schemas",
         "list_tables"
       ],
-      "kind": "tool",
-      "max_turns": 5
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
+      },
+      "kind": "tools",
+      "max_turns": 6
     },
     {
-      "id": "cloud-sql-perf-troubleshoot",
-      "starting_prompt": "The database is running slow. Are there any active queries running for more than 10 seconds or any locks?",
-      "conversation_plan": "Ask the agent to check for active queries running longer than 10 seconds. Then ask to check for locks.",
+      "id": "cloud-sql-performance-check",
+      "starting_prompt": "Our database performance seems degraded.",
+      "conversation_plan": "Start by asking the agent to check for any active queries that are running for a long time (e.g., more than 10 seconds). After the agent responds, follow up by asking if there are any database locks that might be causing issues.",
       "expected_trajectory": [
         "list_active_queries",
         "list_locks"
       ],
-      "kind": "tool",
-      "max_turns": 5
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
+      },
+      "kind": "tools",
+      "max_turns": 6
     },
     {
-      "id": "cloud-sql-metrics-cpu",
-      "starting_prompt": "Can you show me the CPU utilization for instance 'daily-ci-evals-db' in project 'ext-test-cloud-sql-postgres' for the last 5 minutes?",
-      "conversation_plan": "Ask the agent to query the CPU utilization metric for the specified instance and project using PromQL.",
+      "id": "cloud-sql-metrics-cpu-investigation",
+      "starting_prompt": "I'm worried about the database load for daily-ci-evals-db.",
+      "conversation_plan": "First, ask the agent to check the CPU utilization for the instance 'daily-ci-evals-db' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
       "expected_trajectory": [
-        "get_system_metrics"
+        "get_system_metrics",
+        "list_database_stats"
       ],
-      "kind": "tool",
-      "max_turns": 4
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
+      },
+      "kind": "tools",
+      "max_turns": 6
     },
     {
-      "id": "cloud-sql-unused-indexes",
-      "starting_prompt": "Are there any unused indexes in the database that we can clean up?",
-      "conversation_plan": "Ask the agent to list unused indexes in the database.",
+      "id": "cloud-sql-instance-not-found",
+      "starting_prompt": "Get details for the instance 'missing-db-123'.",
+      "conversation_plan": "The user asks for details of an instance named 'missing-db-123' that doesn't exist. The agent should try to get it, fail, and inform the user. The user will then ask to list instances to find the correct name.",
       "expected_trajectory": [
-        "list_indexes"
+        "get_instance",
+        "list_instances"
       ],
-      "kind": "tool",
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
+      },
+      "kind": "tools",
       "max_turns": 4
     }
   ]
diff --git a/evals/run_config.yaml b/evals/run_config.yaml
index 6a2a657..8f1aedf 100644
--- a/evals/run_config.yaml
+++ b/evals/run_config.yaml
@@ -23,9 +23,22 @@ model_config: /workspace/evals/model_config.yaml
 simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
 
 scorers:
+  # Structural
   trajectory_matcher: {}
+  
+  # Qualitative (Judge-based)
   goal_completion:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  behavioral_metrics:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  parameter_analysis:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+
+  # Performance
+  turn_count: {}
+  end_to_end_latency: {}
+  tool_call_latency: {}
+  token_consumption: {}
 
 reporting:
   bigquery:

From 602da9fd2d0e11d6eaf377bb980ef9ecfdd71968 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Wed, 22 Apr 2026 06:36:10 +0000
Subject: [PATCH 20/24] refactor: parameterize evaluation configuration
 variables and remove unused scorers

---
 cloudbuild.yaml         |  8 ++++++++
 evals/model_config.yaml | 15 ++++++++-------
 evals/run_config.yaml   |  5 -----
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 73ccaa1..e542bca 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -32,6 +32,14 @@ steps:
 
         export EVAL_GCP_PROJECT_ID=$PROJECT_ID
         export EVAL_GCP_PROJECT_REGION=us-central1
+        export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
+        export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID
+        export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE
+        export CLOUD_SQL_POSTGRES_REGION=$_CLOUD_SQL_REGION
+        export CLOUD_SQL_POSTGRES_DATABASE=$_CLOUD_SQL_DATABASE
+        export CLOUD_SQL_POSTGRES_USER=$_CLOUD_SQL_USER
+        export CLOUD_SQL_POSTGRES_IP_TYPE=$_CLOUD_SQL_IP_TYPE
+
         # Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills
         export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD
 
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index 7dba6a5..8460a7e 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -15,7 +15,8 @@
 gemini_cli_version: "@google/gemini-cli@0.38.1"
 generator: gemini_cli
 env:
-  GOOGLE_CLOUD_PROJECT: "ext-test-cloud-sql-postgres"
+  GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
+
   GOOGLE_CLOUD_LOCATION: "global"
   GOOGLE_GENAI_USE_VERTEXAI: "true"
 setup:
@@ -23,10 +24,10 @@ setup:
     # Points to the symlink created in cloudbuild.yaml to match the extension ID
     "/workspace/cloud-sql-postgresql":
       settings:
-        CLOUD_SQL_POSTGRES_PROJECT: "ext-test-cloud-sql-postgres"
-        CLOUD_SQL_POSTGRES_INSTANCE: "daily-ci-evals-db"
-        CLOUD_SQL_POSTGRES_REGION: "us-central1"
-        CLOUD_SQL_POSTGRES_DATABASE: "postgres"
-        CLOUD_SQL_POSTGRES_USER: "postgres"
+        CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
+        CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
+        CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
+        CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
+        CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
         CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
-        CLOUD_SQL_POSTGRES_IP_TYPE: "PUBLIC"
+        CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"
diff --git a/evals/run_config.yaml b/evals/run_config.yaml
index 8f1aedf..b83b7e6 100644
--- a/evals/run_config.yaml
+++ b/evals/run_config.yaml
@@ -23,16 +23,11 @@ model_config: /workspace/evals/model_config.yaml
 simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
 
 scorers:
-  # Structural
-  trajectory_matcher: {}
-  
   # Qualitative (Judge-based)
   goal_completion:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
   behavioral_metrics:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
-  parameter_analysis:
-    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
 
   # Performance
   turn_count: {}

From 65a84d9a0e386cad70a706444a0bf3f70dc5646d Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Wed, 22 Apr 2026 07:08:22 +0000
Subject: [PATCH 21/24] refactor: switch to standalone evaluation execution

---
 cloudbuild.yaml | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index e542bca..ad967fb 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -46,25 +46,13 @@ steps:
         # Substitute environment variables in model_config.yaml
         python3 /workspace/evals/substitute_env.py
 
-        export EVALBENCH_INSECURE=True
-        export EVALBENCH_HOST=0.0.0.0
-        cd evalbench
-        export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
-        export PYTHONPATH=./evalproto:.
-        export CLOUD_RUN=True
-        export PORT=50051
-
-        echo "Starting Evaluation Server in background..."
-        python3 -u ./eval_server.py --localhost </dev/null 2>&1 | tee server.log &
-
-        echo "Waiting for port 50051 to open..."
-        python3 /workspace/evals/wait_for_port.py || { echo "Server failed to bind port."; exit 1; }
-
-        echo "Server is running. Launching Evaluation Client..."
         cd /evalbench
         export PYTHONPATH=./evalbench:./evalbench/evalproto
+        export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+        echo "Launching Standalone Evaluation..."
+        python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml
 
-        python3 evalbench/client/eval_client.py --experiment=/workspace/evals/run_config.yaml --endpoint=local || { echo "Client failed! Server logs:"; cat /evalbench/evalbench/server.log; exit 1; }
 
 availableSecrets:
   secretManager:

From a130f3de316cc2b7f547a7c5b50e9442b465db74 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Wed, 22 Apr 2026 07:12:29 +0000
Subject: [PATCH 22/24] refactor: switch eval_server image to the :latest tag

---
 cloudbuild.yaml         | 2 +-
 evals/model_config.yaml | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index ad967fb..402e43c 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -18,7 +18,7 @@ options:
 steps:
 
   # --- Evaluation Step ---
-  - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:89aa9fefd4b247610a95ef0896ba55d468563f50'
+  - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
     entrypoint: 'bash'
     # Decrypts the secret from Secret Manager into the DB_PASSWORD environment variable
     secretEnv: ['DB_PASSWORD'] 
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index 8460a7e..485c758 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -16,7 +16,6 @@ gemini_cli_version: "@google/gemini-cli@0.38.1"
 generator: gemini_cli
 env:
   GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
-
   GOOGLE_CLOUD_LOCATION: "global"
   GOOGLE_GENAI_USE_VERTEXAI: "true"
 setup:

From 42ce176a6d8dec763284b9bf8f97221b18c92adf Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Wed, 22 Apr 2026 07:15:30 +0000
Subject: [PATCH 23/24] refactor: remove unused port check utility

---
 evals/wait_for_port.py | 20 --------------------
 1 file changed, 20 deletions(-)
 delete mode 100644 evals/wait_for_port.py

diff --git a/evals/wait_for_port.py b/evals/wait_for_port.py
deleted file mode 100644
index 8b11c44..0000000
--- a/evals/wait_for_port.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import socket
-import time
-import sys
-
-def main():
-    for i in range(20):
-        try:
-            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-            s.settimeout(1)
-            s.connect(('127.0.0.1', 50051))
-            print('Port 50051 is open!')
-            sys.exit(0)
-        except Exception:
-            print('Port not open yet, retrying...')
-            time.sleep(1)
-    print('Port failed to open')
-    sys.exit(1)
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file

From 941693b66c554a4f8dae08e11c38521dddbdf1a8 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Wed, 22 Apr 2026 11:31:16 +0000
Subject: [PATCH 24/24] refactor: standardize max_turns to 3 across all
 entries in evals/dataset.json and remove the redundant
 cloud-sql-instance-not-found entry

---
 evals/dataset.json | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/evals/dataset.json b/evals/dataset.json
index 6716b0c..a42bbae 100644
--- a/evals/dataset.json
+++ b/evals/dataset.json
@@ -12,7 +12,7 @@
         "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
       },
       "kind": "tools",
-      "max_turns": 4
+      "max_turns": 3
     },
     {
       "id": "cloud-sql-schema-tables-explore",
@@ -26,7 +26,7 @@
         "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
       },
       "kind": "tools",
-      "max_turns": 6
+      "max_turns": 3
     },
     {
       "id": "cloud-sql-performance-check",
@@ -40,7 +40,7 @@
         "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
       },
       "kind": "tools",
-      "max_turns": 6
+      "max_turns": 3
     },
     {
       "id": "cloud-sql-metrics-cpu-investigation",
@@ -54,21 +54,7 @@
         "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
       },
       "kind": "tools",
-      "max_turns": 6
-    },
-    {
-      "id": "cloud-sql-instance-not-found",
-      "starting_prompt": "Get details for the instance 'missing-db-123'.",
-      "conversation_plan": "The user asks for details of an instance named 'missing-db-123' that doesn't exist. The agent should try to get it, fail, and inform the user. The user will then ask to list instances to find the correct name.",
-      "expected_trajectory": [
-        "get_instance",
-        "list_instances"
-      ],
-      "env": {
-        "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
-      },
-      "kind": "tools",
-      "max_turns": 4
+      "max_turns": 3
     }
   ]
 }
\ No newline at end of file