From b6305c276d63045d0c8db98f3ac9b32b8590da47 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Fri, 10 Apr 2026 08:43:38 +0000 Subject: [PATCH 01/24] feat: add Dockerfile and Cloud Build pipeline for automated Evalbench testing of Cloud SQL PostgreSQL extension --- Dockerfile | 36 +++++++++++++++++++++++ cloudbuild.yaml | 64 +++++++++++++++++++++++++++++++++++++++++ evals/dataset.json | 15 ++++++++++ evals/model_config.yaml | 18 ++++++++++++ evals/run_config.yaml | 12 ++++++++ 5 files changed, 145 insertions(+) create mode 100644 Dockerfile create mode 100644 cloudbuild.yaml create mode 100644 evals/dataset.json create mode 100644 evals/model_config.yaml create mode 100644 evals/run_config.yaml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..39e5b3c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,36 @@ +# --- Stage 1: Build the binary from source (Latest Nightly) --- +FROM golang:1.25 AS builder + +WORKDIR /build + +# Clone the official genai-toolbox source code (always latest main branch) +RUN git clone --depth 1 https://github.com/googleapis/genai-toolbox.git . + +# Compile the binary with CGO ENABLED to support all upstream database drivers (Oracle, etc.) +RUN CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o toolbox . + +# --- Stage 2: Final Lightweight Runtime Image --- +# Using the exact same image (golang:1.25) for runtime to perfectly match GLIBC versions +FROM golang:1.25 + + +# Install necessary runtime certificates and standard C libraries for CGO binary +RUN apt-get update && apt-get install -y ca-certificates libc6 && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy the freshly compiled binary from the builder stage +COPY --from=builder /build/toolbox /app/toolbox +RUN chmod +x /app/toolbox + +# Copy the extension's skills and configuration into the container +COPY skills/ ./skills/ +COPY gemini-extension.json . + +# Add required tools.yaml placeholder to satisfy binary startup checks +RUN touch tools.yaml + +# Expose HTTP API and UI endpoints to successfully pass Cloud Run health checks +ENTRYPOINT ["/app/toolbox", "--prebuilt", "cloud-sql-postgres", "--address=0.0.0.0", "--port=8080", "--enable-api", "--ui"] + + diff --git a/cloudbuild.yaml b/cloudbuild.yaml new file mode 100644 index 0000000..44bb5ff --- /dev/null +++ b/cloudbuild.yaml @@ -0,0 +1,64 @@ +steps: + + # --- STEP 1: Build and Push Docker Image --- + - name: 'gcr.io/cloud-builders/docker' + args: + - 'build' + - '-t' + - 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/cloud-sql-postgresql:latest' + - '.' + + - name: 'gcr.io/cloud-builders/docker' + args: + - 'push' + - 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/cloud-sql-postgresql:latest' + + # --- STEP 2: Deploy to Cloud Run --- + - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' + entrypoint: gcloud + args: + - 'run' + - 'deploy' + - 'cloud-sql-postgresql-server' + - '--image=us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/cloud-sql-postgresql:latest' + - '--region=us-central1' + - '--allow-unauthenticated' + - '--port=8080' + - '--timeout=300' + - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=omkar-playground,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=7`[EP^`U"_frcD;q,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC' + + # --- STEP 3: Run Eval Server in Background --- + - name: 'gcr.io/cloud-builders/docker' + args: + - 'run' + - '-d' + - '--network=cloudbuild' + - '--name=eval_server' + - 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/eval_server:latest' + + # --- STEP 4: Run Evalbench Evaluation Client --- + # - name: 'python:3.10' + # entrypoint: 'bash' + # args: + # - '-c' + # - | + # # Clone Evalbench + # git clone https://github.com/GoogleCloudPlatform/evalbench.git + # cd evalbench + + # # Install Dependencies + # pip install -r requirements.txt + + # # Setup Environment Variables + # export EVAL_GCP_PROJECT_ID=omkar-playground + # export EVAL_GCP_PROJECT_REGION=us-central1 + # export EVAL_CONFIG=../evals/run_config.yaml + + # # Compile required protobuf modules and Run Evaluation Client against the eval_server container + # make proto + # ./run_client.sh --endpoint=eval_server:50051 + + +options: + env: + - 'DOCKER_BUILDKIT=1' diff --git a/evals/dataset.json b/evals/dataset.json new file mode 100644 index 0000000..42af644 --- /dev/null +++ b/evals/dataset.json @@ -0,0 +1,15 @@ +{ + "scenarios": [ + { + "id": "cloud-sql-debug-01", + "starting_prompt": "I need to debug the database.", + "conversation_plan": "Ask the agent to list instances in project omkar-playground. Once listed, ask it to check the CPU usage of the first instance. Finally, ask if that usage is considered high.", + "expected_trajectory": [ + "list_instances", + "get_metrics" + ], + "kind": "tool", + "max_turns": 15 + } + ] +} \ No newline at end of file diff --git a/evals/model_config.yaml b/evals/model_config.yaml new file mode 100644 index 0000000..dbb2dc5 --- /dev/null +++ b/evals/model_config.yaml @@ -0,0 +1,18 @@ +gemini_cli_version: "@google/gemini-cli@0.26.0" +generator: gemini_cli +env: + GOOGLE_CLOUD_PROJECT: "omkar-playground" + GOOGLE_CLOUD_LOCATION: "us-central1" + GOOGLE_GENAI_USE_VERTEXAI: "true" + GEMINI_API_MODEL: "gemini-2.5-pro" +setup: + extensions: + "https://github.com/gemini-cli-extensions/cloud-sql-postgresql": + settings: + CLOUD_SQL_POSTGRES_PROJECT: "omkar-playground" + CLOUD_SQL_POSTGRES_INSTANCE: "omkar-demo-postgres-1" + CLOUD_SQL_POSTGRES_REGION: "us-central1" + CLOUD_SQL_POSTGRES_DATABASE: "postgres" + CLOUD_SQL_POSTGRES_USER: "postgres" + CLOUD_SQL_POSTGRES_PASSWORD: '7`[EP^`U"_frcD;q' + CLOUD_SQL_POSTGRES_IP_TYPE: "PUBLIC" diff --git a/evals/run_config.yaml b/evals/run_config.yaml new file mode 100644 index 0000000..a631de9 --- /dev/null +++ b/evals/run_config.yaml @@ -0,0 +1,12 @@ +dataset_config: /workspace/evals/dataset.json +dataset_format: gemini-cli-format + +orchestrator: geminicli +model_config: /workspace/evals/model_config.yaml +# You can reference default simulated user models provided by the evalbench repo: +simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + +scorers: + trajectory_matcher: {} + goal_completion: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml From 01bc25bebc1245fc2a0b8f8f82c77a39be1d1c00 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Mon, 13 Apr 2026 10:22:54 +0000 Subject: [PATCH 02/24] feat: integrate full evaluation pipeline in cloudbuild and update model configurations --- cloudbuild.yaml | 96 ++++++++++++++++++++++----------- evals/dataset.json | 3 +- evals/gemini_2.5_pro_model.yaml | 4 ++ evals/model_config.yaml | 2 +- evals/run_config.yaml | 4 +- 5 files changed, 73 insertions(+), 36 deletions(-) create mode 100644 evals/gemini_2.5_pro_model.yaml diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 44bb5ff..90bbcd8 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -25,40 +25,74 @@ steps: - '--allow-unauthenticated' - '--port=8080' - '--timeout=300' - - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=omkar-playground,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=7`[EP^`U"_frcD;q,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC' + - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=omkar-playground,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=[PASSWORD],CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC' - # --- STEP 3: Run Eval Server in Background --- - - name: 'gcr.io/cloud-builders/docker' + # --- STEP 3: Fully Integrated Evaluation to Persist Results --- + - name: 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/eval_server:latest' + entrypoint: 'bash' args: - - 'run' - - '-d' - - '--network=cloudbuild' - - '--name=eval_server' - - 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/eval_server:latest' + - '-c' + - | + set -e + cd /evalbench + + export EVAL_GCP_PROJECT_ID=omkar-playground + export EVAL_GCP_PROJECT_REGION=us-central1 + + echo "Compiling protobuf files..." + python3 -m grpc_tools.protoc --proto_path=evalbench/evalproto --python_out=evalbench/evalproto --grpc_python_out=evalbench/evalproto evalbench/evalproto/*.proto + + echo "Patching client to use insecure credentials..." + # sed -i 's/"localhost:50051"/"127.0.0.1:50051"/g' evalbench/client/eval_client.py + sed -i 's/grpc.alts_channel_credentials()/None/g' evalbench/client/eval_client.py + sed -i 's/grpc.aio.secure_channel(address, channel_creds)/grpc.aio.insecure_channel(address)/g' evalbench/client/eval_client.py + + echo "Patching server to listen on all IPv4 interfaces (0.0.0.0)..." + sed -i 's/"\[::\]:%s"/"0.0.0.0:%s"/g' /evalbench/evalbench/eval_server.py + echo "Checking bind success in server (writing to stderr)..." + sed -i 's|server.add_insecure_port("0.0.0.0:%s" % PORT)|bound_port = server.add_insecure_port("0.0.0.0:%s" % PORT)\n import sys\n sys.stderr.write(f"BOUND_PORT: {bound_port}\\n")\n if bound_port == 0: raise RuntimeError("Failed to bind to port!")|' /evalbench/evalbench/eval_server.py - # --- STEP 4: Run Evalbench Evaluation Client --- - # - name: 'python:3.10' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # # Clone Evalbench - # git clone https://github.com/GoogleCloudPlatform/evalbench.git - # cd evalbench + echo "Patching eval_service.py to fix TypeError in get_reporters..." + sed -i 's|reporters = get_reporters(config.get("reporting"), job_id, run_time)|reporters = get_reporters(config.get("reporting") or {}, job_id, run_time)|' /evalbench/evalbench/eval_service.py + + echo "Patching util/session.py to make ADK import lazy..." + sed -i 's|from google.adk.sessions import VertexAiSessionService||' /evalbench/evalbench/util/session.py + sed -i 's| def __init__(self, config):| def __init__(self, config):\n from google.adk.sessions import VertexAiSessionService|' /evalbench/evalbench/util/session.py + echo "Patching databases/util.py to make SecretManagerClient lazy..." + sed -i 's|CLIENT = secretmanager_v1.SecretManagerServiceClient()|CLIENT = None\ndef get_client():\n global CLIENT\n if CLIENT is None:\n CLIENT = secretmanager_v1.SecretManagerServiceClient()\n return CLIENT|' /evalbench/evalbench/databases/util.py || echo "Failed to patch databases/util.py" + sed -i 's|CLIENT.access_secret_version|get_client().access_secret_version|' /evalbench/evalbench/databases/util.py || echo "Failed to patch databases/util.py usage" + cd evalbench + export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + export PYTHONPATH=./evalproto:. + export CLOUD_RUN=True + export PORT=50051 + + + + echo "Starting Evaluation Server in background..." + # NEW: Added Date: Mon, 13 Apr 2026 11:32:38 +0000 Subject: [PATCH 03/24] chore: update Dockerfile to fetch binary, and parameterize Cloud Build variables with $PROJECT_ID --- Dockerfile | 30 +++++++++--------------------- cloudbuild.yaml | 12 ++++++------ 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/Dockerfile b/Dockerfile index 39e5b3c..47d3a31 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,26 +1,16 @@ -# --- Stage 1: Build the binary from source (Latest Nightly) --- -FROM golang:1.25 AS builder +# --- Final Runtime Image --- +# Using python:3.11 as the base image to support evaluations that require Python, +# while still running the pre-compiled Go binary for the toolbox server. +FROM python:3.11 -WORKDIR /build - -# Clone the official genai-toolbox source code (always latest main branch) -RUN git clone --depth 1 https://github.com/googleapis/genai-toolbox.git . - -# Compile the binary with CGO ENABLED to support all upstream database drivers (Oracle, etc.) -RUN CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o toolbox . - -# --- Stage 2: Final Lightweight Runtime Image --- -# Using the exact same image (golang:1.25) for runtime to perfectly match GLIBC versions -FROM golang:1.25 - - -# Install necessary runtime certificates and standard C libraries for CGO binary -RUN apt-get update && apt-get install -y ca-certificates libc6 && rm -rf /var/lib/apt/lists/* +# Install necessary runtime certificates, standard C libraries, and curl +RUN apt-get update && apt-get install -y ca-certificates libc6 curl && rm -rf /var/lib/apt/lists/* WORKDIR /app -# Copy the freshly compiled binary from the builder stage -COPY --from=builder /build/toolbox /app/toolbox +# Dynamically fetch the latest version and download the binary +RUN LATEST_VERSION=$(curl -s https://api.github.com/repos/googleapis/mcp-toolbox/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') && \ + curl -L https://storage.googleapis.com/mcp-toolbox-for-databases/${LATEST_VERSION}/linux/amd64/toolbox -o /app/toolbox RUN chmod +x /app/toolbox # Copy the extension's skills and configuration into the container @@ -32,5 +22,3 @@ RUN touch tools.yaml # Expose HTTP API and UI endpoints to successfully pass Cloud Run health checks ENTRYPOINT ["/app/toolbox", "--prebuilt", "cloud-sql-postgres", "--address=0.0.0.0", "--port=8080", "--enable-api", "--ui"] - - diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 90bbcd8..68c06d8 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -5,13 +5,13 @@ steps: args: - 'build' - '-t' - - 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/cloud-sql-postgresql:latest' + - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest' - '.' - name: 'gcr.io/cloud-builders/docker' args: - 'push' - - 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/cloud-sql-postgresql:latest' + - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest' # --- STEP 2: Deploy to Cloud Run --- - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' @@ -20,15 +20,15 @@ steps: - 'run' - 'deploy' - 'cloud-sql-postgresql-server' - - '--image=us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/cloud-sql-postgresql:latest' + - '--image=us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest' - '--region=us-central1' - '--allow-unauthenticated' - '--port=8080' - '--timeout=300' - - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=omkar-playground,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=[PASSWORD],CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC' + - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=[PASSWORD],CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC' # --- STEP 3: Fully Integrated Evaluation to Persist Results --- - - name: 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/eval_server:latest' + - name: 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/eval_server:latest' entrypoint: 'bash' args: - '-c' @@ -36,7 +36,7 @@ steps: set -e cd /evalbench - export EVAL_GCP_PROJECT_ID=omkar-playground + export EVAL_GCP_PROJECT_ID=$PROJECT_ID export EVAL_GCP_PROJECT_REGION=us-central1 echo "Compiling protobuf files..." From a83c79e2970638eadfd727401c60c951aafcce9e Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Mon, 13 Apr 2026 12:16:42 +0000 Subject: [PATCH 04/24] chore: enable BigQuery reporting in eval configuration --- evals/run_config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/evals/run_config.yaml b/evals/run_config.yaml index ce09cfd..d2ba9cd 100644 --- a/evals/run_config.yaml +++ b/evals/run_config.yaml @@ -10,3 +10,6 @@ scorers: trajectory_matcher: {} goal_completion: model_config: /workspace/evals/gemini_2.5_pro_model.yaml + +reporting: + bigquery: {} From c221c21658ded50ac24bb95c3637270b784b319f Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Wed, 15 Apr 2026 10:30:30 +0000 Subject: [PATCH 05/24] chore: update model config path, disable unauthenticated cloud run access, and remove evalbench patching scripts --- cloudbuild.yaml | 20 +++++++++----------- evals/model_config.yaml | 10 +++++++++- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 68c06d8..a449e9e 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -6,7 +6,7 @@ steps: - 'build' - '-t' - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest' - - '.' + - '.' # The "." evaluates Dockerfile - name: 'gcr.io/cloud-builders/docker' args: @@ -22,7 +22,7 @@ steps: - 'cloud-sql-postgresql-server' - '--image=us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest' - '--region=us-central1' - - '--allow-unauthenticated' + - '--no-allow-unauthenticated' - '--port=8080' - '--timeout=300' - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=[PASSWORD],CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC' @@ -34,6 +34,13 @@ steps: - '-c' - | set -e + # ---------------------------- + echo "=== INSIDE /workspace ===" + ls -la /workspace + + echo "=== INSIDE /evalbench ===" + ls -la /evalbench + # ---------------------------- cd /evalbench export EVAL_GCP_PROJECT_ID=$PROJECT_ID @@ -52,15 +59,6 @@ steps: echo "Checking bind success in server (writing to stderr)..." sed -i 's|server.add_insecure_port("0.0.0.0:%s" % PORT)|bound_port = server.add_insecure_port("0.0.0.0:%s" % PORT)\n import sys\n sys.stderr.write(f"BOUND_PORT: {bound_port}\\n")\n if bound_port == 0: raise RuntimeError("Failed to bind to port!")|' /evalbench/evalbench/eval_server.py - echo "Patching eval_service.py to fix TypeError in get_reporters..." - sed -i 's|reporters = get_reporters(config.get("reporting"), job_id, run_time)|reporters = get_reporters(config.get("reporting") or {}, job_id, run_time)|' /evalbench/evalbench/eval_service.py - - echo "Patching util/session.py to make ADK import lazy..." - sed -i 's|from google.adk.sessions import VertexAiSessionService||' /evalbench/evalbench/util/session.py - sed -i 's| def __init__(self, config):| def __init__(self, config):\n from google.adk.sessions import VertexAiSessionService|' /evalbench/evalbench/util/session.py - echo "Patching databases/util.py to make SecretManagerClient lazy..." - sed -i 's|CLIENT = secretmanager_v1.SecretManagerServiceClient()|CLIENT = None\ndef get_client():\n global CLIENT\n if CLIENT is None:\n CLIENT = secretmanager_v1.SecretManagerServiceClient()\n return CLIENT|' /evalbench/evalbench/databases/util.py || echo "Failed to patch databases/util.py" - sed -i 's|CLIENT.access_secret_version|get_client().access_secret_version|' /evalbench/evalbench/databases/util.py || echo "Failed to patch databases/util.py usage" cd evalbench export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python export PYTHONPATH=./evalproto:. diff --git a/evals/model_config.yaml b/evals/model_config.yaml index ff6eb2f..c63808a 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -7,7 +7,7 @@ env: GEMINI_API_MODEL: "gemini-2.5-pro" setup: extensions: - "https://github.com/gemini-cli-extensions/cloud-sql-postgresql": + "/workspace": settings: CLOUD_SQL_POSTGRES_PROJECT: "omkar-playground" CLOUD_SQL_POSTGRES_INSTANCE: "omkar-demo-postgres-1" @@ -16,3 +16,11 @@ setup: CLOUD_SQL_POSTGRES_USER: "postgres" CLOUD_SQL_POSTGRES_PASSWORD: ${CLOUD_SQL_POSTGRES_PASSWORD} CLOUD_SQL_POSTGRES_IP_TYPE: "PUBLIC" + + # mcp_servers: + # "cloud-sql-postgresql": + # httpUrl: "CLOUD_RUN_URL_PLACEHOLDER" + # authProviderType: google_credentials + # oauth: + # scopes: + # - https://www.googleapis.com/auth/cloud-platform From a8401a2accebc5f41c4080aeffa216a7257f1539 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Thu, 16 Apr 2026 05:39:40 +0000 Subject: [PATCH 06/24] chore: remove debug logging from cloudbuild configuration --- cloudbuild.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index a449e9e..03f96ae 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -34,13 +34,6 @@ steps: - '-c' - | set -e - # ---------------------------- - echo "=== INSIDE /workspace ===" - ls -la /workspace - - echo "=== INSIDE /evalbench ===" - ls -la /evalbench - # ---------------------------- cd /evalbench export EVAL_GCP_PROJECT_ID=$PROJECT_ID From e36074080c072e10c3196827be3a474a5e163339 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Thu, 16 Apr 2026 10:27:23 +0000 Subject: [PATCH 07/24] feat: migrate database password from environment variable to Secret Manager for improved security --- cloudbuild.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 03f96ae..0288079 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -25,11 +25,13 @@ steps: - '--no-allow-unauthenticated' - '--port=8080' - '--timeout=300' - - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=[PASSWORD],CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC' + - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC' + - '--set-secrets=CLOUD_SQL_POSTGRES_PASSWORD=cloud-sql-postgres-password:latest' # --- STEP 3: Fully Integrated Evaluation to Persist Results --- - name: 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/eval_server:latest' entrypoint: 'bash' + secretEnv: ['DB_PASSWORD'] args: - '-c' - | @@ -87,3 +89,8 @@ steps: export PYTHONPATH=./evalbench:./evalbench/evalproto python3 evalbench/client/eval_client.py --experiment=/workspace/evals/run_config.yaml --endpoint=local || { echo "Client failed! Server logs:"; cat /evalbench/evalbench/server.log; exit 1; } + +availableSecrets: + secretManager: + - versionName: projects/$PROJECT_ID/secrets/cloud-sql-postgres-password/versions/latest + env: 'DB_PASSWORD' From 73bb2025d50dc723079ab22763df922dd21f3408 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Thu, 16 Apr 2026 18:31:32 +0000 Subject: [PATCH 08/24] refactor: remove unnecessary comment from docker build step in cloudbuild.yaml --- cloudbuild.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 0288079..1373f52 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -6,7 +6,7 @@ steps: - 'build' - '-t' - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest' - - '.' # The "." evaluates Dockerfile + - '.' - name: 'gcr.io/cloud-builders/docker' args: From c88e6f7d518190e7a471fd3a9f1960f81ccee298 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Fri, 17 Apr 2026 05:30:09 +0000 Subject: [PATCH 09/24] chore: add Apache 2.0 license headers to configuration and build files --- cloudbuild.yaml | 14 ++++++++++++++ evals/gemini_2.5_pro_model.yaml | 14 ++++++++++++++ evals/model_config.yaml | 14 ++++++++++++++ evals/run_config.yaml | 14 ++++++++++++++ 4 files changed, 56 insertions(+) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 1373f52..c94ad19 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -1,3 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + steps: # --- STEP 1: Build and Push Docker Image --- diff --git a/evals/gemini_2.5_pro_model.yaml b/evals/gemini_2.5_pro_model.yaml index 26eb0b7..7154ec3 100644 --- a/evals/gemini_2.5_pro_model.yaml +++ b/evals/gemini_2.5_pro_model.yaml @@ -1,3 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + generator: gcp_vertex_gemini vertex_model: gemini-2.5-pro base_prompt: "" diff --git a/evals/model_config.yaml b/evals/model_config.yaml index c63808a..5addfd5 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -1,3 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + gemini_cli_version: "@google/gemini-cli@0.26.0" generator: gemini_cli env: diff --git a/evals/run_config.yaml b/evals/run_config.yaml index d2ba9cd..8aaaabb 100644 --- a/evals/run_config.yaml +++ b/evals/run_config.yaml @@ -1,3 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + dataset_config: /workspace/evals/dataset.json dataset_format: gemini-cli-format From d7168ea89bcc8d24caca685cc49599bfcccb809c Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Fri, 17 Apr 2026 07:50:08 +0000 Subject: [PATCH 10/24] chore: update cloudbuild configuration and model environment settings to use daily CI evaluation database --- cloudbuild.yaml | 11 +++++++---- evals/model_config.yaml | 6 +++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index c94ad19..ce091af 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +options: + logging: CLOUD_LOGGING_ONLY + steps: # --- STEP 1: Build and Push Docker Image --- @@ -39,11 +42,11 @@ steps: - '--no-allow-unauthenticated' - '--port=8080' - '--timeout=300' - - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC' - - '--set-secrets=CLOUD_SQL_POSTGRES_PASSWORD=cloud-sql-postgres-password:latest' + - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=daily-ci-evals-db,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC' + - '--set-secrets=CLOUD_SQL_POSTGRES_PASSWORD=daily-ci-evals-db-password:latest' # --- STEP 3: Fully Integrated Evaluation to Persist Results --- - - name: 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/eval_server:latest' + - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest' entrypoint: 'bash' secretEnv: ['DB_PASSWORD'] args: @@ -106,5 +109,5 @@ steps: availableSecrets: secretManager: - - versionName: projects/$PROJECT_ID/secrets/cloud-sql-postgres-password/versions/latest + - versionName: projects/$PROJECT_ID/secrets/daily-ci-evals-db-password/versions/latest env: 'DB_PASSWORD' diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 5addfd5..8456724 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -15,7 +15,7 @@ gemini_cli_version: "@google/gemini-cli@0.26.0" generator: gemini_cli env: - GOOGLE_CLOUD_PROJECT: "omkar-playground" + GOOGLE_CLOUD_PROJECT: "ext-test-cloud-sql-postgres" GOOGLE_CLOUD_LOCATION: "us-central1" GOOGLE_GENAI_USE_VERTEXAI: "true" GEMINI_API_MODEL: "gemini-2.5-pro" @@ -23,8 +23,8 @@ setup: extensions: "/workspace": settings: - CLOUD_SQL_POSTGRES_PROJECT: "omkar-playground" - CLOUD_SQL_POSTGRES_INSTANCE: "omkar-demo-postgres-1" + CLOUD_SQL_POSTGRES_PROJECT: "ext-test-cloud-sql-postgres" + CLOUD_SQL_POSTGRES_INSTANCE: "daily-ci-evals-db" CLOUD_SQL_POSTGRES_REGION: "us-central1" CLOUD_SQL_POSTGRES_DATABASE: "postgres" CLOUD_SQL_POSTGRES_USER: "postgres" From d38f261a92e66ac1575d86e31c3e4ed900fd5e6d Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Fri, 17 Apr 2026 10:15:44 +0000 Subject: [PATCH 11/24] chore: update evaluation project ID and gemini-cli dependency version --- evals/dataset.json | 2 +- evals/model_config.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/dataset.json b/evals/dataset.json index 65c48fe..e8eadb4 100644 --- a/evals/dataset.json +++ b/evals/dataset.json @@ -3,7 +3,7 @@ { "id": "cloud-sql-debug-01", "starting_prompt": "I need to debug the database.", - "conversation_plan": "Ask the agent to list instances in project omkar-playground. Once listed, ask it to check the CPU usage of the first instance. Finally, ask if that usage is considered high.", + "conversation_plan": "Ask the agent to list instances in project ext-test-cloud-sql-postgres. Once listed, ask it to check the CPU usage of the first instance. Finally, ask if that usage is considered high.", "expected_trajectory": [ "list_instances" ], diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 8456724..2e7e953 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -gemini_cli_version: "@google/gemini-cli@0.26.0" +gemini_cli_version: "@google/gemini-cli@0.38.1" generator: gemini_cli env: GOOGLE_CLOUD_PROJECT: "ext-test-cloud-sql-postgres" From aa6ab94479175b577ed9bad5c58b17351828fddf Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Fri, 17 Apr 2026 18:38:57 +0000 Subject: [PATCH 12/24] chore: configure service account for Cloud Run step and simplify evaluation server execution script --- cloudbuild.yaml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index ce091af..6bdb7b8 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -44,6 +44,7 @@ steps: - '--timeout=300' - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=daily-ci-evals-db,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC' - '--set-secrets=CLOUD_SQL_POSTGRES_PASSWORD=daily-ci-evals-db-password:latest' + - '--service-account=evals-ci-runner@ext-test-cloud-sql-postgres.iam.gserviceaccount.com' # --- STEP 3: Fully Integrated Evaluation to Persist Results --- - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest' @@ -58,11 +59,8 @@ steps: export EVAL_GCP_PROJECT_ID=$PROJECT_ID export EVAL_GCP_PROJECT_REGION=us-central1 - echo "Compiling protobuf files..." - python3 -m grpc_tools.protoc --proto_path=evalbench/evalproto --python_out=evalbench/evalproto --grpc_python_out=evalbench/evalproto evalbench/evalproto/*.proto - echo "Patching client to use insecure credentials..." - # sed -i 's/"localhost:50051"/"127.0.0.1:50051"/g' evalbench/client/eval_client.py + sed -i 's/grpc.alts_channel_credentials()/None/g' evalbench/client/eval_client.py sed -i 's/grpc.aio.secure_channel(address, channel_creds)/grpc.aio.insecure_channel(address)/g' evalbench/client/eval_client.py @@ -77,12 +75,8 @@ steps: export CLOUD_RUN=True export PORT=50051 - - echo "Starting Evaluation Server in background..." - # NEW: Added &1 | tee server.log & echo "Waiting for port 50051 to open..." python3 -c " From 6b1d22be087d51605c22fbfc3afffacab506d482 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Mon, 20 Apr 2026 06:01:02 +0000 Subject: [PATCH 13/24] feat: add GEMINI_MODEL configuration to model_config.yaml, ortherwise it was attempting to use gemini-3-flash-preview and failing due to a 404 permissions error. --- evals/model_config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 2e7e953..6823dce 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -19,6 +19,7 @@ env: GOOGLE_CLOUD_LOCATION: "us-central1" GOOGLE_GENAI_USE_VERTEXAI: "true" GEMINI_API_MODEL: "gemini-2.5-pro" + GEMINI_MODEL: "gemini-2.5-pro" setup: extensions: "/workspace": From 8b9462c24944b05a6145df15f0e8336acb2e1acb Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Mon, 20 Apr 2026 08:18:42 +0000 Subject: [PATCH 14/24] chore: standardize model evaluation configuration settings --- evals/model_config.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 6823dce..0d34647 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -16,14 +16,12 @@ gemini_cli_version: "@google/gemini-cli@0.38.1" generator: gemini_cli env: GOOGLE_CLOUD_PROJECT: "ext-test-cloud-sql-postgres" - GOOGLE_CLOUD_LOCATION: "us-central1" + GOOGLE_CLOUD_LOCATION: "global" GOOGLE_GENAI_USE_VERTEXAI: "true" - GEMINI_API_MODEL: "gemini-2.5-pro" - GEMINI_MODEL: "gemini-2.5-pro" setup: extensions: "/workspace": - settings: + settings: CLOUD_SQL_POSTGRES_PROJECT: "ext-test-cloud-sql-postgres" CLOUD_SQL_POSTGRES_INSTANCE: "daily-ci-evals-db" CLOUD_SQL_POSTGRES_REGION: "us-central1" From ae11bdfb21506df2c13a661b3cfa8cd743ab908e Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Mon, 20 Apr 2026 11:49:22 +0000 Subject: [PATCH 15/24] feat: inject CLOUD_SQL_POSTGRES_PASSWORD into build step and update evaluation dataset scenarios --- cloudbuild.yaml | 5 ++++- evals/dataset.json | 50 +++++++++++++++++++++++++++++++++++++---- evals/model_config.yaml | 2 +- 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 6bdb7b8..1e78174 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -49,7 +49,8 @@ steps: # --- STEP 3: Fully Integrated Evaluation to Persist Results --- - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest' entrypoint: 'bash' - secretEnv: ['DB_PASSWORD'] + # Decrypts the secret from Secret Manager into the DB_PASSWORD environment variable + secretEnv: ['DB_PASSWORD'] args: - '-c' - | @@ -58,6 +59,8 @@ steps: export EVAL_GCP_PROJECT_ID=$PROJECT_ID export EVAL_GCP_PROJECT_REGION=us-central1 + # Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills + export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD echo "Patching client to use insecure credentials..." diff --git a/evals/dataset.json b/evals/dataset.json index e8eadb4..a442e03 100644 --- a/evals/dataset.json +++ b/evals/dataset.json @@ -1,14 +1,56 @@ { "scenarios": [ { - "id": "cloud-sql-debug-01", - "starting_prompt": "I need to debug the database.", - "conversation_plan": "Ask the agent to list instances in project ext-test-cloud-sql-postgres. Once listed, ask it to check the CPU usage of the first instance. Finally, ask if that usage is considered high.", + "id": "cloud-sql-list-instances", + "starting_prompt": "Show me all the Cloud SQL instances in this project.", + "conversation_plan": "Ask the agent to list the Cloud SQL instances in the current project.", "expected_trajectory": [ "list_instances" ], "kind": "tool", - "max_turns": 15 + "max_turns": 5 + }, + { + "id": "cloud-sql-data-explore", + "starting_prompt": "What schemas and tables do we have in this database? Please list them.", + "conversation_plan": "Ask the agent to list the schemas in the database. Then ask to list the tables.", + "expected_trajectory": [ + "list_schemas", + "list_tables" + ], + "kind": "tool", + "max_turns": 5 + }, + { + "id": "cloud-sql-perf-troubleshoot", + "starting_prompt": "The database is running slow. Are there any active queries running for more than 10 seconds or any locks?", + "conversation_plan": "Ask the agent to check for active queries running longer than 10 seconds. Then ask to check for locks.", + "expected_trajectory": [ + "list_active_queries", + "list_locks" + ], + "kind": "tool", + "max_turns": 5 + }, + { + "id": "cloud-sql-metrics-cpu", + "starting_prompt": "Can you show me the CPU utilization for instance 'daily-ci-evals-db' in project 'ext-test-cloud-sql-postgres' for the last 5 minutes?", + "conversation_plan": "Ask the agent to query the CPU utilization metric for the specified instance and project using PromQL.", + "expected_trajectory": [ + "get_system_metrics" + ], + "kind": "tool", + "max_turns": 4 + }, + { + "id": "cloud-sql-unused-indexes", + "starting_prompt": "Are there any unused indexes in the database that we can clean up?", + "conversation_plan": "Ask the agent to list unused indexes in the database.", + "expected_trajectory": [ + "list_indexes" + ], + "kind": "tool", + "max_turns": 4 } ] } \ No newline at end of file diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 0d34647..a461eb5 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -21,7 +21,7 @@ env: setup: extensions: "/workspace": - settings: + settings: CLOUD_SQL_POSTGRES_PROJECT: "ext-test-cloud-sql-postgres" CLOUD_SQL_POSTGRES_INSTANCE: "daily-ci-evals-db" CLOUD_SQL_POSTGRES_REGION: "us-central1" From 5a6d8650daed2e2ccd316828658870c37add1176 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 21 Apr 2026 06:07:50 +0000 Subject: [PATCH 16/24] remove Dockerfile, cloudbuild deployment steps, and configurations --- Dockerfile | 24 --------------------- cloudbuild.yaml | 46 ++++------------------------------------- evals/model_config.yaml | 8 ------- 3 files changed, 4 insertions(+), 74 deletions(-) delete mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 47d3a31..0000000 --- a/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -# --- Final Runtime Image --- -# Using python:3.11 as the base image to support evaluations that require Python, -# while still running the pre-compiled Go binary for the toolbox server. -FROM python:3.11 - -# Install necessary runtime certificates, standard C libraries, and curl -RUN apt-get update && apt-get install -y ca-certificates libc6 curl && rm -rf /var/lib/apt/lists/* - -WORKDIR /app - -# Dynamically fetch the latest version and download the binary -RUN LATEST_VERSION=$(curl -s https://api.github.com/repos/googleapis/mcp-toolbox/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/') && \ - curl -L https://storage.googleapis.com/mcp-toolbox-for-databases/${LATEST_VERSION}/linux/amd64/toolbox -o /app/toolbox -RUN chmod +x /app/toolbox - -# Copy the extension's skills and configuration into the container -COPY skills/ ./skills/ -COPY gemini-extension.json . - -# Add required tools.yaml placeholder to satisfy binary startup checks -RUN touch tools.yaml - -# Expose HTTP API and UI endpoints to successfully pass Cloud Run health checks -ENTRYPOINT ["/app/toolbox", "--prebuilt", "cloud-sql-postgres", "--address=0.0.0.0", "--port=8080", "--enable-api", "--ui"] diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 1e78174..7710bea 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -17,37 +17,8 @@ options: steps: - # --- STEP 1: Build and Push Docker Image --- - - name: 'gcr.io/cloud-builders/docker' - args: - - 'build' - - '-t' - - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest' - - '.' - - - name: 'gcr.io/cloud-builders/docker' - args: - - 'push' - - 'us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest' - - # --- STEP 2: Deploy to Cloud Run --- - - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' - entrypoint: gcloud - args: - - 'run' - - 'deploy' - - 'cloud-sql-postgresql-server' - - '--image=us-central1-docker.pkg.dev/$PROJECT_ID/toolbox-evals/cloud-sql-postgresql:latest' - - '--region=us-central1' - - '--no-allow-unauthenticated' - - '--port=8080' - - '--timeout=300' - - '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID,CLOUD_SQL_POSTGRES_INSTANCE=daily-ci-evals-db,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC' - - '--set-secrets=CLOUD_SQL_POSTGRES_PASSWORD=daily-ci-evals-db-password:latest' - - '--service-account=evals-ci-runner@ext-test-cloud-sql-postgres.iam.gserviceaccount.com' - - # --- STEP 3: Fully Integrated Evaluation to Persist Results --- - - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest' + # --- Evaluation Step --- + - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:89aa9fefd4b247610a95ef0896ba55d468563f50' entrypoint: 'bash' # Decrypts the secret from Secret Manager into the DB_PASSWORD environment variable secretEnv: ['DB_PASSWORD'] @@ -61,17 +32,8 @@ steps: export EVAL_GCP_PROJECT_REGION=us-central1 # Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD - - echo "Patching client to use insecure credentials..." - - sed -i 's/grpc.alts_channel_credentials()/None/g' evalbench/client/eval_client.py - sed -i 's/grpc.aio.secure_channel(address, channel_creds)/grpc.aio.insecure_channel(address)/g' evalbench/client/eval_client.py - - echo "Patching server to listen on all IPv4 interfaces (0.0.0.0)..." - sed -i 's/"\[::\]:%s"/"0.0.0.0:%s"/g' /evalbench/evalbench/eval_server.py - echo "Checking bind success in server (writing to stderr)..." - sed -i 's|server.add_insecure_port("0.0.0.0:%s" % PORT)|bound_port = server.add_insecure_port("0.0.0.0:%s" % PORT)\n import sys\n sys.stderr.write(f"BOUND_PORT: {bound_port}\\n")\n if bound_port == 0: raise RuntimeError("Failed to bind to port!")|' /evalbench/evalbench/eval_server.py - + export EVALBENCH_INSECURE=True + export EVALBENCH_HOST=0.0.0.0 cd evalbench export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python export PYTHONPATH=./evalproto:. diff --git a/evals/model_config.yaml b/evals/model_config.yaml index a461eb5..6b2bfd2 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -29,11 +29,3 @@ setup: CLOUD_SQL_POSTGRES_USER: "postgres" CLOUD_SQL_POSTGRES_PASSWORD: ${CLOUD_SQL_POSTGRES_PASSWORD} CLOUD_SQL_POSTGRES_IP_TYPE: "PUBLIC" - - # mcp_servers: - # "cloud-sql-postgresql": - # httpUrl: "CLOUD_RUN_URL_PLACEHOLDER" - # authProviderType: google_credentials - # oauth: - # scopes: - # - https://www.googleapis.com/auth/cloud-platform From 35f1c787c9266bb7cab747a2ac3a57b998eb3caa Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 21 Apr 2026 10:02:34 +0000 Subject: [PATCH 17/24] feat: integrate environment substitution, add port polling utility, update Cloud Build symlinks, --- cloudbuild.yaml | 28 ++++++++++------------------ evals/model_config.yaml | 5 +++-- evals/substitute_env.py | 17 +++++++++++++++++ evals/wait_for_port.py | 20 ++++++++++++++++++++ 4 files changed, 50 insertions(+), 20 deletions(-) create mode 100644 evals/substitute_env.py create mode 100644 evals/wait_for_port.py diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 7710bea..73ccaa1 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -26,12 +26,18 @@ steps: - '-c' - | set -e + # Workaround for evalbench bug: settings are only applied if path basename matches extension ID + ln -s /workspace /workspace/cloud-sql-postgresql cd /evalbench export EVAL_GCP_PROJECT_ID=$PROJECT_ID export EVAL_GCP_PROJECT_REGION=us-central1 # Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD + + # Substitute environment variables in model_config.yaml + python3 /workspace/evals/substitute_env.py + export EVALBENCH_INSECURE=True export EVALBENCH_HOST=0.0.0.0 cd evalbench @@ -42,28 +48,14 @@ steps: echo "Starting Evaluation Server in background..." python3 -u ./eval_server.py --localhost &1 | tee server.log & - + echo "Waiting for port 50051 to open..." - python3 -c " - import socket - import time - for i in range(20): - try: - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.connect(('127.0.0.1', 50051)) - print('Port is open!') - exit(0) - except Exception as e: - print(f'Port not open yet: {e}') - time.sleep(1) - print('Port failed to open') - exit(1) - " || { echo "Server failed to bind port. Check logs above."; exit 1; } - + python3 /workspace/evals/wait_for_port.py || { echo "Server failed to bind port."; exit 1; } + echo "Server is running. Launching Evaluation Client..." cd /evalbench export PYTHONPATH=./evalbench:./evalbench/evalproto - + python3 evalbench/client/eval_client.py --experiment=/workspace/evals/run_config.yaml --endpoint=local || { echo "Client failed! Server logs:"; cat /evalbench/evalbench/server.log; exit 1; } availableSecrets: diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 6b2bfd2..7dba6a5 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -20,12 +20,13 @@ env: GOOGLE_GENAI_USE_VERTEXAI: "true" setup: extensions: - "/workspace": + # Points to the symlink created in cloudbuild.yaml to match the extension ID + "/workspace/cloud-sql-postgresql": settings: CLOUD_SQL_POSTGRES_PROJECT: "ext-test-cloud-sql-postgres" CLOUD_SQL_POSTGRES_INSTANCE: "daily-ci-evals-db" CLOUD_SQL_POSTGRES_REGION: "us-central1" CLOUD_SQL_POSTGRES_DATABASE: "postgres" CLOUD_SQL_POSTGRES_USER: "postgres" - CLOUD_SQL_POSTGRES_PASSWORD: ${CLOUD_SQL_POSTGRES_PASSWORD} + CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}' CLOUD_SQL_POSTGRES_IP_TYPE: "PUBLIC" diff --git a/evals/substitute_env.py b/evals/substitute_env.py new file mode 100644 index 0000000..3ef2295 --- /dev/null +++ b/evals/substitute_env.py @@ -0,0 +1,17 @@ +import os +import re + +def main(): + yaml_path = '/workspace/evals/model_config.yaml' + if os.path.exists(yaml_path): + with open(yaml_path, 'r') as f: + content = f.read() + content = re.sub(r'\${(\w+)}', lambda m: os.environ.get(m.group(1), m.group(0)), content) + with open(yaml_path, 'w') as f: + f.write(content) + print(f"Successfully substituted environment variables in {yaml_path}") + else: + print(f"File not found: {yaml_path}") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/evals/wait_for_port.py b/evals/wait_for_port.py new file mode 100644 index 0000000..8b11c44 --- /dev/null +++ b/evals/wait_for_port.py @@ -0,0 +1,20 @@ +import socket +import time +import sys + +def main(): + for i in range(20): + try: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.settimeout(1) + s.connect(('127.0.0.1', 50051)) + print('Port 50051 is open!') + sys.exit(0) + except Exception: + print('Port not open yet, retrying...') + time.sleep(1) + print('Port failed to open') + sys.exit(1) + +if __name__ == '__main__': + main() \ No newline at end of file From d4f796a758d286c513016dd0e1ea1633f6788d2a Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 21 Apr 2026 11:33:21 +0000 Subject: [PATCH 18/24] chore: update configuration settings To identify which evaluation results belong to my extension added extension_id: cloud-sql-postgresql key --- evals/run_config.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/evals/run_config.yaml b/evals/run_config.yaml index 8aaaabb..6a2a657 100644 --- a/evals/run_config.yaml +++ b/evals/run_config.yaml @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +extension_id: cloud-sql-postgresql + dataset_config: /workspace/evals/dataset.json dataset_format: gemini-cli-format @@ -26,4 +28,5 @@ scorers: model_config: /workspace/evals/gemini_2.5_pro_model.yaml reporting: - bigquery: {} + bigquery: + gcp_project_id: cloud-db-nl2sql From 1722276c7adcd1487caf62a0603803fbc716f72e Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 21 Apr 2026 14:41:52 +0000 Subject: [PATCH 19/24] feat: update evaluation scenarios and add performance and qualitative scorers to run configuration --- evals/dataset.json | 72 +++++++++++++++++++++++++++---------------- evals/run_config.yaml | 13 ++++++++ 2 files changed, 58 insertions(+), 27 deletions(-) diff --git a/evals/dataset.json b/evals/dataset.json index a442e03..6716b0c 100644 --- a/evals/dataset.json +++ b/evals/dataset.json @@ -1,55 +1,73 @@ { "scenarios": [ { - "id": "cloud-sql-list-instances", - "starting_prompt": "Show me all the Cloud SQL instances in this project.", - "conversation_plan": "Ask the agent to list the Cloud SQL instances in the current project.", + "id": "cloud-sql-debug-instance", + "starting_prompt": "Check on my databases in project ext-test-cloud-sql-postgres.", + "conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if 'daily-ci-evals-db' exists, get its details and validate it is RUNNABLE.", "expected_trajectory": [ - "list_instances" + "list_instances", + "get_instance" ], - "kind": "tool", - "max_turns": 5 + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + }, + "kind": "tools", + "max_turns": 4 }, { - "id": "cloud-sql-data-explore", - "starting_prompt": "What schemas and tables do we have in this database? Please list them.", - "conversation_plan": "Ask the agent to list the schemas in the database. Then ask to list the tables.", + "id": "cloud-sql-schema-tables-explore", + "starting_prompt": "I want to understand the structure of my database.", + "conversation_plan": "First, ask the agent to list the schemas in the database. After the agent provides the schemas, ask it to list the tables specifically for the 'public' schema.", "expected_trajectory": [ "list_schemas", "list_tables" ], - "kind": "tool", - "max_turns": 5 + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + }, + "kind": "tools", + "max_turns": 6 }, { - "id": "cloud-sql-perf-troubleshoot", - "starting_prompt": "The database is running slow. Are there any active queries running for more than 10 seconds or any locks?", - "conversation_plan": "Ask the agent to check for active queries running longer than 10 seconds. Then ask to check for locks.", + "id": "cloud-sql-performance-check", + "starting_prompt": "Our database performance seems degraded.", + "conversation_plan": "Start by asking the agent to check for any active queries that are running for a long time (e.g., more than 10 seconds). After the agent responds, follow up by asking if there are any database locks that might be causing issues.", "expected_trajectory": [ "list_active_queries", "list_locks" ], - "kind": "tool", - "max_turns": 5 + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + }, + "kind": "tools", + "max_turns": 6 }, { - "id": "cloud-sql-metrics-cpu", - "starting_prompt": "Can you show me the CPU utilization for instance 'daily-ci-evals-db' in project 'ext-test-cloud-sql-postgres' for the last 5 minutes?", - "conversation_plan": "Ask the agent to query the CPU utilization metric for the specified instance and project using PromQL.", + "id": "cloud-sql-metrics-cpu-investigation", + "starting_prompt": "I'm worried about the database load for daily-ci-evals-db.", + "conversation_plan": "First, ask the agent to check the CPU utilization for the instance 'daily-ci-evals-db' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.", "expected_trajectory": [ - "get_system_metrics" + "get_system_metrics", + "list_database_stats" ], - "kind": "tool", - "max_turns": 4 + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + }, + "kind": "tools", + "max_turns": 6 }, { - "id": "cloud-sql-unused-indexes", - "starting_prompt": "Are there any unused indexes in the database that we can clean up?", - "conversation_plan": "Ask the agent to list unused indexes in the database.", + "id": "cloud-sql-instance-not-found", + "starting_prompt": "Get details for the instance 'missing-db-123'.", + "conversation_plan": "The user asks for details of an instance named 'missing-db-123' that doesn't exist. The agent should try to get it, fail, and inform the user. The user will then ask to list instances to find the correct name.", "expected_trajectory": [ - "list_indexes" + "get_instance", + "list_instances" ], - "kind": "tool", + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + }, + "kind": "tools", "max_turns": 4 } ] diff --git a/evals/run_config.yaml b/evals/run_config.yaml index 6a2a657..8f1aedf 100644 --- a/evals/run_config.yaml +++ b/evals/run_config.yaml @@ -23,9 +23,22 @@ model_config: /workspace/evals/model_config.yaml simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml scorers: + # Structural trajectory_matcher: {} + + # Qualitative (Judge-based) goal_completion: model_config: /workspace/evals/gemini_2.5_pro_model.yaml + behavioral_metrics: + model_config: /workspace/evals/gemini_2.5_pro_model.yaml + parameter_analysis: + model_config: /workspace/evals/gemini_2.5_pro_model.yaml + + # Performance + turn_count: {} + end_to_end_latency: {} + tool_call_latency: {} + token_consumption: {} reporting: bigquery: From 602da9fd2d0e11d6eaf377bb980ef9ecfdd71968 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Wed, 22 Apr 2026 06:36:10 +0000 Subject: [PATCH 20/24] refactor: parameterize evaluation configuration variables and remove unused scorers --- cloudbuild.yaml | 8 ++++++++ evals/model_config.yaml | 15 ++++++++------- evals/run_config.yaml | 5 ----- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 73ccaa1..e542bca 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -32,6 +32,14 @@ steps: export EVAL_GCP_PROJECT_ID=$PROJECT_ID export EVAL_GCP_PROJECT_REGION=us-central1 + export GOOGLE_CLOUD_PROJECT=$PROJECT_ID + export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID + export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE + export CLOUD_SQL_POSTGRES_REGION=$_CLOUD_SQL_REGION + export CLOUD_SQL_POSTGRES_DATABASE=$_CLOUD_SQL_DATABASE + export CLOUD_SQL_POSTGRES_USER=$_CLOUD_SQL_USER + export CLOUD_SQL_POSTGRES_IP_TYPE=$_CLOUD_SQL_IP_TYPE + # Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 7dba6a5..8460a7e 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -15,7 +15,8 @@ gemini_cli_version: "@google/gemini-cli@0.38.1" generator: gemini_cli env: - GOOGLE_CLOUD_PROJECT: "ext-test-cloud-sql-postgres" + GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" + GOOGLE_CLOUD_LOCATION: "global" GOOGLE_GENAI_USE_VERTEXAI: "true" setup: @@ -23,10 +24,10 @@ setup: # Points to the symlink created in cloudbuild.yaml to match the extension ID "/workspace/cloud-sql-postgresql": settings: - CLOUD_SQL_POSTGRES_PROJECT: "ext-test-cloud-sql-postgres" - CLOUD_SQL_POSTGRES_INSTANCE: "daily-ci-evals-db" - CLOUD_SQL_POSTGRES_REGION: "us-central1" - CLOUD_SQL_POSTGRES_DATABASE: "postgres" - CLOUD_SQL_POSTGRES_USER: "postgres" + CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}" + CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}" + CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}" + CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}" + CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}" CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}' - CLOUD_SQL_POSTGRES_IP_TYPE: "PUBLIC" + CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}" diff --git a/evals/run_config.yaml b/evals/run_config.yaml index 8f1aedf..b83b7e6 100644 --- a/evals/run_config.yaml +++ b/evals/run_config.yaml @@ -23,16 +23,11 @@ model_config: /workspace/evals/model_config.yaml simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml scorers: - # Structural - trajectory_matcher: {} - # Qualitative (Judge-based) goal_completion: model_config: /workspace/evals/gemini_2.5_pro_model.yaml behavioral_metrics: model_config: /workspace/evals/gemini_2.5_pro_model.yaml - parameter_analysis: - model_config: /workspace/evals/gemini_2.5_pro_model.yaml # Performance turn_count: {} From 65a84d9a0e386cad70a706444a0bf3f70dc5646d Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Wed, 22 Apr 2026 07:08:22 +0000 Subject: [PATCH 21/24] refactor: switch to standalone evaluation execution --- cloudbuild.yaml | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index e542bca..ad967fb 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -46,25 +46,13 @@ steps: # Substitute environment variables in model_config.yaml python3 /workspace/evals/substitute_env.py - export EVALBENCH_INSECURE=True - export EVALBENCH_HOST=0.0.0.0 - cd evalbench - export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python - export PYTHONPATH=./evalproto:. - export CLOUD_RUN=True - export PORT=50051 - - echo "Starting Evaluation Server in background..." - python3 -u ./eval_server.py --localhost &1 | tee server.log & - - echo "Waiting for port 50051 to open..." - python3 /workspace/evals/wait_for_port.py || { echo "Server failed to bind port."; exit 1; } - - echo "Server is running. Launching Evaluation Client..." cd /evalbench export PYTHONPATH=./evalbench:./evalbench/evalproto + export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + + echo "Launching Standalone Evaluation..." + python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml - python3 evalbench/client/eval_client.py --experiment=/workspace/evals/run_config.yaml --endpoint=local || { echo "Client failed! Server logs:"; cat /evalbench/evalbench/server.log; exit 1; } availableSecrets: secretManager: From a130f3de316cc2b7f547a7c5b50e9442b465db74 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Wed, 22 Apr 2026 07:12:29 +0000 Subject: [PATCH 22/24] refactor: update eval_server image to latest --- cloudbuild.yaml | 2 +- evals/model_config.yaml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index ad967fb..402e43c 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -18,7 +18,7 @@ options: steps: # --- Evaluation Step --- - - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:89aa9fefd4b247610a95ef0896ba55d468563f50' + - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest' entrypoint: 'bash' # Decrypts the secret from Secret Manager into the DB_PASSWORD environment variable secretEnv: ['DB_PASSWORD'] diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 8460a7e..485c758 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -16,7 +16,6 @@ gemini_cli_version: "@google/gemini-cli@0.38.1" generator: gemini_cli env: GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" - GOOGLE_CLOUD_LOCATION: "global" GOOGLE_GENAI_USE_VERTEXAI: "true" setup: From 42ce176a6d8dec763284b9bf8f97221b18c92adf Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Wed, 22 Apr 2026 07:15:30 +0000 Subject: [PATCH 23/24] refactor: remove unused port check utility --- evals/wait_for_port.py | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 evals/wait_for_port.py diff --git a/evals/wait_for_port.py b/evals/wait_for_port.py deleted file mode 100644 index 8b11c44..0000000 --- a/evals/wait_for_port.py +++ /dev/null @@ -1,20 +0,0 @@ -import socket -import time -import sys - -def main(): - for i in range(20): - try: - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.settimeout(1) - s.connect(('127.0.0.1', 50051)) - print('Port 50051 is open!') - sys.exit(0) - except Exception: - print('Port not open yet, retrying...') - time.sleep(1) - print('Port failed to open') - sys.exit(1) - -if __name__ == '__main__': - main() \ No newline at end of file From 941693b66c554a4f8dae08e11c38521dddbdf1a8 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Wed, 22 Apr 2026 11:31:16 +0000 Subject: [PATCH 24/24] refactor: standardize max_turns to 3 across all evaluation datasets and remove redundant cloud-sql-instance-not-found entry --- evals/dataset.json | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/evals/dataset.json b/evals/dataset.json index 6716b0c..a42bbae 100644 --- a/evals/dataset.json +++ b/evals/dataset.json @@ -12,7 +12,7 @@ "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" }, "kind": "tools", - "max_turns": 4 + "max_turns": 3 }, { "id": "cloud-sql-schema-tables-explore", @@ -26,7 +26,7 @@ "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" }, "kind": "tools", - "max_turns": 6 + "max_turns": 3 }, { "id": "cloud-sql-performance-check", @@ -40,7 +40,7 @@ "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" }, "kind": "tools", - "max_turns": 6 + "max_turns": 3 }, { "id": "cloud-sql-metrics-cpu-investigation", @@ -54,21 +54,7 @@ "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" }, "kind": "tools", - "max_turns": 6 - }, - { - "id": "cloud-sql-instance-not-found", - "starting_prompt": "Get details for the instance 'missing-db-123'.", - "conversation_plan": "The user asks for details of an instance named 'missing-db-123' that doesn't exist. The agent should try to get it, fail, and inform the user. The user will then ask to list instances to find the correct name.", - "expected_trajectory": [ - "get_instance", - "list_instances" - ], - "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" - }, - "kind": "tools", - "max_turns": 4 + "max_turns": 3 } ] } \ No newline at end of file