# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Runs the evalbench evaluation against a Cloud SQL for PostgreSQL instance.
# The instance coordinates come from the user-defined substitutions
# _CLOUD_SQL_INSTANCE, _CLOUD_SQL_REGION, _CLOUD_SQL_DATABASE, _CLOUD_SQL_USER
# and _CLOUD_SQL_IP_TYPE; the database password comes from Secret Manager.

options:
  logging: CLOUD_LOGGING_ONLY

steps:

  # --- Evaluation Step ---
  # NOTE(review): the image is referenced by the mutable ':latest' tag, so two
  # builds of the same commit may run different evalbench code - consider
  # pinning a version or digest.
  - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
    entrypoint: 'bash'
    # Decrypts the secret from Secret Manager into the DB_PASSWORD environment variable
    secretEnv: ['DB_PASSWORD']
    args:
      - '-c'
      - |
        set -e

        # Workaround for evalbench bug: settings are only applied if path basename matches extension ID.
        # -f/-n replace an existing link instead of failing when the script is re-run.
        ln -sfn /workspace /workspace/cloud-sql-postgresql

        # Cloud Build pastes the substitution values into this script before bash
        # runs it; the quotes keep a value containing spaces or glob characters
        # as a single word.
        export EVAL_GCP_PROJECT_ID="$PROJECT_ID"
        export EVAL_GCP_PROJECT_REGION=us-central1
        export GOOGLE_CLOUD_PROJECT="$PROJECT_ID"
        export CLOUD_SQL_POSTGRES_PROJECT="$PROJECT_ID"
        export CLOUD_SQL_POSTGRES_INSTANCE="$_CLOUD_SQL_INSTANCE"
        export CLOUD_SQL_POSTGRES_REGION="$_CLOUD_SQL_REGION"
        export CLOUD_SQL_POSTGRES_DATABASE="$_CLOUD_SQL_DATABASE"
        export CLOUD_SQL_POSTGRES_USER="$_CLOUD_SQL_USER"
        export CLOUD_SQL_POSTGRES_IP_TYPE="$_CLOUD_SQL_IP_TYPE"

        # Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills.
        # '$$' stops Cloud Build from treating this as a substitution, so bash sees $DB_PASSWORD.
        export CLOUD_SQL_POSTGRES_PASSWORD="$$DB_PASSWORD"

        # Substitute environment variables in model_config.yaml
        python3 /workspace/evals/substitute_env.py

        # The PYTHONPATH entries and the evalbench.py path below are relative to /evalbench.
        cd /evalbench
        export PYTHONPATH=./evalbench:./evalbench/evalproto
        export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

        echo "Launching Standalone Evaluation..."
        python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml


availableSecrets:
  secretManager:
  - versionName: projects/$PROJECT_ID/secrets/daily-ci-evals-db-password/versions/latest
    env: 'DB_PASSWORD'
After the agent responds, follow up by asking if there are any database locks that might be causing issues.", + "expected_trajectory": [ + "list_active_queries", + "list_locks" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + }, + "kind": "tools", + "max_turns": 3 + }, + { + "id": "cloud-sql-metrics-cpu-investigation", + "starting_prompt": "I'm worried about the database load for daily-ci-evals-db.", + "conversation_plan": "First, ask the agent to check the CPU utilization for the instance 'daily-ci-evals-db' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.", + "expected_trajectory": [ + "get_system_metrics", + "list_database_stats" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + }, + "kind": "tools", + "max_turns": 3 + } + ] +} \ No newline at end of file diff --git a/evals/gemini_2.5_pro_model.yaml b/evals/gemini_2.5_pro_model.yaml new file mode 100644 index 0000000..7154ec3 --- /dev/null +++ b/evals/gemini_2.5_pro_model.yaml @@ -0,0 +1,18 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +generator: gcp_vertex_gemini +vertex_model: gemini-2.5-pro +base_prompt: "" +execs_per_minute: 5 diff --git a/evals/model_config.yaml b/evals/model_config.yaml new file mode 100644 index 0000000..485c758 --- /dev/null +++ b/evals/model_config.yaml @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +gemini_cli_version: "@google/gemini-cli@0.38.1" +generator: gemini_cli +env: + GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" + GOOGLE_CLOUD_LOCATION: "global" + GOOGLE_GENAI_USE_VERTEXAI: "true" +setup: + extensions: + # Points to the symlink created in cloudbuild.yaml to match the extension ID + "/workspace/cloud-sql-postgresql": + settings: + CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}" + CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}" + CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}" + CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}" + CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}" + CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}' + CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}" diff --git a/evals/run_config.yaml b/evals/run_config.yaml new file mode 100644 index 0000000..b83b7e6 --- /dev/null +++ b/evals/run_config.yaml @@ -0,0 +1,40 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
"""Expand ``${VAR}`` placeholders in the evalbench model config, in place.

The Cloud Build evaluation step runs this script after exporting the
connection settings, so the config file on disk carries concrete values
before evalbench is launched.
"""
import os
import re
import sys

DEFAULT_CONFIG_PATH = '/workspace/evals/model_config.yaml'

# Group 2 is the variable name; groups 1 and 3 capture a quote character that
# sits directly before / after the placeholder, if there is one.
_PLACEHOLDER = re.compile(r'''(["']?)\$\{(\w+)\}(["']?)''')

# Characters that would end or alter a YAML double-quoted scalar.
_DOUBLE_QUOTED_ESCAPES = str.maketrans({
    '\\': '\\\\',
    '"': '\\"',
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
})


def _escape_for_yaml(value, quote):
    """Return *value* escaped for a YAML scalar delimited by *quote*.

    *quote* is ``'"'``, ``"'"`` or ``''`` (not quoted: returned unchanged).
    """
    if quote == '"':
        return value.translate(_DOUBLE_QUOTED_ESCAPES)
    if quote == "'":
        # Inside single quotes the only special sequence is a doubled quote.
        return value.replace("'", "''")
    return value


def _substitute(content, environ):
    """Replace every known ``${NAME}`` in *content* with ``environ[NAME]``.

    Returns ``(new_content, missing)`` where *missing* is the sorted list of
    placeholder names that have no entry in *environ*; those placeholders are
    left in the text untouched.
    """
    missing = set()

    def replace(match):
        opening, name, closing = match.groups()
        if name not in environ:
            missing.add(name)
            return match.group(0)
        # Escape only when the placeholder is the whole quoted scalar, e.g.
        # "${X}" or '${X}'; anything else is inserted verbatim.
        quote = opening if opening == closing else ''
        return f'{opening}{_escape_for_yaml(environ[name], quote)}{closing}'

    return _PLACEHOLDER.sub(replace, content), sorted(missing)


def main(yaml_path=DEFAULT_CONFIG_PATH, environ=None):
    """Rewrite *yaml_path* with its ``${VAR}`` placeholders expanded.

    Args:
        yaml_path: File to rewrite in place. Defaults to the model config
            used by the Cloud Build evaluation step.
        environ: Mapping of variable names to string values. Defaults to
            ``os.environ``.

    Returns:
        ``0`` when the file was rewritten, ``1`` when it does not exist, so
        the caller can pass the result to ``sys.exit``.
    """
    env = os.environ if environ is None else environ

    try:
        with open(yaml_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"File not found: {yaml_path}", file=sys.stderr)
        return 1

    content, missing = _substitute(content, env)
    with open(yaml_path, 'w', encoding='utf-8') as f:
        f.write(content)

    if missing:
        # Not fatal: the placeholder stays in the file exactly as written.
        print(
            "Warning: no value set for placeholder(s): " + ", ".join(missing),
            file=sys.stderr,
        )
    print(f"Successfully substituted environment variables in {yaml_path}")
    return 0


if __name__ == '__main__':
    # A non-zero status makes the surrounding `set -e` build script stop.
    sys.exit(main())