From 1d8c1d10d912d61dd4a5a3a5e1af3f3b78a4b82f Mon Sep 17 00:00:00 2001 From: Emma Rogge Date: Thu, 13 Nov 2025 17:20:50 +0000 Subject: [PATCH 1/3] Create tool to enable users to estimate costs of running their notebooks in Workbench --- notebook_cost_estimator/cost_estimator.ipynb | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 notebook_cost_estimator/cost_estimator.ipynb diff --git a/notebook_cost_estimator/cost_estimator.ipynb b/notebook_cost_estimator/cost_estimator.ipynb new file mode 100644 index 0000000..21c2679 --- /dev/null +++ b/notebook_cost_estimator/cost_estimator.ipynb @@ -0,0 +1,10 @@ +{ + "cells": [], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b7dad7617ea984a5d9aa4f44f70642d1b473dbe5 Mon Sep 17 00:00:00 2001 From: Emma Rogge Date: Thu, 13 Nov 2025 18:37:54 +0000 Subject: [PATCH 2/3] Update notebook to use wb CLI --- notebook_cost_estimator/cost_estimator.ipynb | 673 ++++++++++++++++++- 1 file changed, 672 insertions(+), 1 deletion(-) diff --git a/notebook_cost_estimator/cost_estimator.ipynb b/notebook_cost_estimator/cost_estimator.ipynb index 21c2679..de55270 100644 --- a/notebook_cost_estimator/cost_estimator.ipynb +++ b/notebook_cost_estimator/cost_estimator.ipynb @@ -1,5 +1,676 @@ { - "cells": [], + "cells": [ + { + "cell_type": "markdown", + "id": "9701fa84", + "metadata": {}, + "source": [ + "# Verily Workbench Notebook Cost Estimator\n", + "\n", + "This tool helps you estimate the approximate GCP cost of running a specified JupyterLab notebook on a standard Verily Workbench JupyterLab app. \n", + "\n", + "Refer to the [Verily Workbench Cloud Apps documentation](https://support.workbench.verily.com/docs/guides/cloud_apps/apps_intro/) for details on default app configurations and pricing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15450aaa", + "metadata": {}, + "outputs": [], + "source": [ + "# Import Required Libraries\n", + "import os\n", + "import math\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "from IPython.display import display, Markdown" + ] + }, + { + "cell_type": "markdown", + "id": "ea71a703", + "metadata": {}, + "source": [ + "## User Input: Notebook and Data Specifications\n", + "\n", + "Please provide the following information to estimate your cost:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a20058dd", + "metadata": {}, + "outputs": [], + "source": [ + "# User Input: Notebook and Data Specifications\n", + "from ipywidgets import widgets\n", + "\n", + "notebook_file = widgets.Text(\n", + " value='',\n", + " placeholder='Enter notebook filename (e.g., analysis.ipynb)',\n", + " description='Notebook:',\n", + " disabled=False\n", + ")\n", + "runtime_hours = widgets.FloatText(\n", + " value=1.0,\n", + " description='Runtime (hrs):',\n", + " disabled=False\n", + ")\n", + "\n", + "# Workspace resource inputs\n", + "resource_type = widgets.Dropdown(\n", + " options=[\n", + " ('BigQuery Dataset', 'bq_dataset'),\n", + " ('BigQuery Table', 'bq_table'), \n", + " ('Cloud Storage Bucket', 'gcs_bucket'),\n", + " ('Cloud Storage Object/File', 'gcs_object'),\n", + " ('Mixed Resources', 'mixed')\n", + " ],\n", + " value='gcs_bucket',\n", + " description='Resource Type:',\n", + " disabled=False\n", + ")\n", + "\n", + "data_size_gb = widgets.FloatText(\n", + " value=1.0,\n", + " description='Data Size (GB):',\n", + " disabled=False\n", + ")\n", + "output_size_gb = widgets.FloatText(\n", + " value=0.5,\n", + " description='Output Size (GB):',\n", + " disabled=False\n", + ")\n", + "\n", + "# BigQuery-specific inputs\n", + "bq_queries = widgets.IntText(\n", + " value=1,\n", + " description='# BQ Queries:',\n", + " disabled=False\n", + ")\n", + "bq_data_processed_gb = widgets.FloatText(\n", + " value=1.0,\n", + " description='BQ Data Processed (GB):',\n", + " disabled=False\n", + ")\n", + "\n", + "special_resources = widgets.Text(\n", + " value='',\n", + " placeholder='e.g., GPU, highmem',\n", + " description='Special Resources:',\n", + " disabled=False\n", + ")\n", + "\n", + "ui = widgets.VBox([\n", + " notebook_file, \n", + " runtime_hours, \n", + " resource_type,\n", + " data_size_gb, \n", + " output_size_gb,\n", + " bq_queries,\n", + " bq_data_processed_gb,\n", + " special_resources\n", + "])\n", + "display(ui)" + ] + }, + { + "cell_type": "markdown", + "id": "980e6f0a", + "metadata": {}, + "source": [ + "## Estimate Compute Resource Usage\n", + "\n", + "This section estimates the compute resources required based on your input and the default JupyterLab app specs from Verily Workbench documentation." 
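+ "\n",
+ "As a rough worked example with the default rate assumed below (n1-standard-4 at $0.158/hr in us-central1), a notebook that runs for 3 hours accrues about 3 × $0.158 ≈ $0.47 of compute cost, before any storage or BigQuery query charges."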
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5cf6acd", + "metadata": {}, + "outputs": [], + "source": [ + "# Estimate Compute Resource Usage\n", + "# Default JupyterLab app specs (as of Nov 2025, see Verily docs):\n", + "# n1-standard-4 (4 vCPU, 15 GB RAM), $0.158/hr (us-central1)\n", + "default_machine_type = 'n1-standard-4'\n", + "default_vcpu = 4\n", + "default_ram_gb = 15\n", + "compute_price_per_hour = 0.158 # USD/hr (update if pricing changes)\n", + "\n", + "print(f\"Default machine: {default_machine_type} ({default_vcpu} vCPU, {default_ram_gb} GB RAM)\")\n", + "print(f\"Compute price: ${compute_price_per_hour}/hr (us-central1, Nov 2025)\")" + ] + }, + { + "cell_type": "markdown", + "id": "31d73559", + "metadata": {}, + "source": [ + "## Estimate Storage Usage Based on Workspace Resources\n", + "\n", + "Estimate storage usage based on the data resources in your Verily Workbench workspace. Workbench supports different types of data resources:\n", + "\n", + "- **BigQuery Datasets/Tables**: Query-based pricing\n", + "- **Cloud Storage Buckets/Objects**: Storage + data transfer pricing \n", + "- **Referenced Resources**: Point to external data (no workspace storage cost)\n", + "- **Controlled Resources**: Managed within your workspace\n", + "\n", + "For details, see the [Workbench data resources documentation](https://support.workbench.verily.com/docs/guides/research_data/resource_intro/)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c4e9f34", + "metadata": {}, + "outputs": [], + "source": [ + "# Estimate Storage Usage Based on Workspace Resources\n", + "# GCP Pricing (us-central1, Nov 2025 - update as needed)\n", + "storage_price_per_gb_month = 0.04 # Cloud Storage Standard\n", + "bq_query_price_per_tb = 6.25 # BigQuery on-demand pricing per TB processed\n", + "bq_storage_price_per_gb_month = 0.02 # BigQuery storage per GB/month\n", + "\n", + "# Convert monthly to hourly rates\n", + "storage_price_per_gb_hour = storage_price_per_gb_month / (30 * 24)\n", + "bq_storage_price_per_gb_hour = bq_storage_price_per_gb_month / (30 * 24)\n", + "\n", + "def estimate_storage_costs(resource_type, data_gb, output_gb, queries, processed_gb, runtime_hrs):\n", + " storage_cost = 0\n", + " query_cost = 0\n", + " \n", + " if resource_type in ['gcs_bucket', 'gcs_object', 'mixed']:\n", + " # Cloud Storage costs for data + output\n", + " total_storage_gb = data_gb + output_gb\n", + " storage_cost = storage_price_per_gb_hour * total_storage_gb * runtime_hrs\n", + " \n", + " elif resource_type in ['bq_dataset', 'bq_table']:\n", + " # BigQuery storage (for controlled datasets) + query costs\n", + " storage_cost = bq_storage_price_per_gb_hour * data_gb * runtime_hrs\n", + " query_cost = (processed_gb / 1000) * bq_query_price_per_tb * queries # Convert GB to TB\n", + " \n", + " elif resource_type == 'mixed':\n", + " # Combination of storage and BigQuery\n", + " gcs_storage = storage_price_per_gb_hour * data_gb * runtime_hrs\n", + " bq_queries = (processed_gb / 1000) * bq_query_price_per_tb * queries\n", + " storage_cost = gcs_storage + bq_queries\n", + " \n", + " return storage_cost, query_cost\n", + "\n", + "storage_cost, query_cost = estimate_storage_costs(\n", + " resource_type.value,\n", + " data_size_gb.value,\n", + " output_size_gb.value, \n", + " bq_queries.value,\n", + " bq_data_processed_gb.value,\n", + " runtime_hours.value\n", + ")\n", + "\n", + "total_data_cost = storage_cost + query_cost\n", + "\n", + "print(f\"Resource type: 
{resource_type.value}\")\n", + "print(f\"Estimated storage cost: ${storage_cost:.4f}\")\n", + "if query_cost > 0:\n", + " print(f\"Estimated BigQuery query cost: ${query_cost:.4f}\")\n", + "print(f\"Total data-related cost: ${total_data_cost:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "760fe16c", + "metadata": {}, + "source": [ + "## Calculate Approximate Cost\n", + "\n", + "This section calculates the estimated cost for compute and storage resources based on your inputs and current pricing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b900007f", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate Approximate Cost\n", + "def estimate_total_cost(runtime_hours, storage_cost, query_cost):\n", + " compute_cost = compute_price_per_hour * runtime_hours\n", + " total_data_cost = storage_cost + query_cost\n", + " total_cost = compute_cost + total_data_cost\n", + " return compute_cost, total_data_cost, total_cost\n", + "\n", + "compute_cost, total_data_cost, total_cost = estimate_total_cost(\n", + " runtime_hours.value, \n", + " storage_cost, \n", + " query_cost\n", + ")\n", + "\n", + "print(f\"Estimated compute cost: ${compute_cost:.2f}\")\n", + "print(f\"Estimated data cost (storage + queries): ${total_data_cost:.4f}\")\n", + "print(f\"Total estimated cost: ${total_cost:.2f}\")\n", + "\n", + "# Cost breakdown by component\n", + "if resource_type.value in ['bq_dataset', 'bq_table'] and query_cost > 0:\n", + " print(f\"\\nData cost breakdown:\")\n", + " print(f\" - Storage: ${storage_cost:.4f}\")\n", + " print(f\" - BigQuery queries: ${query_cost:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4c3146df", + "metadata": {}, + "source": [ + "## Display Cost Breakdown\n", + "\n", + "Below is a detailed breakdown of your estimated costs for running the specified notebook on Verily Workbench." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bac0444", + "metadata": {}, + "outputs": [], + "source": [ + "# Display Cost Breakdown\n", + "import pandas as pd\n", + "\n", + "# Create cost breakdown table\n", + "cost_components = ['Compute', 'Data (Storage + Queries)', 'Total']\n", + "cost_values = [f\"${compute_cost:.2f}\", f\"${total_data_cost:.4f}\", f\"${total_cost:.2f}\"]\n", + "\n", + "cost_breakdown = pd.DataFrame({\n", + " 'Component': cost_components,\n", + " 'Estimated Cost (USD)': cost_values\n", + "})\n", + "\n", + "display(cost_breakdown)\n", + "\n", + "# Detailed explanation\n", + "resource_explanation = {\n", + " 'gcs_bucket': 'Cloud Storage bucket',\n", + " 'gcs_object': 'Cloud Storage object/file', \n", + " 'bq_dataset': 'BigQuery dataset',\n", + " 'bq_table': 'BigQuery table',\n", + " 'mixed': 'Mixed resources (Cloud Storage + BigQuery)'\n", + "}\n", + "\n", + "explanation = f\"\"\"\n", + "**Cost Estimation Details:**\n", + "\n", + "**Compute:**\n", + "- Default JupyterLab app: {default_machine_type} ({default_vcpu} vCPU, {default_ram_gb} GB RAM)\n", + "- Compute price: ${compute_price_per_hour}/hr (us-central1, Nov 2025)\n", + "- Runtime: {runtime_hours.value} hours\n", + "\n", + "**Data Resources:**\n", + "- Resource type: {resource_explanation.get(resource_type.value, resource_type.value)}\n", + "- Storage price: ${storage_price_per_gb_month}/GB/month (Cloud Storage Standard)\"\"\"\n", + "\n", + "if resource_type.value in ['bq_dataset', 'bq_table']:\n", + " explanation += f\"\"\"\n", + "- BigQuery storage: ${bq_storage_price_per_gb_month}/GB/month\n", + "- BigQuery queries: ${bq_query_price_per_tb}/TB processed\n", + "- Estimated queries: {bq_queries.value}\n", + "- Data processed per query: {bq_data_processed_gb.value} GB\"\"\"\n", + "\n", + "explanation += f\"\"\"\n", + "\n", + "**Notes:**\n", + "- Costs are estimated for us-central1 region\n", + "- Referenced resources (external data) may have no workspace storage cost\n", + "- Controlled resources are managed within your workspace\n", + "- Actual costs may vary based on region, usage patterns, and discounts\n", + "- See [Workbench data resources](https://support.workbench.verily.com/docs/guides/research_data/resource_intro/) for more details\n", + "\"\"\"\n", + "\n", + "display(Markdown(explanation))" + ] + }, + { + "cell_type": "markdown", + "id": "2df01784", + "metadata": {}, + "source": [ + "## Discover Your Workspace Resources with wb CLI\n", + "\n", + "The Verily Workbench CLI is available in this environment. We'll automatically discover and analyze the data resources in your workspace to provide accurate size estimates and resource types for cost calculation." 
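+ "\n",
+ "The code cells that follow wrap a handful of CLI calls (`wb resource list --format=JSON`, `wb resource describe --name=... --format=JSON`, `wb gsutil du -s gs://<bucket>`, and `wb bq query --sql=... --format=JSON`) and parse their JSON output with `json.loads`, so size information is pulled directly from the workspace rather than entered by hand."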
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa4b96a1", + "metadata": {}, + "outputs": [], + "source": [ + "# Discover Your Workspace Resources using Workbench CLI\n", + "import subprocess\n", + "import json\n", + "import pandas as pd\n", + "from IPython.display import display, HTML\n", + "\n", + "def run_wb_command(command):\n", + " \"\"\"Run a wb CLI command and return the result\"\"\"\n", + " result = subprocess.run(command, shell=True, capture_output=True, text=True)\n", + " if result.returncode == 0:\n", + " return result.stdout.strip()\n", + " else:\n", + " raise Exception(f\"Command failed: {command}\\nError: {result.stderr}\")\n", + "\n", + "def list_workspace_resources():\n", + " \"\"\"List all resources in the current workspace\"\"\"\n", + " print(\"🔍 Discovering workspace resources...\")\n", + " \n", + " # List all resources in JSON format for easier parsing\n", + " resources_json = run_wb_command(\"wb resource list --format=JSON\")\n", + " \n", + " resources = json.loads(resources_json)\n", + " if not resources:\n", + " print(\"No resources found in this workspace.\")\n", + " return pd.DataFrame()\n", + " \n", + " # Create a summary table\n", + " resource_data = []\n", + " for resource in resources:\n", + " resource_data.append({\n", + " 'Name': resource.get('name', 'Unknown'),\n", + " 'Type': resource.get('resourceType', 'Unknown'),\n", + " 'Stewardship': resource.get('stewardshipType', 'Unknown'),\n", + " 'Description': resource.get('description', '')[:50] + '...' if len(resource.get('description', '')) > 50 else resource.get('description', ''),\n", + " 'Cloud Resource': resource.get('cloudName', 'Unknown')\n", + " })\n", + " \n", + " df = pd.DataFrame(resource_data)\n", + " print(f\"\\n✅ Found {len(resources)} resources in your workspace:\")\n", + " display(df)\n", + " return df\n", + "\n", + "def get_resource_details(resource_name):\n", + " \"\"\"Get detailed information about a specific resource\"\"\"\n", + " print(f\"📋 Getting details for resource: {resource_name}\")\n", + " \n", + " details = run_wb_command(f\"wb resource describe --name={resource_name} --format=JSON\")\n", + " resource_info = json.loads(details)\n", + " \n", + " print(f\"\\n**Resource:** {resource_info.get('name', 'Unknown')}\")\n", + " print(f\"**Type:** {resource_info.get('resourceType', 'Unknown')}\")\n", + " print(f\"**Stewardship:** {resource_info.get('stewardshipType', 'Unknown')}\")\n", + " print(f\"**Cloud Resource:** {resource_info.get('cloudName', 'Unknown')}\")\n", + " \n", + " # For GCS buckets, we can try to get size info\n", + " if resource_info.get('resourceType') == 'GCS_BUCKET':\n", + " bucket_name = resource_info.get('cloudName', '')\n", + " if bucket_name:\n", + " print(f\"**Bucket:** {bucket_name}\")\n", + " print(f\"💡 Use get_gcs_bucket_size('{bucket_name}') to get actual size\")\n", + " \n", + " # For BigQuery datasets, show how to get table info\n", + " elif resource_info.get('resourceType') == 'BQ_DATASET':\n", + " dataset_name = resource_info.get('cloudName', '')\n", + " if dataset_name:\n", + " print(f\"**Dataset:** {dataset_name}\")\n", + " print(f\"💡 Use get_bigquery_dataset_size('{dataset_name}') to get actual size\")\n", + " \n", + " return resource_info\n", + "\n", + "# Automatically run the resource discovery\n", + "resources_df = list_workspace_resources()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccdf1bc7", + "metadata": {}, + "outputs": [], + "source": [ + "# Get detailed information for specific resources\n", + 
"print(\"📋 Available resources in your workspace:\")\n", + "if not resources_df.empty:\n", + " for i, name in enumerate(resources_df['Name'], 1):\n", + " resource_type = resources_df.iloc[i-1]['Type']\n", + " stewardship = resources_df.iloc[i-1]['Stewardship']\n", + " print(f\" {i}. {name} ({resource_type}, {stewardship})\")\n", + " \n", + " print(f\"\\n💡 To get detailed information about a resource, use:\")\n", + " print(f\" resource_details = get_resource_details('resource-name')\")\n", + " print(f\"\\n💡 Example: get_resource_details('{resources_df.iloc[0]['Name']}')\")\n", + "else:\n", + " print(\" No resources found in this workspace.\")\n", + " print(\" Make sure you're running this in a Verily Workbench JupyterLab environment with resources configured.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed3bd12a", + "metadata": {}, + "outputs": [], + "source": [ + "# Get Cloud Storage bucket sizes and BigQuery dataset information\n", + "def get_gcs_bucket_size(bucket_name):\n", + " \"\"\"Get the size of a GCS bucket using gsutil via wb CLI\"\"\"\n", + " print(f\"📏 Getting size for bucket: {bucket_name}\")\n", + " \n", + " # Get bucket size in bytes\n", + " size_output = run_wb_command(f\"wb gsutil du -s gs://{bucket_name}\")\n", + " \n", + " # Parse the size (gsutil du returns: size_bytes gs://bucket_name)\n", + " if size_output:\n", + " size_bytes = int(size_output.split()[0])\n", + " size_gb = size_bytes / (1024**3) # Convert to GB\n", + " \n", + " print(f\"Bucket size: {size_gb:.2f} GB ({size_bytes:,} bytes)\")\n", + " return size_gb\n", + " else:\n", + " print(f\"Could not determine bucket size\")\n", + " return 0\n", + "\n", + "def get_bigquery_dataset_size(project_dataset):\n", + " \"\"\"Get BigQuery dataset size and table information\"\"\"\n", + " print(f\"📊 Getting BigQuery dataset info: {project_dataset}\")\n", + " \n", + " # Split project.dataset if needed\n", + " if '.' 
not in project_dataset:\n", + " print(\"⚠️ Dataset name should be in format 'project.dataset' or provide the full dataset reference\")\n", + " return 0\n", + " \n", + " project, dataset = project_dataset.split('.', 1)\n", + " \n", + " # Query to get table sizes\n", + " sql_query = f\"\"\"\n", + " SELECT \n", + " table_name,\n", + " ROUND(size_bytes/1024/1024/1024, 2) as size_gb,\n", + " row_count,\n", + " creation_time\n", + " FROM `{project}.{dataset}.INFORMATION_SCHEMA.TABLES`\n", + " WHERE table_type = 'BASE TABLE'\n", + " ORDER BY size_bytes DESC\n", + " \"\"\"\n", + " \n", + " print(\"Running BigQuery size analysis...\")\n", + " query_result = run_wb_command(f'wb bq query --sql=\"{sql_query}\" --format=JSON')\n", + " \n", + " tables = json.loads(query_result)\n", + " \n", + " if tables:\n", + " print(f\"\\n📋 Tables in {project_dataset}:\")\n", + " table_data = []\n", + " total_size_gb = 0\n", + " \n", + " for table in tables:\n", + " size_gb = float(table.get('size_gb', 0))\n", + " total_size_gb += size_gb\n", + " table_data.append({\n", + " 'Table': table.get('table_name', ''),\n", + " 'Size (GB)': size_gb,\n", + " 'Rows': table.get('row_count', 0),\n", + " 'Created': table.get('creation_time', '')[:10] # Just date\n", + " })\n", + " \n", + " df = pd.DataFrame(table_data)\n", + " display(df)\n", + " print(f\"\\n**Total dataset size: {total_size_gb:.2f} GB**\")\n", + " return total_size_gb\n", + " else:\n", + " print(\"No tables found in dataset\")\n", + " return 0\n", + "\n", + "# Auto-detect and analyze resources with actual sizes\n", + "print(\"🔍 Analyzing discovered resources for accurate size information...\")\n", + "\n", + "for _, resource in resources_df.iterrows():\n", + " resource_name = resource['Name']\n", + " resource_type = resource['Type']\n", + " cloud_resource = resource['Cloud Resource']\n", + " \n", + " print(f\"\\n🔹 **{resource_name}** ({resource_type})\")\n", + " \n", + " try:\n", + " if resource_type == 'GCS_BUCKET' and cloud_resource != 'Unknown':\n", + " actual_size = get_gcs_bucket_size(cloud_resource)\n", + " print(f\" ✅ Actual size determined: {actual_size:.2f} GB\")\n", + " \n", + " elif resource_type == 'BQ_DATASET' and cloud_resource != 'Unknown':\n", + " actual_size = get_bigquery_dataset_size(cloud_resource)\n", + " print(f\" ✅ Actual size determined: {actual_size:.2f} GB\")\n", + " \n", + " else:\n", + " print(f\" ℹ️ Size analysis not available for {resource_type}\")\n", + " \n", + " except Exception as e:\n", + " print(f\" ⚠️ Could not determine size: {str(e)}\")\n", + "\n", + "print(f\"\\n💡 You can also call these functions manually:\")\n", + "print(f\" get_gcs_bucket_size('bucket-name')\")\n", + "print(f\" get_bigquery_dataset_size('project.dataset')\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dcb89fe", + "metadata": {}, + "outputs": [], + "source": [ + "# Automated Cost Estimation with Discovered Resources\n", + "def estimate_costs_from_resources():\n", + " \"\"\"Estimate costs based on discovered workspace resources\"\"\"\n", + " print(\"🎯 Automated Cost Estimation Based on Your Workspace Resources\")\n", + " print(\"=\" * 60)\n", + " \n", + " if resources_df.empty:\n", + " print(\"No resources discovered in this workspace.\")\n", + " return 0\n", + " \n", + " total_estimated_cost = 0\n", + " total_storage_gb = 0\n", + " total_query_costs = 0\n", + " \n", + " print(\"\\n📊 Analyzing each resource:\")\n", + " \n", + " for _, resource in resources_df.iterrows():\n", + " resource_name = resource['Name']\n", + " 
resource_type = resource['Type']\n", + " stewardship = resource['Stewardship']\n", + " cloud_resource = resource['Cloud Resource']\n", + " \n", + " print(f\"\\n🔹 **{resource_name}** ({resource_type}, {stewardship})\")\n", + " \n", + " estimated_size = 0\n", + " estimated_queries = 0\n", + " \n", + " # Try to get actual sizes where possible\n", + " try:\n", + " if resource_type == 'GCS_BUCKET' and cloud_resource != 'Unknown':\n", + " estimated_size = get_gcs_bucket_size(cloud_resource)\n", + " print(f\" \udccf Actual size: {estimated_size:.2f} GB\")\n", + " \n", + " elif resource_type == 'BQ_DATASET' and cloud_resource != 'Unknown':\n", + " estimated_size = get_bigquery_dataset_size(cloud_resource)\n", + " estimated_queries = max(1, int(estimated_size / 10)) # Assume 1 query per 10GB\n", + " print(f\" 📏 Actual size: {estimated_size:.2f} GB\")\n", + " print(f\" \udd0d Estimated queries: {estimated_queries}\")\n", + " \n", + " elif resource_type == 'GCS_OBJECT':\n", + " estimated_size = 0.1 # Assume 100MB for individual objects\n", + " print(f\" \udccf Estimated size: {estimated_size} GB\")\n", + " \n", + " elif resource_type == 'BQ_TABLE':\n", + " estimated_size = 0.5 # Assume 500MB for individual tables\n", + " estimated_queries = 1\n", + " print(f\" 📏 Estimated size: {estimated_size} GB\")\n", + " print(f\" 🔍 Estimated queries: {estimated_queries}\")\n", + " \n", + " else:\n", + " print(f\" ℹ️ Resource type {resource_type} not included in cost calculation\")\n", + " continue\n", + " \n", + " except Exception as e:\n", + " print(f\" ⚠️ Could not get actual size, using estimates: {str(e)}\")\n", + " # Fall back to estimates\n", + " if resource_type == 'GCS_BUCKET':\n", + " estimated_size = 5.0\n", + " elif resource_type == 'BQ_DATASET':\n", + " estimated_size = 2.0\n", + " estimated_queries = 5\n", + " elif resource_type in ['GCS_OBJECT', 'BQ_TABLE']:\n", + " estimated_size = 1.0\n", + " estimated_queries = 1 if resource_type == 'BQ_TABLE' else 0\n", + " \n", + " print(f\" 📏 Estimated size: {estimated_size} GB\")\n", + " if estimated_queries > 0:\n", + " print(f\" 🔍 Estimated queries: {estimated_queries}\")\n", + " \n", + " # Calculate costs for this resource\n", + " if resource_type in ['GCS_BUCKET', 'GCS_OBJECT']:\n", + " storage_cost = storage_price_per_gb_hour * estimated_size * runtime_hours.value\n", + " total_estimated_cost += storage_cost\n", + " total_storage_gb += estimated_size\n", + " print(f\" 💰 Storage cost: ${storage_cost:.4f}\")\n", + " \n", + " elif resource_type in ['BQ_DATASET', 'BQ_TABLE']:\n", + " bq_storage_cost = bq_storage_price_per_gb_hour * estimated_size * runtime_hours.value\n", + " query_cost = (estimated_size / 1000) * bq_query_price_per_tb * estimated_queries\n", + " total_estimated_cost += bq_storage_cost + query_cost\n", + " total_storage_gb += estimated_size\n", + " total_query_costs += query_cost\n", + " print(f\" 💰 Storage cost: ${bq_storage_cost:.4f}\")\n", + " print(f\" 💰 Query cost: ${query_cost:.4f}\")\n", + " \n", + " # Add compute costs\n", + " compute_cost = compute_price_per_hour * runtime_hours.value\n", + " total_estimated_cost += compute_cost\n", + " \n", + " print(f\"\\n\" + \"=\" * 60)\n", + " print(f\"📋 **TOTAL COST ESTIMATE FROM WORKSPACE RESOURCES**\")\n", + " print(f\" 💻 Compute cost ({runtime_hours.value}h): ${compute_cost:.2f}\")\n", + " print(f\" 💾 Storage cost ({total_storage_gb:.1f}GB): ${total_estimated_cost - compute_cost - total_query_costs:.4f}\")\n", + " if total_query_costs > 0:\n", + " print(f\" 🔍 Query costs: 
${total_query_costs:.4f}\")\n", + " print(f\" 🎯 **TOTAL: ${total_estimated_cost:.2f}**\")\n", + " \n", + " return total_estimated_cost\n", + "\n", + "# Run automated estimation with discovered resources\n", + "print(\"\\n🚀 Running automated cost estimation...\")\n", + "automated_cost = estimate_costs_from_resources()" + ] + }, + { + "cell_type": "markdown", + "id": "ab2333c2", + "metadata": {}, + "source": [] + } + ], "metadata": { "language_info": { "name": "python" From ca1ddbf78730a8c4d379528a445db9ae2f99d6a9 Mon Sep 17 00:00:00 2001 From: Emma Rogge Date: Fri, 14 Nov 2025 17:38:42 +0000 Subject: [PATCH 3/3] Fix formatting --- notebook_cost_estimator/cost_estimator.ipynb | 621 ++++++++----------- 1 file changed, 262 insertions(+), 359 deletions(-) diff --git a/notebook_cost_estimator/cost_estimator.ipynb b/notebook_cost_estimator/cost_estimator.ipynb index de55270..19a78da 100644 --- a/notebook_cost_estimator/cost_estimator.ipynb +++ b/notebook_cost_estimator/cost_estimator.ipynb @@ -48,70 +48,70 @@ "from ipywidgets import widgets\n", "\n", "notebook_file = widgets.Text(\n", - " value='',\n", - " placeholder='Enter notebook filename (e.g., analysis.ipynb)',\n", - " description='Notebook:',\n", - " disabled=False\n", + "    value='',\n", + "    placeholder='Enter notebook filename (e.g., analysis.ipynb)',\n", + "    description='Notebook:',\n", + "    disabled=False\n", ")\n", "runtime_hours = widgets.FloatText(\n", - " value=1.0,\n", - " description='Runtime (hrs):',\n", - " disabled=False\n", + "    value=1.0,\n", + "    description='Runtime (hrs):',\n", + "    disabled=False\n", ")\n", "\n", "# Workspace resource inputs\n", "resource_type = widgets.Dropdown(\n", - " options=[\n", - " ('BigQuery Dataset', 'bq_dataset'),\n", - " ('BigQuery Table', 'bq_table'), \n", - " ('Cloud Storage Bucket', 'gcs_bucket'),\n", - " ('Cloud Storage Object/File', 'gcs_object'),\n", - " ('Mixed Resources', 'mixed')\n", - " ],\n", - " value='gcs_bucket',\n", - " description='Resource Type:',\n", - " disabled=False\n", + "    options=[\n", + "        ('BigQuery Dataset', 'bq_dataset'),\n", + "        ('BigQuery Table', 'bq_table'), \n", + "        ('Cloud Storage Bucket', 'gcs_bucket'),\n", + "        ('Cloud Storage Object/File', 'gcs_object'),\n", + "        ('Mixed Resources', 'mixed')\n", + "    ],\n", + "    value='gcs_bucket',\n", + "    description='Resource Type:',\n", + "    disabled=False\n", ")\n", "\n", "data_size_gb = widgets.FloatText(\n", - " value=1.0,\n", - " description='Data Size (GB):',\n", - " disabled=False\n", + "    value=1.0,\n", + "    description='Data Size (GB):',\n", + "    disabled=False\n", ")\n", "output_size_gb = widgets.FloatText(\n", - " value=0.5,\n", - " description='Output Size (GB):',\n", - " disabled=False\n", + "    value=0.5,\n", + "    description='Output Size (GB):',\n", + "    disabled=False\n", ")\n", "\n", "# BigQuery-specific inputs\n", "bq_queries = widgets.IntText(\n", - " value=1,\n", - " description='# BQ Queries:',\n", - " disabled=False\n", + "    value=1,\n", + "    description='# BQ Queries:',\n", + "    disabled=False\n", ")\n", "bq_data_processed_gb = widgets.FloatText(\n", - " value=1.0,\n", - " description='BQ Data Processed (GB):',\n", - " disabled=False\n", + "    value=1.0,\n", + "    description='BQ Data Processed (GB):',\n", + "    disabled=False\n", ")\n", "\n", "special_resources = widgets.Text(\n", - " value='',\n", - " placeholder='e.g., GPU, highmem',\n", - " description='Special Resources:',\n", - " disabled=False\n", + "    
value='',\n", + "    placeholder='e.g., GPU, highmem',\n", + "    description='Special Resources:',\n", + "    disabled=False\n", ")\n", "\n", "ui = widgets.VBox([\n", - " notebook_file, \n", - " runtime_hours, \n", - " resource_type,\n", - " data_size_gb, \n", - " output_size_gb,\n", - " bq_queries,\n", - " bq_data_processed_gb,\n", - " special_resources\n", + "    notebook_file, \n", + "    runtime_hours, \n", + "    resource_type,\n", + "    data_size_gb, \n", + "    output_size_gb,\n", + "    bq_queries,\n", + "    bq_data_processed_gb,\n", + "    special_resources\n", "])\n", "display(ui)" ] @@ -139,7 +139,7 @@ "default_machine_type = 'n1-standard-4'\n", "default_vcpu = 4\n", "default_ram_gb = 15\n", - "compute_price_per_hour = 0.158 # USD/hr (update if pricing changes)\n", + "compute_price_per_hour = 0.158  # USD/hr (update if pricing changes)\n", "\n", "print(f\"Default machine: {default_machine_type} ({default_vcpu} vCPU, {default_ram_gb} GB RAM)\")\n", "print(f\"Compute price: ${compute_price_per_hour}/hr (us-central1, Nov 2025)\")" @@ -155,7 +155,7 @@ "Estimate storage usage based on the data resources in your Verily Workbench workspace. Workbench supports different types of data resources:\n", "\n", "- **BigQuery Datasets/Tables**: Query-based pricing\n", - "- **Cloud Storage Buckets/Objects**: Storage + data transfer pricing \n", + "- **Cloud Storage Buckets/Objects**: Storage + data transfer pricing  \n", "- **Referenced Resources**: Point to external data (no workspace storage cost)\n", "- **Controlled Resources**: Managed within your workspace\n", "\n", @@ -171,43 +171,43 @@ "source": [ "# Estimate Storage Usage Based on Workspace Resources\n", "# GCP Pricing (us-central1, Nov 2025 - update as needed)\n", - "storage_price_per_gb_month = 0.04 # Cloud Storage Standard\n", - "bq_query_price_per_tb = 6.25 # BigQuery on-demand pricing per TB processed\n", - "bq_storage_price_per_gb_month = 0.02 # BigQuery storage per GB/month\n", + "storage_price_per_gb_month = 0.04  # Cloud Storage Standard\n", + "bq_query_price_per_tb = 6.25  # BigQuery on-demand pricing per TB processed\n", + "bq_storage_price_per_gb_month = 0.02  # BigQuery storage per GB/month\n", "\n", "# Convert monthly to hourly rates\n", "storage_price_per_gb_hour = storage_price_per_gb_month / (30 * 24)\n", "bq_storage_price_per_gb_hour = bq_storage_price_per_gb_month / (30 * 24)\n", "\n", "def estimate_storage_costs(resource_type, data_gb, output_gb, queries, processed_gb, runtime_hrs):\n", - " storage_cost = 0\n", - " query_cost = 0\n", - " \n", - " if resource_type in ['gcs_bucket', 'gcs_object', 'mixed']:\n", - " # Cloud Storage costs for data + output\n", - " total_storage_gb = data_gb + output_gb\n", - " storage_cost = storage_price_per_gb_hour * total_storage_gb * runtime_hrs\n", - " \n", - " elif resource_type in ['bq_dataset', 'bq_table']:\n", - " # BigQuery storage (for controlled datasets) + query costs\n", - " storage_cost = bq_storage_price_per_gb_hour * data_gb * runtime_hrs\n", - " query_cost = (processed_gb / 1000) * bq_query_price_per_tb * queries # Convert GB to TB\n", - " \n", - " elif resource_type == 'mixed':\n", - " # Combination of storage and BigQuery\n", - " gcs_storage = storage_price_per_gb_hour * data_gb * runtime_hrs\n", - " bq_queries = (processed_gb / 1000) * bq_query_price_per_tb * queries\n", - " storage_cost = gcs_storage + bq_queries\n", - " \n", - " return storage_cost, query_cost\n", - "\n", + "    storage_cost = 0\n", + "    query_cost = 0\n", + "    \n", + "    if resource_type in 
['gcs_bucket', 'gcs_object', 'mixed']:\n", + "        # Cloud Storage costs for data + output\n", + "        total_storage_gb = data_gb + output_gb\n", + "        storage_cost = storage_price_per_gb_hour * total_storage_gb * runtime_hrs\n", + "        \n", + "    elif resource_type in ['bq_dataset', 'bq_table']:\n", + "        # BigQuery storage (for controlled datasets) + query costs\n", + "        storage_cost = bq_storage_price_per_gb_hour * data_gb * runtime_hrs\n", + "        query_cost = (processed_gb / 1000) * bq_query_price_per_tb * queries  # Convert GB to TB\n", + "        \n", + "    elif resource_type == 'mixed':\n", + "        # Combination of storage and BigQuery\n", + "        gcs_storage = storage_price_per_gb_hour * data_gb * runtime_hrs\n", + "        bq_queries = (processed_gb / 1000) * bq_query_price_per_tb * queries\n", + "        storage_cost = gcs_storage + bq_queries\n", + "    \n", + "    return storage_cost, query_cost\n", + "    \n", "storage_cost, query_cost = estimate_storage_costs(\n", - " resource_type.value,\n", - " data_size_gb.value,\n", - " output_size_gb.value, \n", - " bq_queries.value,\n", - " bq_data_processed_gb.value,\n", - " runtime_hours.value\n", + "    resource_type.value,\n", + "    data_size_gb.value,\n", + "    output_size_gb.value, \n", + "    bq_queries.value,\n", + "    bq_data_processed_gb.value,\n", + "    runtime_hours.value\n", ")\n", "\n", "total_data_cost = storage_cost + query_cost\n", @@ -215,7 +215,7 @@ "print(f\"Resource type: {resource_type.value}\")\n", "print(f\"Estimated storage cost: ${storage_cost:.4f}\")\n", "if query_cost > 0:\n", - " print(f\"Estimated BigQuery query cost: ${query_cost:.4f}\")\n", + "    print(f\"Estimated BigQuery query cost: ${query_cost:.4f}\")\n", "print(f\"Total data-related cost: ${total_data_cost:.4f}\")" ] }, @@ -238,15 +238,15 @@ "source": [ "# Calculate Approximate Cost\n", "def estimate_total_cost(runtime_hours, storage_cost, query_cost):\n", - " compute_cost = compute_price_per_hour * runtime_hours\n", - " total_data_cost = storage_cost + query_cost\n", - " total_cost = compute_cost + total_data_cost\n", - " return compute_cost, total_data_cost, total_cost\n", + "    compute_cost = compute_price_per_hour * runtime_hours\n", + "    total_data_cost = storage_cost + query_cost\n", + "    total_cost = compute_cost + total_data_cost\n", + "    return compute_cost, total_data_cost, total_cost\n", "\n", "compute_cost, total_data_cost, total_cost = estimate_total_cost(\n", - " runtime_hours.value, \n", - " storage_cost, \n", - " query_cost\n", + "    runtime_hours.value, \n", + "    storage_cost, \n", + "    query_cost\n", ")\n", "\n", "print(f\"Estimated compute cost: ${compute_cost:.2f}\")\n", @@ -255,9 +255,9 @@ "\n", "# Cost breakdown by component\n", "if resource_type.value in ['bq_dataset', 'bq_table'] and query_cost > 0:\n", - " print(f\"\\nData cost breakdown:\")\n", - " print(f\" - Storage: ${storage_cost:.4f}\")\n", - " print(f\" - BigQuery queries: ${query_cost:.4f}\")" + "    print(f\"\\nData cost breakdown:\")\n", + "    print(f\"  - Storage: ${storage_cost:.4f}\")\n", + "    print(f\"  - BigQuery queries: ${query_cost:.4f}\")" ] }, { @@ -285,19 +285,19 @@ "cost_values = [f\"${compute_cost:.2f}\", f\"${total_data_cost:.4f}\", f\"${total_cost:.2f}\"]\n", "\n", "cost_breakdown = pd.DataFrame({\n", - " 'Component': cost_components,\n", - " 'Estimated Cost (USD)': cost_values\n", + "    'Component': cost_components,\n", + "    'Estimated Cost (USD)': cost_values\n", "})\n", "\n", 
"display(cost_breakdown)\n", "\n", "# Detailed explanation\n", "resource_explanation = {\n", - " 'gcs_bucket': 'Cloud Storage bucket',\n", - " 'gcs_object': 'Cloud Storage object/file', \n", - " 'bq_dataset': 'BigQuery dataset',\n", - " 'bq_table': 'BigQuery table',\n", - " 'mixed': 'Mixed resources (Cloud Storage + BigQuery)'\n", + "    'gcs_bucket': 'Cloud Storage bucket',\n", + "    'gcs_object': 'Cloud Storage object/file', \n", + "    'bq_dataset': 'BigQuery dataset',\n", + "    'bq_table': 'BigQuery table',\n", + "    'mixed': 'Mixed resources (Cloud Storage + BigQuery)'\n", "}\n", "\n", "explanation = f\"\"\"\n", @@ -313,7 +313,7 @@ "- Storage price: ${storage_price_per_gb_month}/GB/month (Cloud Storage Standard)\"\"\"\n", "\n", "if resource_type.value in ['bq_dataset', 'bq_table']:\n", - " explanation += f\"\"\"\n", + "    explanation += f\"\"\"\n", "- BigQuery storage: ${bq_storage_price_per_gb_month}/GB/month\n", "- BigQuery queries: ${bq_query_price_per_tb}/TB processed\n", "- Estimated queries: {bq_queries.value}\n", @@ -356,69 +356,69 @@ "from IPython.display import display, HTML\n", "\n", "def run_wb_command(command):\n", - " \"\"\"Run a wb CLI command and return the result\"\"\"\n", - " result = subprocess.run(command, shell=True, capture_output=True, text=True)\n", - " if result.returncode == 0:\n", - " return result.stdout.strip()\n", - " else:\n", - " raise Exception(f\"Command failed: {command}\\nError: {result.stderr}\")\n", + "    \"\"\"Run a wb CLI command and return the result\"\"\"\n", + "    result = subprocess.run(command, shell=True, capture_output=True, text=True)\n", + "    if result.returncode == 0:\n", + "        return result.stdout.strip()\n", + "    else:\n", + "        raise Exception(f\"Command failed: {command}\\nError: {result.stderr}\")\n", "\n", "def list_workspace_resources():\n", - " \"\"\"List all resources in the current workspace\"\"\"\n", - " print(\"🔍 Discovering workspace resources...\")\n", - " \n", - " # List all resources in JSON format for easier parsing\n", - " resources_json = run_wb_command(\"wb resource list --format=JSON\")\n", - " \n", - " resources = json.loads(resources_json)\n", - " if not resources:\n", - " print(\"No resources found in this workspace.\")\n", - " return pd.DataFrame()\n", - " \n", - " # Create a summary table\n", - " resource_data = []\n", - " for resource in resources:\n", - " resource_data.append({\n", - " 'Name': resource.get('name', 'Unknown'),\n", - " 'Type': resource.get('resourceType', 'Unknown'),\n", - " 'Stewardship': resource.get('stewardshipType', 'Unknown'),\n", - " 'Description': resource.get('description', '')[:50] + '...' 
if len(resource.get('description', '')) > 50 else resource.get('description', ''),\n", - " 'Cloud Resource': resource.get('cloudName', 'Unknown')\n", - " })\n", - " \n", - " df = pd.DataFrame(resource_data)\n", - " print(f\"\\n✅ Found {len(resources)} resources in your workspace:\")\n", - " display(df)\n", - " return df\n", - "\n", + "    \"\"\"List all resources in the current workspace\"\"\"\n", + "    print(\"🔍 Discovering workspace resources...\")\n", + "    \n", + "    # List all resources in JSON format for easier parsing\n", + "    resources_json = run_wb_command(\"wb resource list --format=JSON\")\n", + "    \n", + "    resources = json.loads(resources_json)\n", + "    if not resources:\n", + "        print(\"No resources found in this workspace.\")\n", + "        return pd.DataFrame()\n", + "    \n", + "    # Create a summary table\n", + "    resource_data = []\n", + "    for resource in resources:\n", + "        resource_data.append({\n", + "            'Name': resource.get('name', 'Unknown'),\n", + "            'Type': resource.get('resourceType', 'Unknown'),\n", + "            'Stewardship': resource.get('stewardshipType', 'Unknown'),\n", + "            'Description': resource.get('description', '')[:50] + '...' if len(resource.get('description', '')) > 50 else resource.get('description', ''),\n", + "            'Cloud Resource': resource.get('cloudName', 'Unknown')\n", + "        })\n", + "    \n", + "    df = pd.DataFrame(resource_data)\n", + "    print(f\"\\n✅ Found {len(resources)} resources in your workspace:\")\n", + "    display(df)\n", + "    return df\n", + "    \n", "def get_resource_details(resource_name):\n", - " \"\"\"Get detailed information about a specific resource\"\"\"\n", - " print(f\"📋 Getting details for resource: {resource_name}\")\n", - " \n", - " details = run_wb_command(f\"wb resource describe --name={resource_name} --format=JSON\")\n", - " resource_info = json.loads(details)\n", - " \n", - " print(f\"\\n**Resource:** {resource_info.get('name', 'Unknown')}\")\n", - " print(f\"**Type:** {resource_info.get('resourceType', 'Unknown')}\")\n", - " print(f\"**Stewardship:** {resource_info.get('stewardshipType', 'Unknown')}\")\n", - " print(f\"**Cloud Resource:** {resource_info.get('cloudName', 'Unknown')}\")\n", - " \n", - " # For GCS buckets, we can try to get size info\n", - " if resource_info.get('resourceType') == 'GCS_BUCKET':\n", - " bucket_name = resource_info.get('cloudName', '')\n", - " if bucket_name:\n", - " print(f\"**Bucket:** {bucket_name}\")\n", - " print(f\"💡 Use get_gcs_bucket_size('{bucket_name}') to get actual size\")\n", - " \n", - " # For BigQuery datasets, show how to get table info\n", - " elif resource_info.get('resourceType') == 'BQ_DATASET':\n", - " dataset_name = resource_info.get('cloudName', '')\n", - " if dataset_name:\n", - " print(f\"**Dataset:** {dataset_name}\")\n", - " print(f\"💡 Use get_bigquery_dataset_size('{dataset_name}') to get actual size\")\n", - " \n", - " return resource_info\n", - "\n", + "    \"\"\"Get detailed information about a specific resource\"\"\"\n", + "    print(f\"📋 Getting details for resource: {resource_name}\")\n", + "    \n", + "    details = run_wb_command(f\"wb resource describe --name={resource_name} --format=JSON\")\n", + "    resource_info = json.loads(details)\n", + "    \n", + "    print(f\"\\n**Resource:** {resource_info.get('name', 'Unknown')}\")\n", + "    print(f\"**Type:** {resource_info.get('resourceType', 'Unknown')}\")\n", + "    print(f\"**Stewardship:** 
{resource_info.get('stewardshipType', 'Unknown')}\")\n", + "    print(f\"**Cloud Resource:** {resource_info.get('cloudName', 'Unknown')}\")\n", + "    \n", + "    # For GCS buckets, we can try to get size info\n", + "    if resource_info.get('resourceType') == 'GCS_BUCKET':\n", + "        bucket_name = resource_info.get('cloudName', '')\n", + "        if bucket_name:\n", + "            print(f\"**Bucket:** {bucket_name}\")\n", + "            print(f\"💡 Use get_gcs_bucket_size('{bucket_name}') to get actual size\")\n", + "    \n", + "    # For BigQuery datasets, show how to get table info\n", + "    elif resource_info.get('resourceType') == 'BQ_DATASET':\n", + "        dataset_name = resource_info.get('cloudName', '')\n", + "        if dataset_name:\n", + "            print(f\"**Dataset:** {dataset_name}\")\n", + "            print(f\"💡 Use get_bigquery_dataset_size('{dataset_name}') to get actual size\")\n", + "    \n", + "    return resource_info\n", + "    \n", "# Automatically run the resource discovery\n", "resources_df = list_workspace_resources()" ] @@ -433,17 +433,17 @@ "# Get detailed information for specific resources\n", "print(\"📋 Available resources in your workspace:\")\n", "if not resources_df.empty:\n", - " for i, name in enumerate(resources_df['Name'], 1):\n", - " resource_type = resources_df.iloc[i-1]['Type']\n", - " stewardship = resources_df.iloc[i-1]['Stewardship']\n", - " print(f\" {i}. {name} ({resource_type}, {stewardship})\")\n", - " \n", - " print(f\"\\n💡 To get detailed information about a resource, use:\")\n", - " print(f\" resource_details = get_resource_details('resource-name')\")\n", - " print(f\"\\n💡 Example: get_resource_details('{resources_df.iloc[0]['Name']}')\")\n", + "    for i, name in enumerate(resources_df['Name'], 1):\n", + "        resource_type = resources_df.iloc[i-1]['Type']\n", + "        stewardship = resources_df.iloc[i-1]['Stewardship']\n", + "        print(f\"   {i}. 
{name} ({resource_type}, {stewardship})\")\n", + "    \n", + "    print(f\"\\n💡 To get detailed information about a resource, use:\")\n", + "    print(f\"   resource_details = get_resource_details('resource-name')\")\n", + "    print(f\"\\n💡 Example: get_resource_details('{resources_df.iloc[0]['Name']}')\")\n", "else:\n", - " print(\" No resources found in this workspace.\")\n", - " print(\" Make sure you're running this in a Verily Workbench JupyterLab environment with resources configured.\")" + "    print(\"   No resources found in this workspace.\")\n", + "    print(\"   Make sure you're running this in a Verily Workbench JupyterLab environment with resources configured.\")" ] }, { @@ -455,210 +455,113 @@ "source": [ "# Get Cloud Storage bucket sizes and BigQuery dataset information\n", "def get_gcs_bucket_size(bucket_name):\n", - " \"\"\"Get the size of a GCS bucket using gsutil via wb CLI\"\"\"\n", - " print(f\"📏 Getting size for bucket: {bucket_name}\")\n", - " \n", - " # Get bucket size in bytes\n", - " size_output = run_wb_command(f\"wb gsutil du -s gs://{bucket_name}\")\n", - " \n", - " # Parse the size (gsutil du returns: size_bytes gs://bucket_name)\n", - " if size_output:\n", - " size_bytes = int(size_output.split()[0])\n", - " size_gb = size_bytes / (1024**3) # Convert to GB\n", - " \n", - " print(f\"Bucket size: {size_gb:.2f} GB ({size_bytes:,} bytes)\")\n", - " return size_gb\n", - " else:\n", - " print(f\"Could not determine bucket size\")\n", - " return 0\n", + "    \"\"\"Get the size of a GCS bucket using gsutil via wb CLI\"\"\"\n", + "    print(f\"📏 Getting size for bucket: {bucket_name}\")\n", + "    \n", + "    # Get bucket size in bytes\n", + "    size_output = run_wb_command(f\"wb gsutil du -s gs://{bucket_name}\")\n", + "    \n", + "    # Parse the size (gsutil du returns: size_bytes gs://bucket_name)\n", + "    if size_output:\n", + "        size_bytes = int(size_output.split()[0])\n", + "        size_gb = size_bytes / (1024**3)  # Convert to GB\n", + "        \n", + "        print(f\"Bucket size: {size_gb:.2f} GB ({size_bytes:,} bytes)\")\n", + "        return size_gb\n", + "    else:\n", + "        print(f\"Could not determine bucket size\")\n", + "        return 0\n", "\n", "def get_bigquery_dataset_size(project_dataset):\n", - " \"\"\"Get BigQuery dataset size and table information\"\"\"\n", - " print(f\"📊 Getting BigQuery dataset info: {project_dataset}\")\n", - " \n", - " # Split project.dataset if needed\n", - " if '.' 
not in project_dataset:\n", - " print(\"⚠️ Dataset name should be in format 'project.dataset' or provide the full dataset reference\")\n", - " return 0\n", - " \n", - " project, dataset = project_dataset.split('.', 1)\n", - " \n", - " # Query to get table sizes\n", - " sql_query = f\"\"\"\n", - " SELECT \n", - " table_name,\n", - " ROUND(size_bytes/1024/1024/1024, 2) as size_gb,\n", - " row_count,\n", - " creation_time\n", - " FROM `{project}.{dataset}.INFORMATION_SCHEMA.TABLES`\n", - " WHERE table_type = 'BASE TABLE'\n", - " ORDER BY size_bytes DESC\n", - " \"\"\"\n", - " \n", - " print(\"Running BigQuery size analysis...\")\n", - " query_result = run_wb_command(f'wb bq query --sql=\"{sql_query}\" --format=JSON')\n", - " \n", - " tables = json.loads(query_result)\n", - " \n", - " if tables:\n", - " print(f\"\\n📋 Tables in {project_dataset}:\")\n", - " table_data = []\n", - " total_size_gb = 0\n", - " \n", - " for table in tables:\n", - " size_gb = float(table.get('size_gb', 0))\n", - " total_size_gb += size_gb\n", - " table_data.append({\n", - " 'Table': table.get('table_name', ''),\n", - " 'Size (GB)': size_gb,\n", - " 'Rows': table.get('row_count', 0),\n", - " 'Created': table.get('creation_time', '')[:10] # Just date\n", - " })\n", - " \n", - " df = pd.DataFrame(table_data)\n", - " display(df)\n", - " print(f\"\\n**Total dataset size: {total_size_gb:.2f} GB**\")\n", - " return total_size_gb\n", - " else:\n", - " print(\"No tables found in dataset\")\n", - " return 0\n", + "    \"\"\"Get BigQuery dataset size and table information\"\"\"\n", + "    print(f\"📊 Getting BigQuery dataset info: {project_dataset}\")\n", + "    \n", + "    # Split project.dataset if needed\n", + "    if '.' not in project_dataset:\n", + "        print(\"⚠️ Dataset name should be in format 'project.dataset' or provide the full dataset reference\")\n", + "        return 0\n", + "    \n", + "    project, dataset = project_dataset.split('.', 1)\n", + "    \n", + "    # Query to get table sizes\n", + "    sql_query = f\"\"\"\n", + "    SELECT \n", + "        table_name,\n", + "        ROUND(size_bytes/1024/1024/1024, 2) as size_gb,\n", + "        row_count,\n", + "        creation_time\n", + "    FROM `{project}.{dataset}.INFORMATION_SCHEMA.TABLES`\n", + "    WHERE table_type = 'BASE TABLE'\n", + "    ORDER BY size_bytes DESC\n", + "    \"\"\"\n", + "    \n", + "    print(\"Running BigQuery size analysis...\")\n", + "    query_result = run_wb_command(f'wb bq query --sql=\"{sql_query}\" --format=JSON')\n", + "    \n", + "    tables = json.loads(query_result)\n", + "    \n", + "    if tables:\n", + "        print(f\"\\n📋 Tables in {project_dataset}:\")\n", + "        table_data = []\n", + "        total_size_gb = 0\n", + "        \n", + "        for table in tables:\n", + "            size_gb = float(table.get('size_gb', 0))\n", + "            total_size_gb += size_gb\n", + "            table_data.append({\n", + "                'Table': table.get('table_name', ''),\n", + "                'Size (GB)': size_gb,\n", + "                'Rows': table.get('row_count', 0),\n", + "                'Created': table.get('creation_time', '')[:10]  # Just date\n", + "            })\n", + "        \n", + "        df = pd.DataFrame(table_data)\n", + "        display(df)\n", + "        print(f\"\\n**Total dataset size: {total_size_gb:.2f} GB**\")\n", + "        return total_size_gb\n", + "    else:\n", + "        print(\"No tables found in dataset\")\n", + "        return 0\n", "\n", "# Auto-detect and analyze resources with actual 
sizes\n", "print(\"🔍 Analyzing discovered resources for accurate size information...\")\n", "\n", "for _, resource in resources_df.iterrows():\n", - " resource_name = resource['Name']\n", - " resource_type = resource['Type']\n", - " cloud_resource = resource['Cloud Resource']\n", - " \n", - " print(f\"\\n🔹 **{resource_name}** ({resource_type})\")\n", - " \n", - " try:\n", - " if resource_type == 'GCS_BUCKET' and cloud_resource != 'Unknown':\n", - " actual_size = get_gcs_bucket_size(cloud_resource)\n", - " print(f\" ✅ Actual size determined: {actual_size:.2f} GB\")\n", - " \n", - " elif resource_type == 'BQ_DATASET' and cloud_resource != 'Unknown':\n", - " actual_size = get_bigquery_dataset_size(cloud_resource)\n", - " print(f\" ✅ Actual size determined: {actual_size:.2f} GB\")\n", - " \n", - " else:\n", - " print(f\" ℹ️ Size analysis not available for {resource_type}\")\n", - " \n", - " except Exception as e:\n", - " print(f\" ⚠️ Could not determine size: {str(e)}\")\n", - "\n", - "print(f\"\\n💡 You can also call these functions manually:\")\n", - "print(f\" get_gcs_bucket_size('bucket-name')\")\n", - "print(f\" get_bigquery_dataset_size('project.dataset')\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1dcb89fe", - "metadata": {}, - "outputs": [], - "source": [ - "# Automated Cost Estimation with Discovered Resources\n", - "def estimate_costs_from_resources():\n", - " \"\"\"Estimate costs based on discovered workspace resources\"\"\"\n", - " print(\"🎯 Automated Cost Estimation Based on Your Workspace Resources\")\n", - " print(\"=\" * 60)\n", - " \n", - " if resources_df.empty:\n", - " print(\"No resources discovered in this workspace.\")\n", - " return 0\n", - " \n", - " total_estimated_cost = 0\n", - " total_storage_gb = 0\n", - " total_query_costs = 0\n", - " \n", - " print(\"\\n📊 Analyzing each resource:\")\n", - " \n", - " for _, resource in resources_df.iterrows():\n", - " resource_name = resource['Name']\n", - " resource_type = resource['Type']\n", - " stewardship = resource['Stewardship']\n", - " cloud_resource = resource['Cloud Resource']\n", - " \n", - " print(f\"\\n🔹 **{resource_name}** ({resource_type}, {stewardship})\")\n", - " \n", - " estimated_size = 0\n", - " estimated_queries = 0\n", - " \n", - " # Try to get actual sizes where possible\n", - " try:\n", - " if resource_type == 'GCS_BUCKET' and cloud_resource != 'Unknown':\n", - " estimated_size = get_gcs_bucket_size(cloud_resource)\n", - " print(f\" \udccf Actual size: {estimated_size:.2f} GB\")\n", - " \n", - " elif resource_type == 'BQ_DATASET' and cloud_resource != 'Unknown':\n", - " estimated_size = get_bigquery_dataset_size(cloud_resource)\n", - " estimated_queries = max(1, int(estimated_size / 10)) # Assume 1 query per 10GB\n", - " print(f\" 📏 Actual size: {estimated_size:.2f} GB\")\n", - " print(f\" \udd0d Estimated queries: {estimated_queries}\")\n", - " \n", - " elif resource_type == 'GCS_OBJECT':\n", - " estimated_size = 0.1 # Assume 100MB for individual objects\n", - " print(f\" \udccf Estimated size: {estimated_size} GB\")\n", - " \n", - " elif resource_type == 'BQ_TABLE':\n", - " estimated_size = 0.5 # Assume 500MB for individual tables\n", - " estimated_queries = 1\n", - " print(f\" 📏 Estimated size: {estimated_size} GB\")\n", - " print(f\" 🔍 Estimated queries: {estimated_queries}\")\n", - " \n", - " else:\n", - " print(f\" ℹ️ Resource type {resource_type} not included in cost calculation\")\n", - " continue\n", - " \n", - " except Exception as e:\n", - " print(f\" ⚠️ Could not 
get actual size, using estimates: {str(e)}\")\n", - " # Fall back to estimates\n", - " if resource_type == 'GCS_BUCKET':\n", - " estimated_size = 5.0\n", - " elif resource_type == 'BQ_DATASET':\n", - " estimated_size = 2.0\n", - " estimated_queries = 5\n", - " elif resource_type in ['GCS_OBJECT', 'BQ_TABLE']:\n", - " estimated_size = 1.0\n", - " estimated_queries = 1 if resource_type == 'BQ_TABLE' else 0\n", - " \n", - " print(f\" 📏 Estimated size: {estimated_size} GB\")\n", - " if estimated_queries > 0:\n", - " print(f\" 🔍 Estimated queries: {estimated_queries}\")\n", - " \n", - " # Calculate costs for this resource\n", - " if resource_type in ['GCS_BUCKET', 'GCS_OBJECT']:\n", - " storage_cost = storage_price_per_gb_hour * estimated_size * runtime_hours.value\n", - " total_estimated_cost += storage_cost\n", - " total_storage_gb += estimated_size\n", - " print(f\" 💰 Storage cost: ${storage_cost:.4f}\")\n", - " \n", - " elif resource_type in ['BQ_DATASET', 'BQ_TABLE']:\n", - " bq_storage_cost = bq_storage_price_per_gb_hour * estimated_size * runtime_hours.value\n", - " query_cost = (estimated_size / 1000) * bq_query_price_per_tb * estimated_queries\n", - " total_estimated_cost += bq_storage_cost + query_cost\n", - " total_storage_gb += estimated_size\n", - " total_query_costs += query_cost\n", - " print(f\" 💰 Storage cost: ${bq_storage_cost:.4f}\")\n", - " print(f\" 💰 Query cost: ${query_cost:.4f}\")\n", - " \n", - " # Add compute costs\n", - " compute_cost = compute_price_per_hour * runtime_hours.value\n", - " total_estimated_cost += compute_cost\n", - " \n", - " print(f\"\\n\" + \"=\" * 60)\n", - " print(f\"📋 **TOTAL COST ESTIMATE FROM WORKSPACE RESOURCES**\")\n", - " print(f\" 💻 Compute cost ({runtime_hours.value}h): ${compute_cost:.2f}\")\n", - " print(f\" 💾 Storage cost ({total_storage_gb:.1f}GB): ${total_estimated_cost - compute_cost - total_query_costs:.4f}\")\n", - " if total_query_costs > 0:\n", - " print(f\" 🔍 Query costs: ${total_query_costs:.4f}\")\n", - " print(f\" 🎯 **TOTAL: ${total_estimated_cost:.2f}**\")\n", - " \n", - " return total_estimated_cost\n", - "\n", + "    resource_name = resource['Name']\n", + "    resource_type = resource['Type']\n", + "    cloud_resource = resource['Cloud Resource']\n", + "    \n", + "    print(f\"\\n🔹 **{resource_name}** ({resource_type})\")\n", + "    \n", + "    try:\n", + "        if resource_type == 'GCS_BUCKET' and cloud_resource != 'Unknown':\n", + "            actual_size = get_gcs_bucket_size(cloud_resource)\n", + "            print(f\"   ✅ Actual size determined: {actual_size:.2f} GB\")\n", + "            \n", + "        elif resource_type == 'BQ_DATASET' and cloud_resource != 'Unknown':\n", + "            actual_size = get_bigquery_dataset_size(cloud_resource)\n", + "            print(f\"   ✅ Actual size determined: {actual_size:.2f} GB\")\n", + "            \n", + "        else:\n", + "            print(f\"   ℹ️ Size analysis not available for {resource_type}\")\n", + "            \n", + "    except Exception as e:\n", + "        print(f\"   ⚠️ Could not determine size: {str(e)}\")\n", + "    \n", + "    # Add compute costs\n", + "    compute_cost = compute_price_per_hour * runtime_hours.value\n", + "    total_estimated_cost += compute_cost\n", + "    \n", + "    print(f\"\\n\" + \"=\" * 60)\n", + "    print(f\"📋 **TOTAL COST ESTIMATE FROM WORKSPACE RESOURCES**\")\n", + "    print(f\"   💻 Compute cost ({runtime_hours.value}h): ${compute_cost:.2f}\")\n", + "    print(f\"   💾 Storage cost ({total_storage_gb:.1f}GB): 
${total_estimated_cost - compute_cost - total_query_costs:.4f}\")\n", + "    if total_query_costs > 0:\n", + "        print(f\"   🔍 Query costs: ${total_query_costs:.4f}\")\n", + "    print(f\"   🎯 **TOTAL: ${total_estimated_cost:.2f}**\")\n", + "    \n", + "    return total_estimated_cost\n", + "    \n", "# Run automated estimation with discovered resources\n", "print(\"\\n🚀 Running automated cost estimation...\")\n", "automated_cost = estimate_costs_from_resources()"