diff --git a/infra/cloud_run.tf b/infra/cloud_run.tf index b6220f0..53deb7f 100644 --- a/infra/cloud_run.tf +++ b/infra/cloud_run.tf @@ -29,11 +29,11 @@ locals { ar_image_base = var.artifact_registry image = { - capture = "${local.ar_image_base}/${var.image_earnings_capture}:${var.image_tag}" - stt = "${local.ar_image_base}/${var.image_earnings_stt}:${var.image_tag}" - rolefact = "${local.ar_image_base}/${var.image_earnings_rolefact}:${var.image_tag}" - serving = "${local.ar_image_base}/${var.image_earnings_serving}:${var.image_tag}" - wx_serving = "${local.ar_image_base}/${var.image_weather_serving}:${var.image_tag}" + capture = "${local.ar_image_base}/${var.image_earnings_capture}:${var.image_tag}" + stt = "${local.ar_image_base}/${var.image_earnings_stt}:${var.image_tag}" + rolefact = "${local.ar_image_base}/${var.image_earnings_rolefact}:${var.image_tag}" + serving = "${local.ar_image_base}/${var.image_earnings_serving}:${var.image_tag}" + wx_serving = "${local.ar_image_base}/${var.image_weather_serving}:${var.image_tag}" } # R2 endpoint host: https://.r2.cloudflarestorage.com. The account @@ -59,14 +59,25 @@ resource "google_cloud_run_v2_service" "earnings_serving" { name = "earnings-serving" location = var.serving_region - # Pin to exactly one instance for the single-instance SSE fan-out (H2). The - # scheduler.tf live-window job flips min 0<->1; max is always 1. + # Service-level min floor. scheduler.tf PATCHes THIS field + # (updateMask=scaling.minInstanceCount) to flip the live-window warm/cool + # 0<->1, so min MUST stay at the service level: a revision-level + # template.scaling.min_instance_count is invisible to the scheduler and would + # pin the SSE instance warm 24/7 (the cool-down never scales it to zero). + # max_instance_count lives in template.scaling below (this service-level block does not accept it under the pinned provider). + # NOTE(review): confirm lifecycle ignore_changes covers this min_instance_count; otherwise each apply resets the scheduler's value to 1.
scaling { min_instance_count = 1 - max_instance_count = 1 } template { + # Pin max to exactly one instance for the single-instance SSE fan-out (H2): + # two instances would split-brain the in-process fan-out. max_instance_count + # is set at the revision level (the service-level scaling block does not accept it under the pinned provider). + scaling { + max_instance_count = 1 + } + # Session affinity so a reconnecting EventSource sticks to the one instance # holding the ring buffer (H2/H3 Last-Event-ID replay). session_affinity = true @@ -171,12 +182,14 @@ resource "google_cloud_run_v2_service" "weather_serving" { name = "weather-serving" location = var.serving_region - scaling { - min_instance_count = 0 - max_instance_count = var.serving_rest_max_instances - } - template { + # max_instance_count lives in template.scaling (the top-level scaling block + # does not accept it under the pinned provider); min stays 0 for idle-cheap. + scaling { + min_instance_count = 0 + max_instance_count = var.serving_rest_max_instances + } + service_account = google_service_account.serving.email containers { @@ -280,12 +293,15 @@ resource "google_cloud_run_v2_service" "stt" { provider = google-beta launch_stage = "BETA" - scaling { - min_instance_count = 0 - max_instance_count = var.stt_max_concurrency - } - template { + # Scale-to-zero (min=0); max bounded to the confirmed L4 quota (H8). + # max_instance_count lives in template.scaling (the top-level scaling block + # does not accept it under the pinned provider). + scaling { + min_instance_count = 0 + max_instance_count = var.stt_max_concurrency + } + service_account = google_service_account.earnings_stt.email gpu_zonal_redundancy_disabled = true diff --git a/infra/scheduler.tf b/infra/scheduler.tf index 9d81121..5758c11 100644 --- a/infra/scheduler.tf +++ b/infra/scheduler.tf @@ -77,7 +77,10 @@ resource "google_cloud_run_v2_service_iam_member" "sched_sse_developer" { locals { # Cloud Run Admin API endpoint to PATCH the serving service's scaling.
The - # scheduler bodies set min-instances via the annotation; max stays 1. + # updateMask restricts the patch to the service-level scaling.minInstanceCount + # ONLY; max is pinned at the revision level in cloud_run.tf (template.scaling) + # and is excluded by that updateMask, so the bodies below send just + # minInstanceCount. serving_admin_url = "https://run.googleapis.com/v2/projects/${google_project.serving.project_id}/locations/${var.serving_region}/services/${google_cloud_run_v2_service.earnings_serving.name}?updateMask=scaling.minInstanceCount" } @@ -92,7 +95,7 @@ resource "google_cloud_scheduler_job" "sse_warm" { http_target { http_method = "PATCH" uri = local.serving_admin_url - body = base64encode("{\"scaling\":{\"minInstanceCount\":1,\"maxInstanceCount\":1}}") + body = base64encode("{\"scaling\":{\"minInstanceCount\":1}}") headers = { "Content-Type" = "application/json" @@ -115,7 +118,7 @@ resource "google_cloud_scheduler_job" "sse_cool" { http_target { http_method = "PATCH" uri = local.serving_admin_url - body = base64encode("{\"scaling\":{\"minInstanceCount\":0,\"maxInstanceCount\":1}}") + body = base64encode("{\"scaling\":{\"minInstanceCount\":0}}") headers = { "Content-Type" = "application/json"