mostlyrightmd · helloiamvu · Jul 3, 2026 · Jul 3, 2026 · Jul 3, 2026 · Jul 3, 2026
diff --git a/infra/cloud_run.tf b/infra/cloud_run.tf
@@ -29,11 +29,11 @@ locals {
   ar_image_base = var.artifact_registry
 
   image = {
-    capture     = "${local.ar_image_base}/${var.image_earnings_capture}:${var.image_tag}"
-    stt         = "${local.ar_image_base}/${var.image_earnings_stt}:${var.image_tag}"
-    rolefact    = "${local.ar_image_base}/${var.image_earnings_rolefact}:${var.image_tag}"
-    serving     = "${local.ar_image_base}/${var.image_earnings_serving}:${var.image_tag}"
-    wx_serving  = "${local.ar_image_base}/${var.image_weather_serving}:${var.image_tag}"
+    capture    = "${local.ar_image_base}/${var.image_earnings_capture}:${var.image_tag}"
+    stt        = "${local.ar_image_base}/${var.image_earnings_stt}:${var.image_tag}"
+    rolefact   = "${local.ar_image_base}/${var.image_earnings_rolefact}:${var.image_tag}"
+    serving    = "${local.ar_image_base}/${var.image_earnings_serving}:${var.image_tag}"
+    wx_serving = "${local.ar_image_base}/${var.image_weather_serving}:${var.image_tag}"
   }
 
   # R2 endpoint host: https://<account-id>.r2.cloudflarestorage.com. The account
@@ -59,14 +59,25 @@ resource "google_cloud_run_v2_service" "earnings_serving" {
   name     = "earnings-serving"
   location = var.serving_region
 
-  # Pin to exactly one instance for the single-instance SSE fan-out (H2). The
-  # scheduler.tf live-window job flips min 0<->1; max is always 1.
+  # Service-level minimum instance count. scheduler.tf PATCHes THIS field
+  # (updateMask=scaling.minInstanceCount) to flip the live-window warm/cool
+  # state 0<->1, so min MUST stay at the service level: a revision-level
+  # template.scaling.min_instance_count is not touched by that PATCH, so the
+  # cool-down job could never scale the SSE instance to zero (warm 24/7).
+  # max_instance_count is not accepted by the service-level scaling block under
+  # the pinned provider; it is set in template.scaling below.
   scaling {
     min_instance_count = 1
-    max_instance_count = 1
   }
 
   template {
+    # Pin max to exactly one instance for the single-instance SSE fan-out (H2):
+    # a second instance would split the in-process fan-out state. Under the
+    # pinned provider max_instance_count is only accepted at the revision level.
+    scaling {
+      max_instance_count = 1
+    }
+
     # Session affinity so a reconnecting EventSource sticks to the one instance
     # holding the ring buffer (H2/H3 Last-Event-ID replay).
     session_affinity = true
@@ -171,12 +182,14 @@ resource "google_cloud_run_v2_service" "weather_serving" {
   name     = "weather-serving"
   location = var.serving_region
 
-  scaling {
-    min_instance_count = 0
-    max_instance_count = var.serving_rest_max_instances
-  }
-
   template {
+    # Scale-to-zero (min=0). max_instance_count lives in template.scaling (the
+    # top-level scaling block does not accept it under the pinned provider).
+    scaling {
+      min_instance_count = 0
+      max_instance_count = var.serving_rest_max_instances
+    }
+
     service_account = google_service_account.serving.email
 
     containers {
@@ -280,12 +293,15 @@ resource "google_cloud_run_v2_service" "stt" {
   provider     = google-beta
   launch_stage = "BETA"
 
-  scaling {
-    min_instance_count = 0
-    max_instance_count = var.stt_max_concurrency
-  }
-
   template {
+    # Scale-to-zero (min=0); max bounded to the confirmed L4 quota (H8).
+    # max_instance_count lives in template.scaling (the top-level scaling block
+    # does not accept it under the pinned provider).
+    scaling {
+      min_instance_count = 0
+      max_instance_count = var.stt_max_concurrency
+    }
+
     service_account               = google_service_account.earnings_stt.email
     gpu_zonal_redundancy_disabled = true
 

diff --git a/infra/scheduler.tf b/infra/scheduler.tf
@@ -77,7 +77,10 @@ resource "google_cloud_run_v2_service_iam_member" "sched_sse_developer" {
 
 locals {
   # Cloud Run Admin API endpoint to PATCH the serving service's scaling. The
-  # scheduler bodies set min-instances via the annotation; max stays 1.
+  # updateMask restricts the patch to the service-level scaling.minInstanceCount
+  # ONLY; max is pinned at the revision level in cloud_run.tf (template.scaling)
+  # and is outside the mask, so the scheduler job bodies send just
+  # minInstanceCount.
   serving_admin_url = "https://run.googleapis.com/v2/projects/${google_project.serving.project_id}/locations/${var.serving_region}/services/${google_cloud_run_v2_service.earnings_serving.name}?updateMask=scaling.minInstanceCount"
 }
 
@@ -92,7 +95,7 @@ resource "google_cloud_scheduler_job" "sse_warm" {
   http_target {
     http_method = "PATCH"
     uri         = local.serving_admin_url
-    body        = base64encode("{\"scaling\":{\"minInstanceCount\":1,\"maxInstanceCount\":1}}")
+    body        = base64encode("{\"scaling\":{\"minInstanceCount\":1}}")
 
     headers = {
       "Content-Type" = "application/json"
@@ -115,7 +118,7 @@ resource "google_cloud_scheduler_job" "sse_cool" {
   http_target {
     http_method = "PATCH"
     uri         = local.serving_admin_url
-    body        = base64encode("{\"scaling\":{\"minInstanceCount\":0,\"maxInstanceCount\":1}}")
+    body        = base64encode("{\"scaling\":{\"minInstanceCount\":0}}")
 
     headers = {
       "Content-Type" = "application/json"