From 08aafadb655eab2247dc79cfa9a73f181a2614cb Mon Sep 17 00:00:00 2001
From: helloiamvu <vuhcze@gmail.com>
Date: Fri, 3 Jul 2026 21:48:36 +0200
Subject: [PATCH 1/3] fix(28): move Cloud Run max_instance_count into
 template.scaling for provider 6.50.0

The pinned hashicorp/google 6.50.0 schema does not accept max_instance_count
in the TOP-LEVEL `scaling` block of google_cloud_run_v2_service (that block
only supports min_instance_count / manual_instance_count / scaling_mode);
the autoscaling max belongs in `template { scaling { ... } }`. `tofu validate`
failed with "An argument named max_instance_count is not expected here" for
all three services, blocking the Phase 28 `tofu apply`.

Move min+max_instance_count into each service's template.scaling block,
keeping the same values (earnings_serving min=1/max=1, weather_serving
min=0/max=var, stt min=0/max=var). earnings_serving had the same bug, not
just weather_serving and stt. No values change, but min_instance_count
becomes a revision-level setting instead of a service-level one.

`tofu -chdir=infra validate` now reports 0 cloud_run.tf errors.

This bug is pre-existing on main (PR #92) and is independent of the
deploy-runtime-layer branch.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 infra/cloud_run.tf | 50 ++++++++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 22 deletions(-)
diff --git a/infra/cloud_run.tf b/infra/cloud_run.tf
index b6220f0..c912ea0 100644
--- a/infra/cloud_run.tf
+++ b/infra/cloud_run.tf
@@ -29,11 +29,11 @@ locals {
   ar_image_base = var.artifact_registry
 
   image = {
-    capture     = "${local.ar_image_base}/${var.image_earnings_capture}:${var.image_tag}"
-    stt         = "${local.ar_image_base}/${var.image_earnings_stt}:${var.image_tag}"
-    rolefact    = "${local.ar_image_base}/${var.image_earnings_rolefact}:${var.image_tag}"
-    serving     = "${local.ar_image_base}/${var.image_earnings_serving}:${var.image_tag}"
-    wx_serving  = "${local.ar_image_base}/${var.image_weather_serving}:${var.image_tag}"
+    capture    = "${local.ar_image_base}/${var.image_earnings_capture}:${var.image_tag}"
+    stt        = "${local.ar_image_base}/${var.image_earnings_stt}:${var.image_tag}"
+    rolefact   = "${local.ar_image_base}/${var.image_earnings_rolefact}:${var.image_tag}"
+    serving    = "${local.ar_image_base}/${var.image_earnings_serving}:${var.image_tag}"
+    wx_serving = "${local.ar_image_base}/${var.image_weather_serving}:${var.image_tag}"
   }
 
   # R2 endpoint host: https://<account-id>.r2.cloudflarestorage.com. The account
@@ -59,14 +59,15 @@ resource "google_cloud_run_v2_service" "earnings_serving" {
   name     = "earnings-serving"
   location = var.serving_region
 
-  # Pin to exactly one instance for the single-instance SSE fan-out (H2). The
-  # scheduler.tf live-window job flips min 0<->1; max is always 1.
-  scaling {
-    min_instance_count = 1
-    max_instance_count = 1
-  }
-
   template {
+    # Pin to exactly one instance for the single-instance SSE fan-out (H2). The
+    # scheduler.tf live-window job flips min 0<->1; max is always 1. max_instance_count
+    # lives in template.scaling (the top-level scaling block does not accept it).
+    scaling {
+      min_instance_count = 1
+      max_instance_count = 1
+    }
+
     # Session affinity so a reconnecting EventSource sticks to the one instance
     # holding the ring buffer (H2/H3 Last-Event-ID replay).
     session_affinity = true
@@ -171,12 +172,14 @@ resource "google_cloud_run_v2_service" "weather_serving" {
   name     = "weather-serving"
   location = var.serving_region
 
-  scaling {
-    min_instance_count = 0
-    max_instance_count = var.serving_rest_max_instances
-  }
-
   template {
+    # max_instance_count lives in template.scaling (the top-level scaling block
+    # does not accept it under the pinned provider); min stays 0 for idle-cheap.
+    scaling {
+      min_instance_count = 0
+      max_instance_count = var.serving_rest_max_instances
+    }
+
     service_account = google_service_account.serving.email
 
     containers {
@@ -280,12 +283,15 @@ resource "google_cloud_run_v2_service" "stt" {
   provider     = google-beta
   launch_stage = "BETA"
 
-  scaling {
-    min_instance_count = 0
-    max_instance_count = var.stt_max_concurrency
-  }
-
   template {
+    # Scale-to-zero (min=0); max bounded to the confirmed L4 quota (H8).
+    # max_instance_count lives in template.scaling (the top-level scaling block
+    # does not accept it under the pinned provider).
+    scaling {
+      min_instance_count = 0
+      max_instance_count = var.stt_max_concurrency
+    }
+
     service_account               = google_service_account.earnings_stt.email
     gpu_zonal_redundancy_disabled = true
 

From 2568ff90896bcf1a5b1dc360155d7832775f0419 Mon Sep 17 00:00:00 2001
From: helloiamvu <vuhcze@gmail.com>
Date: Fri, 3 Jul 2026 22:12:55 +0200
Subject: [PATCH 2/3] fix(28): keep earnings-serving min at service-level
 scaling for scheduler cool-down

Codex P2: moving min_instance_count into template.scaling decoupled it from
scheduler.tf, which PATCHes the service-level scaling.minInstanceCount to flip
the live-window warm/cool state 0<->1. The scheduler cannot change a
revision-level min, so the SSE instance would stay pinned warm 24/7. Keep min
at the service level; only max moves to template.scaling (max is invalid on
the service-level block).
---
 infra/cloud_run.tf | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/infra/cloud_run.tf b/infra/cloud_run.tf
index c912ea0..53deb7f 100644
--- a/infra/cloud_run.tf
+++ b/infra/cloud_run.tf
@@ -59,12 +59,22 @@ resource "google_cloud_run_v2_service" "earnings_serving" {
   name     = "earnings-serving"
   location = var.serving_region
 
+  # Service-level min floor. scheduler.tf PATCHes THIS field
+  # (updateMask=scaling.minInstanceCount) to flip the live-window warm/cool
+  # 0<->1, so min MUST stay at the service level: a revision-level
+  # template.scaling.min_instance_count is invisible to the scheduler and would
+  # pin the SSE instance warm 24/7 (the cool-down never scales it to zero).
+  # max_instance_count is NOT a valid field on the service-level scaling block;
+  # it lives in template.scaling below.
+  scaling {
+    min_instance_count = 1
+  }
+
   template {
-    # Pin to exactly one instance for the single-instance SSE fan-out (H2). The
-    # scheduler.tf live-window job flips min 0<->1; max is always 1. max_instance_count
-    # lives in template.scaling (the top-level scaling block does not accept it).
+    # Pin max to exactly one instance for the single-instance SSE fan-out (H2):
+    # two instances would split-brain the in-process fan-out. max_instance_count
+    # is a revision-level field (the service-level scaling block does not accept it).
     scaling {
-      min_instance_count = 1
       max_instance_count = 1
     }
 

From 45a04ff6ac4af77a69ccb5b3fd8de347409a4e11 Mon Sep 17 00:00:00 2001
From: helloiamvu <vuhcze@gmail.com>
Date: Fri, 3 Jul 2026 22:15:57 +0200
Subject: [PATCH 3/3] fix(28): scheduler patches only service-level
 minInstanceCount (drop invalid max)

Codex P1: the warm/cool PATCH bodies sent
{"scaling":{"minInstanceCount":N,"maxInstanceCount":1}} to the service-level
scaling field. maxInstanceCount is not a valid field of the service-level
ServiceScaling message (it is revision-level: template.scaling in
cloud_run.tf), and it is also outside updateMask=scaling.minInstanceCount.
Sending it risks a 400 response, which would stall the SSE warm/cool
automation. Send only minInstanceCount.
---
 infra/scheduler.tf | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/infra/scheduler.tf b/infra/scheduler.tf
index 9d81121..5758c11 100644
--- a/infra/scheduler.tf
+++ b/infra/scheduler.tf
@@ -77,7 +77,10 @@ resource "google_cloud_run_v2_service_iam_member" "sched_sse_developer" {
 
 locals {
   # Cloud Run Admin API endpoint to PATCH the serving service's scaling. The
-  # scheduler bodies set min-instances via the annotation; max stays 1.
+  # updateMask restricts the patch to the service-level scaling.minInstanceCount
+  # ONLY; max is pinned at the revision level in cloud_run.tf (template.scaling)
+  # and is NOT a valid service-level field, so the bodies below send just
+  # minInstanceCount.
   serving_admin_url = "https://run.googleapis.com/v2/projects/${google_project.serving.project_id}/locations/${var.serving_region}/services/${google_cloud_run_v2_service.earnings_serving.name}?updateMask=scaling.minInstanceCount"
 }
 
@@ -92,7 +95,7 @@ resource "google_cloud_scheduler_job" "sse_warm" {
   http_target {
     http_method = "PATCH"
     uri         = local.serving_admin_url
-    body        = base64encode("{\"scaling\":{\"minInstanceCount\":1,\"maxInstanceCount\":1}}")
+    body        = base64encode("{\"scaling\":{\"minInstanceCount\":1}}")
 
     headers = {
       "Content-Type" = "application/json"
@@ -115,7 +118,7 @@ resource "google_cloud_scheduler_job" "sse_cool" {
   http_target {
     http_method = "PATCH"
     uri         = local.serving_admin_url
-    body        = base64encode("{\"scaling\":{\"minInstanceCount\":0,\"maxInstanceCount\":1}}")
+    body        = base64encode("{\"scaling\":{\"minInstanceCount\":0}}")
 
     headers = {
       "Content-Type" = "application/json"