From 08aafadb655eab2247dc79cfa9a73f181a2614cb Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 21:48:36 +0200 Subject: [PATCH 1/3] fix(28): move Cloud Run max_instance_count into template.scaling for provider 6.50.0 The pinned hashicorp/google 6.50.0 schema does not accept max_instance_count in the TOP-LEVEL `scaling` block of google_cloud_run_v2_service (that block only supports min_instance_count / manual_instance_count / scaling_mode); autoscaling max belongs in `template { scaling { ... } }`. `tofu validate` failed with "An argument named max_instance_count is not expected here" at all three services, blocking Phase 28 `tofu apply`. Move min+max_instance_count into each service's template.scaling block, keeping the same values (earnings_serving min=1/max=1, weather_serving min=0/max=var, stt min=0/max=var). earnings_serving had the same bug, not just weather_serving and stt. Placement-only change; no value changes. `tofu -chdir=infra validate` now reports 0 cloud_run.tf errors. Pre-existing on main (PR #92), independent of the deploy-runtime-layer branch. Co-Authored-By: Claude Opus 4.8 --- infra/cloud_run.tf | 50 ++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/infra/cloud_run.tf b/infra/cloud_run.tf index b6220f0..c912ea0 100644 --- a/infra/cloud_run.tf +++ b/infra/cloud_run.tf @@ -29,11 +29,11 @@ locals { ar_image_base = var.artifact_registry image = { - capture = "${local.ar_image_base}/${var.image_earnings_capture}:${var.image_tag}" - stt = "${local.ar_image_base}/${var.image_earnings_stt}:${var.image_tag}" - rolefact = "${local.ar_image_base}/${var.image_earnings_rolefact}:${var.image_tag}" - serving = "${local.ar_image_base}/${var.image_earnings_serving}:${var.image_tag}" - wx_serving = "${local.ar_image_base}/${var.image_weather_serving}:${var.image_tag}" + capture = "${local.ar_image_base}/${var.image_earnings_capture}:${var.image_tag}" + stt = "${local.ar_image_base}/${var.image_earnings_stt}:${var.image_tag}" + rolefact = "${local.ar_image_base}/${var.image_earnings_rolefact}:${var.image_tag}" + serving = "${local.ar_image_base}/${var.image_earnings_serving}:${var.image_tag}" + wx_serving = "${local.ar_image_base}/${var.image_weather_serving}:${var.image_tag}" } # R2 endpoint host: https://.r2.cloudflarestorage.com. The account @@ -59,14 +59,15 @@ resource "google_cloud_run_v2_service" "earnings_serving" { name = "earnings-serving" location = var.serving_region - # Pin to exactly one instance for the single-instance SSE fan-out (H2). The - # scheduler.tf live-window job flips min 0<->1; max is always 1. - scaling { - min_instance_count = 1 - max_instance_count = 1 - } - template { + # Pin to exactly one instance for the single-instance SSE fan-out (H2). The + # scheduler.tf live-window job flips min 0<->1; max is always 1. max_instance_count + # lives in template.scaling (the top-level scaling block does not accept it). + scaling { + min_instance_count = 1 + max_instance_count = 1 + } + # Session affinity so a reconnecting EventSource sticks to the one instance # holding the ring buffer (H2/H3 Last-Event-ID replay). session_affinity = true @@ -171,12 +172,14 @@ resource "google_cloud_run_v2_service" "weather_serving" { name = "weather-serving" location = var.serving_region - scaling { - min_instance_count = 0 - max_instance_count = var.serving_rest_max_instances - } - template { + # max_instance_count lives in template.scaling (the top-level scaling block + # does not accept it under the pinned provider); min stays 0 for idle-cheap. + scaling { + min_instance_count = 0 + max_instance_count = var.serving_rest_max_instances + } + service_account = google_service_account.serving.email containers { @@ -280,12 +283,15 @@ resource "google_cloud_run_v2_service" "stt" { provider = google-beta launch_stage = "BETA" - scaling { - min_instance_count = 0 - max_instance_count = var.stt_max_concurrency - } - template { + # Scale-to-zero (min=0); max bounded to the confirmed L4 quota (H8). + # max_instance_count lives in template.scaling (the top-level scaling block + # does not accept it under the pinned provider). + scaling { + min_instance_count = 0 + max_instance_count = var.stt_max_concurrency + } + service_account = google_service_account.earnings_stt.email gpu_zonal_redundancy_disabled = true From 2568ff90896bcf1a5b1dc360155d7832775f0419 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 22:12:55 +0200 Subject: [PATCH 2/3] fix(28): keep earnings-serving min at service-level scaling for scheduler cool-down Codex P2: moving min_instance_count into template.scaling decoupled it from scheduler.tf, which PATCHes the service-level scaling.minInstanceCount to flip the live-window warm/cool 0<->1. A revision-level min would pin the SSE instance warm 24/7. Keep min at service level; only max moves to template.scaling (max is invalid on the service-level block). --- infra/cloud_run.tf | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/infra/cloud_run.tf b/infra/cloud_run.tf index c912ea0..53deb7f 100644 --- a/infra/cloud_run.tf +++ b/infra/cloud_run.tf @@ -59,12 +59,22 @@ resource "google_cloud_run_v2_service" "earnings_serving" { name = "earnings-serving" location = var.serving_region + # Service-level min floor. scheduler.tf PATCHes THIS field + # (updateMask=scaling.minInstanceCount) to flip the live-window warm/cool + # 0<->1, so min MUST stay at the service level: a revision-level + # template.scaling.min_instance_count is invisible to the scheduler and would + # pin the SSE instance warm 24/7 (the cool-down never scales it to zero). + # max_instance_count is NOT a valid field on the service-level scaling block; + # it lives in template.scaling below. + scaling { + min_instance_count = 1 + } + template { - # Pin to exactly one instance for the single-instance SSE fan-out (H2). The - # scheduler.tf live-window job flips min 0<->1; max is always 1. max_instance_count - # lives in template.scaling (the top-level scaling block does not accept it). + # Pin max to exactly one instance for the single-instance SSE fan-out (H2): + # two instances would split-brain the in-process fan-out. max_instance_count + # is a revision-level field (the service-level scaling block does not accept it). scaling { - min_instance_count = 1 max_instance_count = 1 } From 45a04ff6ac4af77a69ccb5b3fd8de347409a4e11 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 22:15:57 +0200 Subject: [PATCH 3/3] fix(28): scheduler patches only service-level minInstanceCount (drop invalid max) Codex P1: the warm/cool PATCH bodies sent {"scaling":{"minInstanceCount":N,"maxInstanceCount":1}} to the service-level scaling field. maxInstanceCount is not a valid service-level ServiceScaling field (it is revision-level, template.scaling in cloud_run.tf) and is outside the updateMask=scaling.minInstanceCount, so it risks a 400 that stalls the SSE warm/cool automation. Send only minInstanceCount. --- infra/scheduler.tf | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/infra/scheduler.tf b/infra/scheduler.tf index 9d81121..5758c11 100644 --- a/infra/scheduler.tf +++ b/infra/scheduler.tf @@ -77,7 +77,10 @@ resource "google_cloud_run_v2_service_iam_member" "sched_sse_developer" { locals { # Cloud Run Admin API endpoint to PATCH the serving service's scaling. The - # scheduler bodies set min-instances via the annotation; max stays 1. + # updateMask restricts the patch to the service-level scaling.minInstanceCount + # ONLY; max is pinned at the revision level in cloud_run.tf (template.scaling) + # and is NOT a valid service-level field, so the bodies below send just + # minInstanceCount. serving_admin_url = "https://run.googleapis.com/v2/projects/${google_project.serving.project_id}/locations/${var.serving_region}/services/${google_cloud_run_v2_service.earnings_serving.name}?updateMask=scaling.minInstanceCount" } @@ -92,7 +95,7 @@ resource "google_cloud_scheduler_job" "sse_warm" { http_target { http_method = "PATCH" uri = local.serving_admin_url - body = base64encode("{\"scaling\":{\"minInstanceCount\":1,\"maxInstanceCount\":1}}") + body = base64encode("{\"scaling\":{\"minInstanceCount\":1}}") headers = { "Content-Type" = "application/json" @@ -115,7 +118,7 @@ resource "google_cloud_scheduler_job" "sse_cool" { http_target { http_method = "PATCH" uri = local.serving_admin_url - body = base64encode("{\"scaling\":{\"minInstanceCount\":0,\"maxInstanceCount\":1}}") + body = base64encode("{\"scaling\":{\"minInstanceCount\":0}}") headers = { "Content-Type" = "application/json"