Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 34 additions & 18 deletions infra/cloud_run.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ locals {
ar_image_base = var.artifact_registry

image = {
capture = "${local.ar_image_base}/${var.image_earnings_capture}:${var.image_tag}"
stt = "${local.ar_image_base}/${var.image_earnings_stt}:${var.image_tag}"
rolefact = "${local.ar_image_base}/${var.image_earnings_rolefact}:${var.image_tag}"
serving = "${local.ar_image_base}/${var.image_earnings_serving}:${var.image_tag}"
wx_serving = "${local.ar_image_base}/${var.image_weather_serving}:${var.image_tag}"
capture = "${local.ar_image_base}/${var.image_earnings_capture}:${var.image_tag}"
stt = "${local.ar_image_base}/${var.image_earnings_stt}:${var.image_tag}"
rolefact = "${local.ar_image_base}/${var.image_earnings_rolefact}:${var.image_tag}"
serving = "${local.ar_image_base}/${var.image_earnings_serving}:${var.image_tag}"
wx_serving = "${local.ar_image_base}/${var.image_weather_serving}:${var.image_tag}"
}

# R2 endpoint host: https://<account-id>.r2.cloudflarestorage.com. The account
Expand All @@ -59,14 +59,25 @@ resource "google_cloud_run_v2_service" "earnings_serving" {
name = "earnings-serving"
location = var.serving_region

# Pin to exactly one instance for the single-instance SSE fan-out (H2). The
# scheduler.tf live-window job flips min 0<->1; max is always 1.
# Service-level min floor. scheduler.tf PATCHes THIS field
# (updateMask=scaling.minInstanceCount) to flip the live-window warm/cool
# 0<->1, so min MUST stay at the service level: a revision-level
# template.scaling.min_instance_count is invisible to the scheduler and would
# pin the SSE instance warm 24/7 (the cool-down never scales it to zero).
# max_instance_count is not accepted on the service-level scaling block under
# the pinned provider; it is set in template.scaling below.
scaling {
min_instance_count = 1
max_instance_count = 1
}

template {
# Pin max to exactly one instance for the single-instance SSE fan-out (H2):
# two instances would split-brain the in-process fan-out. max_instance_count
# is set at the revision level here because the service-level scaling block
# does not accept it under the pinned provider.
scaling {
max_instance_count = 1
}

# Session affinity so a reconnecting EventSource sticks to the one instance
# holding the ring buffer (H2/H3 Last-Event-ID replay).
session_affinity = true
Expand Down Expand Up @@ -171,12 +182,14 @@ resource "google_cloud_run_v2_service" "weather_serving" {
name = "weather-serving"
location = var.serving_region

scaling {
min_instance_count = 0
max_instance_count = var.serving_rest_max_instances
}

template {
# max_instance_count lives in template.scaling (the top-level scaling block
# does not accept it under the pinned provider); min stays 0 for idle-cheap.
scaling {
min_instance_count = 0
max_instance_count = var.serving_rest_max_instances
}

service_account = google_service_account.serving.email

containers {
Expand Down Expand Up @@ -280,12 +293,15 @@ resource "google_cloud_run_v2_service" "stt" {
provider = google-beta
launch_stage = "BETA"

scaling {
min_instance_count = 0
max_instance_count = var.stt_max_concurrency
}

template {
# Scale-to-zero (min=0); max bounded to the confirmed L4 quota (H8).
# max_instance_count lives in template.scaling (the top-level scaling block
# does not accept it under the pinned provider).
scaling {
min_instance_count = 0
max_instance_count = var.stt_max_concurrency
}

service_account = google_service_account.earnings_stt.email
gpu_zonal_redundancy_disabled = true

Expand Down
9 changes: 6 additions & 3 deletions infra/scheduler.tf
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,10 @@ resource "google_cloud_run_v2_service_iam_member" "sched_sse_developer" {

locals {
# Cloud Run Admin API endpoint to PATCH the serving service's scaling. The
# scheduler bodies set min-instances via the annotation; max stays 1.
# updateMask restricts the patch to the service-level scaling.minInstanceCount
# ONLY; max is pinned at the revision level in cloud_run.tf (template.scaling)
# and is outside the updateMask, so the bodies below send just
# minInstanceCount.
serving_admin_url = "https://run.googleapis.com/v2/projects/${google_project.serving.project_id}/locations/${var.serving_region}/services/${google_cloud_run_v2_service.earnings_serving.name}?updateMask=scaling.minInstanceCount"
}

Expand All @@ -92,7 +95,7 @@ resource "google_cloud_scheduler_job" "sse_warm" {
http_target {
http_method = "PATCH"
uri = local.serving_admin_url
body = base64encode("{\"scaling\":{\"minInstanceCount\":1,\"maxInstanceCount\":1}}")
body = base64encode("{\"scaling\":{\"minInstanceCount\":1}}")

headers = {
"Content-Type" = "application/json"
Expand All @@ -115,7 +118,7 @@ resource "google_cloud_scheduler_job" "sse_cool" {
http_target {
http_method = "PATCH"
uri = local.serving_admin_url
body = base64encode("{\"scaling\":{\"minInstanceCount\":0,\"maxInstanceCount\":1}}")
body = base64encode("{\"scaling\":{\"minInstanceCount\":0}}")

headers = {
"Content-Type" = "application/json"
Expand Down
Loading