From a89ad0d7798399083e05683401962b9a03a9c4b0 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 21:48:21 +0200 Subject: [PATCH] fix(28-21): drop non-existent google_batch_job resource; keep fleet spec as docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit google_batch_job is not a real resource type in the hashicorp/google or google-beta provider (verified vs pinned 6.50.0) — Cloud Batch is submit-only, so `tofu -chdir=infra validate` failed with "provider hashicorp/google-beta does not support resource type google_batch_job", blocking any Phase 28 apply. Pre-existing (merged in #92), independent of the phase28/deploy-runtime-layer branch. Remove the phantom resource and preserve the shard/Spot/duration/secret/ marker-bucket spec as a reviewable comment. run-weather-backfill.yml is the real submit path (`gcloud batch jobs submit --config`). Document task_count=65 (Kalshi∪Polymarket roster minus non-satellite HKO) to match the workflow's TASK_COUNT=65 — the committed file had drifted to 66. The durable progress bucket + its IAM binding stay as standing resources. Eliminates the batch.tf validate error; the remaining cloud_run.tf max_instance_count errors are a separate task. Co-Authored-By: Claude Opus 4.8 --- infra/batch.tf | 172 ++++++++++++++++++------------------------------- 1 file changed, 61 insertions(+), 111 deletions(-) diff --git a/infra/batch.tf b/infra/batch.tf index 75d0118..e2681b4 100644 --- a/infra/batch.tf +++ b/infra/batch.tf @@ -15,10 +15,14 @@ # # Cloud Batch was chosen over a hand-rolled GCE MIG (28-21 Task 1): array tasks # map directly to C4 shards, Spot is native, and provision→run→teardown is -# managed. The google_batch_job below is a TEMPLATE the run workflow submits -# (a Batch job is a one-shot submission, not a standing resource) — kept in -# Terraform so the shard/Spot/duration/marker-bucket wiring is reviewable and -# the run workflow references a single source of truth. 
+# managed. Cloud Batch is SUBMIT-ONLY — a job is a one-shot submission, NOT a +# standing Terraform resource (there is no google_batch_job type in the google / +# google-beta provider; Cloud Batch has no first-class TF resource). The submit +# path is run-weather-backfill.yml, which builds the job config and runs +# `gcloud batch jobs submit --config`. The fleet SPEC is documented below as a +# comment (not a resource) so the shard/Spot/duration/marker-bucket wiring stays +# reviewable and the workflow has a single reference. The durable progress +# bucket + its IAM binding ARE standing resources and remain declared here. locals { weather_image = { @@ -66,113 +70,59 @@ resource "google_storage_bucket_iam_member" "backfill_progress_rw" { # ===================================================================== # Backfill fleet — Cloud Batch array tasks, Spot, bounded (28-21, C4, H1) # ===================================================================== -# Submitted by run-weather-backfill.yml AFTER the H5 pilot cost sign-off. Shards -# the Kalshi∪Polymarket roster across array tasks (task_count); each task owns a -# disjoint out dir and rehydrates its markers from the progress bucket. -resource "google_batch_job" "weather_backfill" { - provider = google-beta - - project = var.satellite_project_id - location = var.weather_region - name = "weather-backfill" - - # Prevent an apply from re-submitting a finished run; the run workflow submits - # a fresh job (with a run-scoped name) at execution time. This resource is the - # canonical SPEC. Task count is the shard count (roster-driven). - task_groups { - task_count = 66 # ~Kalshi∪Polymarket roster (D-28.8); one shard per station - parallelism = 16 # bounded concurrent Spot slices - - task_spec { - # Bounded maxRunDuration caps a runaway slice (T-28.21-02). 
- max_run_duration = "21600s" # 6h per shard ceiling - - max_retry_count = 3 # a preempted Spot task is retried; markers make it idempotent - - compute_resource { - cpu_milli = 4000 - memory_mib = 16384 - } - - runnables { - container { - image_uri = local.weather_image.backfill - - # --mirror gcp keeps reads in-cloud/in-region near the NODD mirror - # (big-bytes firewall). The shard index + progress bucket drive the - # disjoint out dir + durable markers (C4). - commands = [ - "--mirror", "gcp", - "--roster", "kalshi,polymarket", - "--progress-bucket", google_storage_bucket.backfill_progress.name, - "--r2-bucket", var.r2_bucket, - ] - } - } - - environment { - variables = { - R2_BUCKET = var.r2_bucket - R2_REGION = local.r2_region - PROGRESS_BUCKET = google_storage_bucket.backfill_progress.name - } - # R2 WRITE token + EUMETSAT creds (Cloud Batch injects secrets via - # secret_variables, not value_source). Without these the write sink's - # _require_env(R2_WRITE_ACCESS_KEY_ID / R2_WRITE_SECRET_ACCESS_KEY / - # R2_ACCOUNT_ID) raises ValueError and the fleet uploads zero derived - # parquet — the serving read path would then have nothing to serve. - # EUMETSAT creds are needed for the keyed Meteosat family. - secret_variables = { - R2_ACCOUNT_ID = "${data.google_secret_manager_secret.r2_account_id.id}/versions/latest" - R2_WRITE_ACCESS_KEY_ID = "${data.google_secret_manager_secret.r2_write_access_key_id.id}/versions/latest" - R2_WRITE_SECRET_ACCESS_KEY = "${data.google_secret_manager_secret.r2_write_secret_access_key.id}/versions/latest" - EUMETSAT_CONSUMER_KEY = "${data.google_secret_manager_secret.eumetsat_consumer_key.id}/versions/latest" - EUMETSAT_CONSUMER_SECRET = "${data.google_secret_manager_secret.eumetsat_consumer_secret.id}/versions/latest" - } - } - } - } - - # Spot provisioning (native Batch); no external IP; tears down on completion. - allocation_policy { - # Run as the DEDICATED backfill SA (firewall D / least-privilege). 
Without - # this the job runs as the project default compute SA — which cannot read the - # R2-write/EUMETSAT secrets granted only to this SA, and is over-broad. The - # SA is bound to r2-write + eumetsat + the progress bucket in secrets.tf. - service_account { - email = google_service_account.weather_backfill.email - } - - instances { - policy { - machine_type = "n2-standard-4" - provisioning_model = "SPOT" - } - } - } - - # Batch logs to Cloud Logging (freshness/failed-execution monitoring reads it). - logs_policy { - destination = "CLOUD_LOGGING" - } - - labels = { - phase = "28" - role = "weather-backfill" - } - - # The run SA needs its R2-write + EUMETSAT secret bindings (secrets.tf) + the - # progress-bucket grant before the fleet runs. - depends_on = [ - google_secret_manager_secret_iam_member.access, - google_storage_bucket_iam_member.backfill_progress_rw, - ] - - # A submitted Batch job is immutable; ignore server-side status churn. - lifecycle { - ignore_changes = [task_groups] - } -} +# NOT a Terraform resource: Cloud Batch is submit-only, so the fleet is launched +# by run-weather-backfill.yml (`gcloud batch jobs submit --config`) AFTER the H5 +# pilot cost sign-off. The job config the workflow builds is spec'd here as the +# single reviewable reference. Any change to the shape below MUST be mirrored in +# run-weather-backfill.yml (and vice versa) — they have no compile-time link. +# +# Fleet SPEC (mirrors the workflow-built config): +# project = var.satellite_project_id (mostlyright-satellite, H1) +# location = var.weather_region (us-central1, big-bytes firewall §4b) +# name = run-scoped at submit time (avoids re-submitting a finished run) +# +# task_group: +# task_count = 65 # Kalshi∪Polymarket roster (66) MINUS the one non- +# # satellite station HKO (_roster._NON_SATELLITE_STATIONS; +# # HKO has no satellite StationInfo → a shard for it would +# # resolve to zero partitions). One shard per satellite- +# # resolvable station (D-28.8). 
run-weather-backfill.yml +# # sets the SAME TASK_COUNT=65 — keep them in LOCKSTEP +# # with the roster. +# parallelism = 16 # bounded concurrent Spot slices +# task_spec: +# max_run_duration = 21600s # 6h per-shard ceiling, caps a runaway (T-28.21-02) +# max_retry_count = 3 # preempted Spot task retried; markers idempotent (C4) +# compute_resource = { cpu_milli = 4000, memory_mib = 16384 } +# container: +# image_uri = local.weather_image.backfill +# commands = ["--mirror","gcp","--roster","kalshi,polymarket", +# "--progress-bucket", google_storage_bucket.backfill_progress.name, +# "--r2-bucket", var.r2_bucket] +# # --mirror gcp keeps reads in-cloud/in-region near the NODD mirror. +# environment.variables = { R2_BUCKET, R2_REGION, PROGRESS_BUCKET } +# environment.secret_variables = { # Batch injects via secret_variables +# R2_ACCOUNT_ID, R2_WRITE_ACCESS_KEY_ID, R2_WRITE_SECRET_ACCESS_KEY, +# EUMETSAT_CONSUMER_KEY, EUMETSAT_CONSUMER_SECRET # all .../versions/latest +# } +# # Without the R2-write creds the write sink _require_env(...) raises and the +# # fleet uploads zero derived parquet; EUMETSAT creds serve the keyed Meteosat family. +# +# allocation_policy: +# service_account = google_service_account.weather_backfill.email +# # DEDICATED backfill SA (firewall D / least-privilege). The default compute +# # SA cannot read the R2-write/EUMETSAT secrets granted only to this SA and +# # is over-broad. Bound to r2-write + eumetsat + the progress bucket (secrets.tf). +# instances.policy = { machine_type = "n2-standard-4", provisioning_model = "SPOT" } +# # Spot; no external IP; tears down on completion. +# +# logs_policy.destination = "CLOUD_LOGGING" # freshness/failed-execution monitoring reads it +# labels = { phase = "28", role = "weather-backfill" } +# +# Preconditions (satisfied by standing resources in this root): the SA's +# R2-write + EUMETSAT secret bindings (secrets.tf) and the progress-bucket +# grant (google_storage_bucket_iam_member.backfill_progress_rw, above) must +# exist before the fleet runs. 
# ===================================================================== # Incremental daily ingest — Cloud Run Job, us-central1 (28-22, H1)