Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 61 additions & 111 deletions infra/batch.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,14 @@
#
# Cloud Batch was chosen over a hand-rolled GCE MIG (28-21 Task 1): array tasks
# map directly to C4 shards, Spot is native, and provision→run→teardown is
# managed. The google_batch_job below is a TEMPLATE the run workflow submits
# (a Batch job is a one-shot submission, not a standing resource) — kept in
# Terraform so the shard/Spot/duration/marker-bucket wiring is reviewable and
# the run workflow references a single source of truth.
# managed. Cloud Batch is SUBMIT-ONLY — a job is a one-shot submission, NOT a
# standing Terraform resource (there is no google_batch_job type in the google /
# google-beta provider; Cloud Batch has no first-class TF resource). The submit
# path is run-weather-backfill.yml, which builds the job config and runs
# `gcloud batch jobs submit --config`. The fleet SPEC is documented below as a
# comment (not a resource) so the shard/Spot/duration/marker-bucket wiring stays
# reviewable and the workflow has a single reference. The durable progress
# bucket + its IAM binding ARE standing resources and remain declared here.

locals {
weather_image = {
Expand Down Expand Up @@ -66,113 +70,59 @@ resource "google_storage_bucket_iam_member" "backfill_progress_rw" {
# =====================================================================
# Backfill fleet — Cloud Batch array tasks, Spot, bounded (28-21, C4, H1)
# =====================================================================
# Submitted by run-weather-backfill.yml AFTER the H5 pilot cost sign-off. Shards
# the Kalshi∪Polymarket roster across array tasks (task_count); each task owns a
# disjoint out dir and rehydrates its markers from the progress bucket.
resource "google_batch_job" "weather_backfill" {
  provider = google-beta

  # This resource is the canonical SPEC for the fleet. The run workflow submits
  # a fresh job under a run-scoped name at execution time, so an apply is not
  # meant to re-submit a run that has already finished.
  name     = "weather-backfill"
  project  = var.satellite_project_id
  location = var.weather_region

  labels = {
    phase = "28"
    role  = "weather-backfill"
  }

  # Send Batch logs to Cloud Logging; freshness / failed-execution monitoring
  # reads them from there.
  logs_policy {
    destination = "CLOUD_LOGGING"
  }

  # Native Batch Spot provisioning; instances are torn down on completion.
  # NOTE(review): the original comment also promised "no external IP", but no
  # network policy is declared in this block — confirm where that is enforced.
  allocation_policy {
    instances {
      policy {
        machine_type       = "n2-standard-4"
        provisioning_model = "SPOT"
      }
    }

    # Run as the DEDICATED backfill service account (firewall D /
    # least-privilege). If omitted, the job would run as the project default
    # compute SA, which is over-broad and cannot read the R2-write / EUMETSAT
    # secrets that are granted only to this SA. Its bindings (r2-write,
    # eumetsat, progress bucket) live in secrets.tf.
    service_account {
      email = google_service_account.weather_backfill.email
    }
  }

  # The shard count is the task count (roster-driven).
  task_groups {
    task_count  = 66 # ~Kalshi∪Polymarket roster (D-28.8); one shard per station
    parallelism = 16 # upper bound on concurrently running Spot slices

    task_spec {
      compute_resource {
        cpu_milli  = 4000
        memory_mib = 16384
      }

      # A bounded run duration caps a runaway slice (T-28.21-02): 6h per shard.
      max_run_duration = "21600s"

      # Preempted Spot tasks are retried; the markers make a retry idempotent.
      max_retry_count = 3

      runnables {
        container {
          image_uri = local.weather_image.backfill

          # `--mirror gcp` keeps reads in-cloud / in-region, close to the NODD
          # mirror (big-bytes firewall). The shard index together with the
          # progress bucket drives the disjoint out dir and the durable
          # markers (C4).
          commands = [
            "--mirror", "gcp",
            "--roster", "kalshi,polymarket",
            "--progress-bucket", google_storage_bucket.backfill_progress.name,
            "--r2-bucket", var.r2_bucket,
          ]
        }
      }

      environment {
        variables = {
          R2_BUCKET       = var.r2_bucket
          R2_REGION       = local.r2_region
          PROGRESS_BUCKET = google_storage_bucket.backfill_progress.name
        }

        # R2 WRITE token and EUMETSAT credentials. Cloud Batch injects secrets
        # through secret_variables (not value_source). If the R2 values are
        # missing, the write sink's _require_env(R2_WRITE_ACCESS_KEY_ID /
        # R2_WRITE_SECRET_ACCESS_KEY / R2_ACCOUNT_ID) raises ValueError, the
        # fleet uploads zero derived parquet, and the serving read path has
        # nothing to serve. The EUMETSAT pair is required for the keyed
        # Meteosat family.
        secret_variables = {
          R2_ACCOUNT_ID              = "${data.google_secret_manager_secret.r2_account_id.id}/versions/latest"
          R2_WRITE_ACCESS_KEY_ID     = "${data.google_secret_manager_secret.r2_write_access_key_id.id}/versions/latest"
          R2_WRITE_SECRET_ACCESS_KEY = "${data.google_secret_manager_secret.r2_write_secret_access_key.id}/versions/latest"
          EUMETSAT_CONSUMER_KEY      = "${data.google_secret_manager_secret.eumetsat_consumer_key.id}/versions/latest"
          EUMETSAT_CONSUMER_SECRET   = "${data.google_secret_manager_secret.eumetsat_consumer_secret.id}/versions/latest"
        }
      }
    }
  }

  # Before the fleet runs, the run SA must already hold its R2-write and
  # EUMETSAT secret bindings (secrets.tf) plus the progress-bucket grant.
  depends_on = [
    google_secret_manager_secret_iam_member.access,
    google_storage_bucket_iam_member.backfill_progress_rw,
  ]

  # Once submitted a Batch job is immutable, so server-side status churn on
  # task_groups is ignored rather than planned as a diff.
  lifecycle {
    ignore_changes = [task_groups]
  }
}
# NOT a Terraform resource: Cloud Batch is submit-only, so the fleet is launched
# by run-weather-backfill.yml (`gcloud batch jobs submit --config`) AFTER the H5
# pilot cost sign-off. The job config that the workflow builds is specified here
# as the single reviewable reference. Nothing enforces agreement between the two,
# so any change to the spec below MUST be mirrored in run-weather-backfill.yml
# (and vice versa).
#
# Fleet SPEC (mirrors the workflow-built config):
# project = var.satellite_project_id (mostlyright-satellite, H1)
# location = var.weather_region (us-central1, big-bytes firewall §4b)
# name = run-scoped at submit time (avoids re-submitting a finished run)
#
# task_group:
# task_count = 65 # Kalshi∪Polymarket roster (66) MINUS HKO, the one
# # non-satellite station (_roster._NON_SATELLITE_STATIONS).
# # HKO has no satellite StationInfo, so a shard for it would
# # resolve to zero partitions. One shard per
# # satellite-resolvable station (D-28.8).
# # run-weather-backfill.yml sets the SAME TASK_COUNT=65;
# # keep both in LOCKSTEP with the roster.
# parallelism = 16 # bounded concurrent Spot slices
# task_spec:
# max_run_duration = 21600s # 6h per-shard ceiling, caps a runaway (T-28.21-02)
# max_retry_count = 3 # preempted Spot task retried; markers idempotent (C4)
# compute_resource = { cpu_milli = 4000, memory_mib = 16384 }
# container:
# image_uri = local.weather_image.backfill
# commands = ["--mirror","gcp","--roster","kalshi,polymarket",
# "--progress-bucket", <backfill_progress bucket name>,
# "--r2-bucket", var.r2_bucket]
# # --mirror gcp keeps reads in-cloud/in-region near the NODD mirror.
# environment.variables = { R2_BUCKET, R2_REGION, PROGRESS_BUCKET }
# environment.secret_variables = { # Batch injects via secret_variables
# R2_ACCOUNT_ID, R2_WRITE_ACCESS_KEY_ID, R2_WRITE_SECRET_ACCESS_KEY,
# EUMETSAT_CONSUMER_KEY, EUMETSAT_CONSUMER_SECRET # all .../versions/latest
# }
# # Without the R2-write creds the write sink _require_env(...) raises and the
# # fleet uploads zero derived parquet; EUMETSAT creds serve the keyed Meteosat family.
#
# allocation_policy:
# service_account = google_service_account.weather_backfill.email
# # DEDICATED backfill SA (firewall D / least-privilege). The default compute
# # SA cannot read the R2-write/EUMETSAT secrets granted only to this SA and
# # is over-broad. Bound to r2-write + eumetsat + the progress bucket (secrets.tf).
# instances.policy = { machine_type = "n2-standard-4", provisioning_model = "SPOT" }
# # Spot; no external IP; tears down on completion.
#
# logs_policy.destination = "CLOUD_LOGGING" # freshness/failed-execution monitoring reads it
# labels = { phase = "28", role = "weather-backfill" }
#
# Preconditions (satisfied by standing resources in this root): the SA's
# R2-write + EUMETSAT secret bindings (secrets.tf) and the progress-bucket
# grant (google_storage_bucket_iam_member.backfill_progress_rw, above) must
# exist before the fleet runs.

# =====================================================================
# Incremental daily ingest — Cloud Run Job, us-central1 (28-22, H1)
Expand Down
Loading