diff --git a/.github/workflows/deploy-earnings-capture.yml b/.github/workflows/deploy-earnings-capture.yml new file mode 100644 index 0000000..d41ef4f --- /dev/null +++ b/.github/workflows/deploy-earnings-capture.yml @@ -0,0 +1,73 @@ +name: Deploy earnings-capture (28-10) + +# Phase 28 (28-10) — WIF build+deploy for the earnings CAPTURE Cloud Run Job +# (PyAV/ffmpeg webcast capture; the image built here installs NO Chromium — see deploy/earnings/capture.Dockerfile) in mr-earnings-ingest / eu-west3. This is the +# AUDIO side of the firewall (D-27.9): audio is transient, leaves the Job only via the private GCS handoff bucket to STT, and +# NEVER gets an R2 key. Image build/push + Job image-swap only; the job's args, +# the static-egress VPC pin (IVS), secrets + SA are Terraform-owned (infra/). +# +# OPERATOR-GATED: the live capture pipeline (IVS edge, static egress IP) is +# validated by an operator with a scheduled live call (28-10 Task 3). This +# workflow ships the deploy path; it does not run a live capture. +# +# Setup Variables (from `tofu -chdir=infra output`): WIF_PROVIDER, DEPLOY_SA_INGEST, +# AR_HOST, INGEST_PROJECT_ID. + +on: + workflow_dispatch: + inputs: + image_tag: + description: "Image tag to build + deploy."
+ required: true + default: "latest" + type: string + +permissions: + id-token: write + contents: read + +env: + AR_HOST: ${{ vars.AR_HOST }} + AR_PROJECT: mostlyright-backend + AR_REPO: mostlyright + IMAGE_NAME: earnings-capture + JOB: earnings-capture + REGION: europe-west3 + +jobs: + deploy: + name: Build + push capture image, roll the capture Job + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Authenticate to GCP (WIF, keyless) + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ vars.WIF_PROVIDER }} + service_account: ${{ vars.DEPLOY_SA_INGEST }} + + - name: Set up gcloud + uses: google-github-actions/setup-gcloud@v2 + + - name: Configure Docker for Artifact Registry + run: gcloud auth configure-docker "${AR_HOST}" --quiet + + - name: Build capture image + env: + IMAGE_TAG: ${{ inputs.image_tag }} + run: | + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" + echo "IMAGE=${IMAGE}" >> "$GITHUB_ENV" + docker build -f deploy/earnings/capture.Dockerfile -t "${IMAGE}" . + + - name: Push image + run: docker push "${IMAGE}" + + - name: Deploy earnings-capture Job (image swap only; config is Terraform-owned) + run: | + gcloud run jobs deploy "${JOB}" \ + --project "${{ vars.INGEST_PROJECT_ID }}" \ + --region "${REGION}" \ + --image "${IMAGE}" \ + --quiet diff --git a/.github/workflows/deploy-earnings-rolefact.yml b/.github/workflows/deploy-earnings-rolefact.yml new file mode 100644 index 0000000..3bae666 --- /dev/null +++ b/.github/workflows/deploy-earnings-rolefact.yml @@ -0,0 +1,70 @@ +name: Deploy earnings-rolefact (28-13) + +# Phase 28 (28-13) — WIF build+deploy for the role/fact-builder Cloud Run Job +# (CPU) in mr-earnings-ingest / eu-west3. POST-audio side of the firewall: it +# reads transcript text, builds derived facts, and writes transcript + fact +# parquet to R2 with the WRITE token — it holds NO audio and needs no audio +# toolchain. 
Image build/push + Job image-swap only; args + R2-write secrets + SA +# are Terraform-owned (infra/). +# +# Setup Variables (from `tofu -chdir=infra output`): WIF_PROVIDER, DEPLOY_SA_INGEST, +# AR_HOST, INGEST_PROJECT_ID. + +on: + workflow_dispatch: + inputs: + image_tag: + description: "Image tag to build + deploy." + required: true + default: "latest" + type: string + +permissions: + id-token: write + contents: read + +env: + AR_HOST: ${{ vars.AR_HOST }} + AR_PROJECT: mostlyright-backend + AR_REPO: mostlyright + IMAGE_NAME: earnings-rolefact + JOB: earnings-rolefact + REGION: europe-west3 + +jobs: + deploy: + name: Build + push rolefact image, roll the rolefact Job + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Authenticate to GCP (WIF, keyless) + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ vars.WIF_PROVIDER }} + service_account: ${{ vars.DEPLOY_SA_INGEST }} + + - name: Set up gcloud + uses: google-github-actions/setup-gcloud@v2 + + - name: Configure Docker for Artifact Registry + run: gcloud auth configure-docker "${AR_HOST}" --quiet + + - name: Build rolefact image + env: + IMAGE_TAG: ${{ inputs.image_tag }} + run: | + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" + echo "IMAGE=${IMAGE}" >> "$GITHUB_ENV" + docker build -f deploy/earnings/rolefact.Dockerfile -t "${IMAGE}" . 
+ + - name: Push image + run: docker push "${IMAGE}" + + - name: Deploy earnings-rolefact Job (image swap only; config is Terraform-owned) + run: | + gcloud run jobs deploy "${JOB}" \ + --project "${{ vars.INGEST_PROJECT_ID }}" \ + --region "${REGION}" \ + --image "${IMAGE}" \ + --quiet diff --git a/.github/workflows/deploy-earnings-serving.yml b/.github/workflows/deploy-earnings-serving.yml new file mode 100644 index 0000000..4d7a92c --- /dev/null +++ b/.github/workflows/deploy-earnings-serving.yml @@ -0,0 +1,106 @@ +name: Deploy earnings-serving (28-12) + +# Phase 28 (28-12) — WIF-authenticated build+deploy for the earnings serving +# Cloud Run service (/transcripts /facts /capabilities /stream) in +# mr-serving/eu-west3. KEYLESS auth via Workload Identity Federation — no SA key +# files. The service resource + its R2-read-only secret wiring + EARNINGS_API_KEY +# + EARNINGS_STREAMING_SUBSCRIPTION + the H2 min=max=1 scaling all live in infra/ +# (cloud_run.tf earnings_serving); this workflow only builds+pushes the AUDIO-FREE +# image (deploy/earnings/serving.Dockerfile) and rolls a new revision. +# +# H2 (load-bearing): the SSE fan-out over ONE shared earnings-streaming +# subscription is correct ONLY at exactly one always-warm instance. The smoke +# step asserts min-instances=1 is preserved (a broken scaling config would +# silently split-brain the stream). Config is Terraform-owned; --image only swaps +# the container on the current config. +# +# Setup (repo/environment Variables, from `tofu -chdir=infra output`): +# WIF_PROVIDER = +# DEPLOY_SA_SERVING = deploy@mr-serving... +# AR_HOST = europe-west3-docker.pkg.dev +# SERVING_PROJECT_ID = mr-serving + +on: + workflow_dispatch: + inputs: + image_tag: + description: "Image tag to build + deploy (e.g. a git SHA or 'latest')." 
+ required: true + default: "latest" + type: string + +permissions: + id-token: write + contents: read + +env: + AR_HOST: ${{ vars.AR_HOST }} + AR_PROJECT: mostlyright-backend + AR_REPO: mostlyright + IMAGE_NAME: earnings-serving + SERVICE: earnings-serving + REGION: europe-west3 + +jobs: + deploy: + name: Build + push audio-free image, roll earnings-serving revision + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Authenticate to GCP (WIF, keyless) + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ vars.WIF_PROVIDER }} + service_account: ${{ vars.DEPLOY_SA_SERVING }} + + - name: Set up gcloud + uses: google-github-actions/setup-gcloud@v2 + + - name: Configure Docker for Artifact Registry + run: gcloud auth configure-docker "${AR_HOST}" --quiet + + - name: Build audio-free earnings-serving image + env: + IMAGE_TAG: ${{ inputs.image_tag }} + run: | + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" + echo "IMAGE=${IMAGE}" >> "$GITHUB_ENV" + # Build from the repo root so the Dockerfile can COPY packages/ + services/. + docker build \ + -f deploy/earnings/serving.Dockerfile \ + -t "${IMAGE}" \ + . + + - name: Assert the built image carries NO audio toolchain (firewall a) + run: | + # Defense in depth: the serving image must physically omit + # faster-whisper / av / ffmpeg / chromium (D-27.9). Fail the deploy if + # any slipped in via a transitive dep. 
+ # Fail CLOSED: a pip failure, a forbidden package, or an audio/browser binary on PATH each exit non-zero. + docker run --rm --entrypoint sh "${IMAGE}" -c \ + 'pkgs=$(pip list --format=freeze) || exit 1; if printf "%s\n" "$pkgs" | grep -Eiq "^(faster[-_.]whisper|av)==|ffmpeg"; then exit 1; fi; for bin in ffmpeg ffprobe chromium chromium-browser; do if command -v "$bin" >/dev/null 2>&1; then exit 1; fi; done' || { + echo "::error::earnings-serving image contains an audio dependency (or could not be inspected) — firewall a (D-27.9) breach" + exit 1 + } + echo "audio-free image OK" + + - name: Push image + run: docker push "${IMAGE}" + + - name: Deploy revision (image swap only; config is Terraform-owned) + run: | + gcloud run deploy "${SERVICE}" \ + --project "${{ vars.SERVING_PROJECT_ID }}" \ + --region "${REGION}" \ + --image "${IMAGE}" \ + --quiet + + - name: Verify H2 min-instances=1 preserved (single always-warm SSE instance) + run: | + MIN=$(gcloud run services describe "${SERVICE}" \ + --project "${{ vars.SERVING_PROJECT_ID }}" \ + --region "${REGION}" \ + --format="value(spec.template.metadata.annotations['autoscaling.knative.dev/minScale'])") + echo "min-instances = ${MIN:-0}" + test "${MIN:-0}" = "1" || { echo "::error::expected H2 min-instances=1 (single always-warm SSE instance)"; exit 1; } diff --git a/.github/workflows/deploy-earnings-stt.yml b/.github/workflows/deploy-earnings-stt.yml new file mode 100644 index 0000000..3c33f54 --- /dev/null +++ b/.github/workflows/deploy-earnings-stt.yml @@ -0,0 +1,86 @@ +name: Deploy earnings-stt (28-11) + +# Phase 28 (28-11) — WIF build+deploy for the STT Cloud Run service (NVIDIA L4 +# GPU, scale-to-zero) in mr-earnings-ingest / us-central1 (L4 GPU is NOT in +# eu-west3 — 28-OPERATOR-INPUTS). faster-whisper / CTranslate2, NO torch (D-27.5). +# Image build/push + service image-swap only; the GPU config, bounded concurrency +# (≤ L4 quota, H8), secrets + SA are Terraform-owned (infra/cloud_run.tf stt). +# +# OPERATOR-GATED: the live GPU smoke (a real transcription on L4) is 28-11 Task 4 +# (operator, autonomous:false) — this workflow ships the deploy path only.
+# +# NOTE: the STT image is CUDA-based and large; the GitHub-hosted runner builds it +# but does not need a GPU (the build installs the CUDA runtime + faster-whisper; +# it never runs inference here). +# +# Setup Variables (from `tofu -chdir=infra output`): WIF_PROVIDER, DEPLOY_SA_INGEST, +# AR_HOST, INGEST_PROJECT_ID. + +on: + workflow_dispatch: + inputs: + image_tag: + description: "Image tag to build + deploy." + required: true + default: "latest" + type: string + +permissions: + id-token: write + contents: read + +env: + AR_HOST: ${{ vars.AR_HOST }} + AR_PROJECT: mostlyright-backend + AR_REPO: mostlyright + IMAGE_NAME: earnings-stt + SERVICE: earnings-stt + REGION: us-central1 + +jobs: + deploy: + name: Build + push STT (CUDA) image, roll the STT service + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Authenticate to GCP (WIF, keyless) + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ vars.WIF_PROVIDER }} + service_account: ${{ vars.DEPLOY_SA_INGEST }} + + - name: Set up gcloud + uses: google-github-actions/setup-gcloud@v2 + + - name: Configure Docker for Artifact Registry + run: gcloud auth configure-docker "${AR_HOST}" --quiet + + - name: Build STT (CUDA + faster-whisper, no torch) image + env: + IMAGE_TAG: ${{ inputs.image_tag }} + run: | + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" + echo "IMAGE=${IMAGE}" >> "$GITHUB_ENV" + docker build -f deploy/earnings/stt.Dockerfile -t "${IMAGE}" . 
+ + - name: Assert the STT image has NO torch (D-27.5 — CTranslate2 only) + run: | + # Fail CLOSED: if pip cannot list packages the step errors out instead of passing vacuously. + docker run --rm --entrypoint sh "${IMAGE}" -c \ + 'pkgs=$(pip list --format=freeze) || exit 1; if printf "%s\n" "$pkgs" | grep -Eiq "^torch=="; then exit 1; fi' || { + echo "::error::earnings-stt image pulled torch (or could not be inspected) — D-27.5 forbids it (faster-whisper/CTranslate2 only)" + exit 1 + } + echo "no-torch OK" + + - name: Push image + run: docker push "${IMAGE}" + + - name: Deploy earnings-stt service (image swap only; GPU config is Terraform-owned) + run: | + gcloud run deploy "${SERVICE}" \ + --project "${{ vars.INGEST_PROJECT_ID }}" \ + --region "${REGION}" \ + --image "${IMAGE}" \ + --quiet diff --git a/.github/workflows/deploy-weather-ingest.yml b/.github/workflows/deploy-weather-ingest.yml new file mode 100644 index 0000000..67fcfad --- /dev/null +++ b/.github/workflows/deploy-weather-ingest.yml @@ -0,0 +1,103 @@ +name: Deploy weather-ingest image + incremental job (28-22) + +# Phase 28 (28-20/28-22) — WIF-authenticated build+push of the SHARED weather +# ingest image and deploy of the daily INCREMENTAL Cloud Run Job in +# mostlyright-satellite (H1) / us-central1. KEYLESS via Workload Identity +# Federation — no SA key files. +# +# ONE image (deploy/weather/ingest.Dockerfile) backs BOTH the backfill fleet +# (weather-backfill, Cloud Batch — submitted by run-weather-backfill.yml) and the +# incremental job (weather-incremental, Cloud Run Job — deployed here). Infra +# references two AR image names (var.image_weather_backfill / +# image_weather_incremental), so this workflow builds ONCE and pushes the SAME +# image under BOTH names/tags — the backfill and incremental deploys then resolve +# the identical bytes. +# +# This is the CHEAP PATH (serving + daily incremental) — deploy it freely. The +# expensive 28 TB backfill fleet is gated behind run-weather-backfill.yml's cost +# sign-off, not this workflow.
+# +# Setup (repo/environment Variables, from `tofu -chdir=infra output`): +# WIF_PROVIDER = +# DEPLOY_SA_SATELLITE = deploy@mostlyright-satellite... (H1: EXISTING project) +# AR_HOST = europe-west3-docker.pkg.dev +# SATELLITE_PROJECT_ID = mostlyright-satellite + +on: + workflow_dispatch: + inputs: + image_tag: + description: "Image tag to build + push under both weather image names." + required: true + default: "latest" + type: string + deploy_incremental: + description: "Also roll the weather-incremental Cloud Run Job onto the new image." + required: true + default: true + type: boolean + +permissions: + id-token: write + contents: read + +env: + AR_HOST: ${{ vars.AR_HOST }} + AR_PROJECT: mostlyright-backend + AR_REPO: mostlyright + BACKFILL_IMAGE_NAME: weather-backfill + INCREMENTAL_IMAGE_NAME: weather-incremental + INCREMENTAL_JOB: weather-incremental + REGION: us-central1 + +jobs: + build-and-deploy: + name: Build once, push both names, roll the incremental job + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Authenticate to GCP (WIF, keyless) + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ vars.WIF_PROVIDER }} + service_account: ${{ vars.DEPLOY_SA_SATELLITE }} + + - name: Set up gcloud + uses: google-github-actions/setup-gcloud@v2 + + - name: Configure Docker for Artifact Registry + run: gcloud auth configure-docker "${AR_HOST}" --quiet + + - name: Build the shared ingest image + env: + IMAGE_TAG: ${{ inputs.image_tag }} + run: | + BASE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}" + BACKFILL_IMAGE="${BASE}/${BACKFILL_IMAGE_NAME}:${IMAGE_TAG}" + INCREMENTAL_IMAGE="${BASE}/${INCREMENTAL_IMAGE_NAME}:${IMAGE_TAG}" + echo "BACKFILL_IMAGE=${BACKFILL_IMAGE}" >> "$GITHUB_ENV" + echo "INCREMENTAL_IMAGE=${INCREMENTAL_IMAGE}" >> "$GITHUB_ENV" + docker build \ + -f deploy/weather/ingest.Dockerfile \ + -t "${BACKFILL_IMAGE}" \ + -t "${INCREMENTAL_IMAGE}" \ + . 
+ + - name: Push both image names (identical bytes) + run: | + docker push "${BACKFILL_IMAGE}" + docker push "${INCREMENTAL_IMAGE}" + + # Roll the daily incremental Cloud Run Job onto the new image (image swap + # only; the --roster/--incremental args + R2-write secrets + SA are + # Terraform-owned in infra/batch.tf). The backfill Batch job is NOT touched + # here — it is submitted separately, after cost sign-off. + - name: Deploy weather-incremental job (image swap only) + if: ${{ inputs.deploy_incremental }} + run: | + gcloud run jobs deploy "${INCREMENTAL_JOB}" \ + --project "${{ vars.SATELLITE_PROJECT_ID }}" \ + --region "${REGION}" \ + --image "${INCREMENTAL_IMAGE}" \ + --quiet diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 0105c03..76c4ac9 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -9,10 +9,16 @@ name: Deploy (hosted GCE platform) # and no SA key is stored in repo secrets. The WIF provider + per-project deploy SAs # are provisioned by the Terraform root in infra/ (28-00). # -# The image-build/push + `gcloud run deploy` / Cloud Batch / MIG steps are -# STUBBED here — later waves (W1 serving, W2 ingest/fleet) fill them in against -# the existing Artifact Registry -# (europe-west3-docker.pkg.dev/mostlyright-backend/mostlyright). 
+# Per-service build/push + deploy now live in dedicated manual-only workflows +# (this file stays a WIF-auth identity smoke + index): +# - deploy-earnings-serving.yml earnings-serving Cloud Run svc (mr-serving) +# - deploy-weather-serving.yml weather-serving Cloud Run svc (mr-serving) +# - deploy-earnings-capture.yml earnings-capture Cloud Run Job (ingest) +# - deploy-earnings-stt.yml earnings-stt Cloud Run GPU svc (ingest, us-central1) +# - deploy-earnings-rolefact.yml earnings-rolefact Cloud Run Job (ingest) +# - deploy-weather-ingest.yml weather ingest img + weather-incremental Job (satellite) +# - run-weather-backfill.yml weather-backfill Cloud Batch fleet (satellite; cost-gated) +# All push to europe-west3-docker.pkg.dev/mostlyright-backend/mostlyright. # # Setup (one-time, after `tofu apply` in infra/): # Set the following repo/environment variables (Settings -> Variables), read @@ -82,17 +88,17 @@ jobs: - name: Verify auth (identity smoke test) run: gcloud auth list --filter=status:ACTIVE --format="value(account)" - # --------------------------------------------------------------------- - # STUB — per-service deploy workflows fill these in (28-10/11/12/13/21/22/30): - # serving : earnings-serving + weather-serving → Cloud Run (eu-west3), - # timeout 3600, SSE max-instances=1 + affinity deploy check (H2). - # ingest : capture Job + rolefact Job (eu-west3) + STT Cloud Run GPU L4 - # (us-central1, bounded concurrency ≤ L4 quota, H8). - # satellite : weather backfill (Cloud Batch, us-central1) + incremental Job - # (H1: EXISTING mostlyright-satellite project). - # All push to europe-west3-docker.pkg.dev/mostlyright-backend/mostlyright. - # --------------------------------------------------------------------- - - name: Build & deploy (placeholder) + # Per-service build+deploy moved to the dedicated workflows listed in the + # header. 
This job stays a keyless-auth identity smoke: it proves the + # selected deploy SA can federate, then points the operator at the right + # per-service workflow to run. + - name: Identity smoke + workflow pointer run: | - echo "Deploy target: ${{ inputs.target }}" - echo "Image build + push + gcloud run deploy stubbed — filled in by W1/W2." + echo "Authenticated deploy SA for target '${{ inputs.target }}':" + gcloud auth list --filter=status:ACTIVE --format="value(account)" + case "${{ inputs.target }}" in + serving) echo "Run: deploy-earnings-serving.yml / deploy-weather-serving.yml" ;; + ingest) echo "Run: deploy-earnings-capture.yml / deploy-earnings-stt.yml / deploy-earnings-rolefact.yml" ;; + satellite) echo "Run: deploy-weather-ingest.yml (incremental) then run-weather-backfill.yml (fleet, cost-gated)" ;; + staging) echo "Staging is gated off (enable_staging=false) until the billing quota increase." ;; + esac diff --git a/.github/workflows/run-weather-backfill.yml b/.github/workflows/run-weather-backfill.yml new file mode 100644 index 0000000..5f3cb45 --- /dev/null +++ b/.github/workflows/run-weather-backfill.yml @@ -0,0 +1,174 @@ +name: Run weather backfill fleet (28-21) + +# Phase 28 (28-21) — submit the weather backfill as a Cloud Batch job in +# mostlyright-satellite (H1) / us-central1. KEYLESS via Workload Identity +# Federation. +# +# ROLLOUT GATE (operator sequence): the DEFAULT run is a 1-STATION PILOT +# (task_count=1) — cheap, proves the read→reduce→R2-upload loop + the big-bytes +# firewall end to end. The FULL 65-shard fleet (the ~28 TB Kalshi∪Polymarket +# roster minus the non-satellite HKO, D-28.8) reduces ~28 TB of raw imagery +# in-region and is the phase's +# largest spend — it is BLOCKED unless the operator sets mode=full AND +# confirm_cost_signoff=true (the H5 pilot cost sign-off). This encodes the +# "serving + incremental first → 1-station pilot → stop at the 28 TB cost number +# for sign-off" rollout as a workflow gate, not a convention. 
+# +# The Batch config below mirrors infra/batch.tf (Spot n2-standard-4, parallelism +# 16, 6h/shard ceiling, dedicated weather-backfill SA, R2-write + EUMETSAT +# secrets, durable progress bucket for crash-safe resume). Keep it in sync with +# infra/batch.tf if that spec changes. +# +# Setup (repo/environment Variables, from `tofu -chdir=infra output`): +# WIF_PROVIDER = +# DEPLOY_SA_SATELLITE = deploy@mostlyright-satellite... +# AR_HOST = europe-west3-docker.pkg.dev +# SATELLITE_PROJECT_ID = mostlyright-satellite +# RUNTIME_SA_WEATHER_BACKFILL= +# R2_BUCKET = mostlyright-derived +# PROGRESS_BUCKET = mostlyright-backfill-progress- + +on: + workflow_dispatch: + inputs: + image_tag: + description: "weather-backfill image tag to run (must already be pushed by deploy-weather-ingest)." + required: true + default: "latest" + type: string + mode: + description: "pilot = 1 station (cheap, default). full = the 65-shard ~28 TB fleet (needs cost sign-off; default GOES satellites cover Americas/E-Pacific stations — non-GOES shards no-op, native ring is 28-26)." + required: true + default: "pilot" + type: choice + options: + - pilot + - full + pilot_station: + description: "ICAO for the pilot (mode=pilot only), e.g. KNYC." + required: true + default: "KNYC" + type: string + confirm_cost_signoff: + description: "H5 pilot cost sign-off — REQUIRED true to run mode=full (the ~28 TB spend)." 
+ required: true + default: false + type: boolean + +permissions: + id-token: write + contents: read + +env: + AR_HOST: ${{ vars.AR_HOST }} + AR_PROJECT: mostlyright-backend + AR_REPO: mostlyright + IMAGE_NAME: weather-backfill + REGION: us-central1 + +jobs: + submit: + name: Submit the backfill Batch job (pilot by default; full gated on cost sign-off) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Enforce the cost sign-off gate for a full run + run: | + if [ "${{ inputs.mode }}" = "full" ] && [ "${{ inputs.confirm_cost_signoff }}" != "true" ]; then + echo "::error::mode=full runs the 65-shard ~28 TB fleet (the phase's largest spend)." + echo "::error::Set confirm_cost_signoff=true (H5 pilot cost sign-off) to proceed, or use mode=pilot." + exit 1 + fi + echo "gate OK: mode=${{ inputs.mode }}" + + - name: Authenticate to GCP (WIF, keyless) + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ vars.WIF_PROVIDER }} + service_account: ${{ vars.DEPLOY_SA_SATELLITE }} + + - name: Set up gcloud + uses: google-github-actions/setup-gcloud@v2 + + # Untrusted workflow_dispatch string inputs are passed via env (NOT + # interpolated into the shell/JSON text) and the JSON is assembled with jq + # --arg, so a hostile image_tag / pilot_station cannot inject shell or break + # the Batch JSON (GitHub script-injection-safe pattern). 
+ - name: Build the Batch job config (task_count + args by mode) + env: + IMAGE_TAG: ${{ inputs.image_tag }} + MODE: ${{ inputs.mode }} + PILOT_STATION: ${{ inputs.pilot_station }} + R2_BUCKET: ${{ vars.R2_BUCKET }} + PROGRESS_BUCKET: ${{ vars.PROGRESS_BUCKET }} + RUNTIME_SA: ${{ vars.RUNTIME_SA_WEATHER_BACKFILL }} + run: | + set -euo pipefail + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" + if [ "$MODE" = "full" ]; then + # 65 = the Kalshi∪Polymarket union (66) minus the one non-satellite + # station HKO (_roster._NON_SATELLITE_STATIONS); one shard per + # satellite-resolvable station so no shard resolves to zero partitions. + TASK_COUNT=65 + # Roster mode: the CLI resolves + shards the 65-station roster by + # BATCH_TASK_INDEX and supplies satellite/product/year defaults. + COMMANDS=$(jq -nc --arg pb "$PROGRESS_BUCKET" --arg rb "$R2_BUCKET" \ + '["--mirror","gcp","--roster","kalshi,polymarket","--progress-bucket",$pb,"--r2-bucket",$rb]') + else + TASK_COUNT=1 + YEAR=$(date -u +%Y) + # Pilot: explicit single-station backfill. The CLI's explicit mode + # (no --roster) requires satellites/products/year-window/out, so pass + # them all. GOES-East ACMC covers the default KNYC pilot; an + # international pilot station needs --satellites overridden. 
+ COMMANDS=$(jq -nc --arg st "$PILOT_STATION" --arg y "$YEAR" \ + --arg pb "$PROGRESS_BUCKET" --arg rb "$R2_BUCKET" \ + '["--mirror","gcp","--satellites","goes16","--products","ABI-L2-ACMC","--stations",$st,"--year-start",$y,"--year-end",$y,"--out","/tmp/derived","--r2-target","--r2-bucket",$rb,"--progress-bucket",$pb]') + fi + jq -n \ + --arg img "${IMAGE}" \ + --argjson tc "${TASK_COUNT}" \ + --argjson cmds "${COMMANDS}" \ + --arg secrets_proj "${AR_PROJECT}" \ + --arg rb "${R2_BUCKET}" \ + --arg pb "${PROGRESS_BUCKET}" \ + --arg sa "${RUNTIME_SA}" \ + '{ + taskGroups: [{ + taskCount: $tc, + parallelism: 16, + taskSpec: { + maxRunDuration: "21600s", + maxRetryCount: 3, + computeResource: { cpuMilli: 4000, memoryMib: 16384 }, + runnables: [{ container: { imageUri: $img, commands: $cmds } }], + environment: { + variables: { R2_BUCKET: $rb, R2_REGION: "auto", PROGRESS_BUCKET: $pb }, + secretVariables: { + R2_ACCOUNT_ID: ("projects/" + $secrets_proj + "/secrets/r2-account-id/versions/latest"), + R2_WRITE_ACCESS_KEY_ID: ("projects/" + $secrets_proj + "/secrets/r2-write-access-key-id/versions/latest"), + R2_WRITE_SECRET_ACCESS_KEY: ("projects/" + $secrets_proj + "/secrets/r2-write-secret-access-key/versions/latest"), + EUMETSAT_CONSUMER_KEY: ("projects/" + $secrets_proj + "/secrets/eumetsat-consumer-key/versions/latest"), + EUMETSAT_CONSUMER_SECRET: ("projects/" + $secrets_proj + "/secrets/eumetsat-consumer-secret/versions/latest") + } + } + } + }], + allocationPolicy: { + serviceAccount: { email: $sa }, + instances: [{ policy: { machineType: "n2-standard-4", provisioningModel: "SPOT" } }] + }, + logsPolicy: { destination: "CLOUD_LOGGING" } + }' > batch-job.json + echo "task_count=${TASK_COUNT}" + cat batch-job.json + + - name: Submit the Batch job (run-scoped name) + run: | + RUN_NAME="weather-backfill-${{ inputs.mode }}-${{ github.run_id }}" + gcloud batch jobs submit "${RUN_NAME}" \ + --project "${{ vars.SATELLITE_PROJECT_ID }}" \ + --location "${REGION}" \ + 
+ --config batch-job.json + echo "Submitted ${RUN_NAME}. Monitor: gcloud batch jobs describe ${RUN_NAME} --location ${REGION} --project ${{ vars.SATELLITE_PROJECT_ID }}" diff --git a/deploy/earnings/capture.Dockerfile b/deploy/earnings/capture.Dockerfile new file mode 100644 index 0000000..418d228 --- /dev/null +++ b/deploy/earnings/capture.Dockerfile @@ -0,0 +1,64 @@ +# Earnings CAPTURE Cloud Run Job — the AUDIO side of the firewall (28-13). +# +# The VOD/replay cold-fetch stage (mr-earnings-ingest). It packages +# `services/earnings/` + the two SDK packages with the `[earnings]` extra +# (faster-whisper + `av`/PyAV) and runs `python -m services.earnings.jobs.capture`. +# +# On the AUDIO side by design (unlike the SLIM weather serving image, which OMITS +# any audio toolchain). The shipped Q4 capture surface extracts the transient +# audio track via PyAV (`av`), which ships FFmpeg's libraries as binary wheels — +# so a system `ffmpeg` is NOT strictly required. We still `apt-get install ffmpeg` +# as belt-and-suspenders for any codec PyAV's bundled libs defer to. +# +# NO chromium: the 27-03 Q4 static-MP4 capture is a cold ranged-GET over httpx +# (the guest form gates the PAGE, not the asset — RESEARCH §2). There is NO +# headless-browser navigation on this path, so no Chromium is installed. +# +# The captured audio is a TRANSIENT artifact: it NEVER gets an R2 key, is NEVER +# served, NEVER a ledger column (D-27.9); it leaves the task only via the private GCS handoff bucket to STT. NON-published: +# this image COPYs `services/` (a monorepo service), never a PyPI wheel. + +FROM python:3.12-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /app + +# ffmpeg for any codec PyAV's bundled FFmpeg libs defer to (audio side — expected +# here, unlike serving). No chromium: the Q4 static-MP4 path is a cold httpx GET.
+RUN apt-get update \ + && apt-get install -y --no-install-recommends ffmpeg \ + && rm -rf /var/lib/apt/lists/* + +# --- Dependency layer -------------------------------------------------------- +# Copy the package sources first so the dep layer caches across app-code edits. +COPY packages/core/ packages/core/ +COPY packages/weather/ packages/weather/ + +# Install core + weather[earnings] (the [earnings] extra pulls faster-whisper + +# av/PyAV — the audio extract + STT engine deps, lazy-imported at runtime). +# google-cloud-pubsub pulls ONE capture-job spec off CAPTURE_JOBS_SUBSCRIPTION; +# google-cloud-storage uploads the transient audio to the private +# AUDIO_HANDOFF_BUCKET for the cross-service handoff to STT (capture + STT are +# separate Cloud Run resources with NO shared disk). Both are lazy-imported +# inside capture.py (never at module load). The handoff bucket is a private, +# in-firewall GCS bucket — audio never gets an R2 key, never served (D-27.9). +RUN pip install \ + ./packages/core \ + "./packages/weather[earnings]" \ + "google-cloud-pubsub>=2.18,<3" \ + "google-cloud-storage>=2.10,<4" + +# --- App layer --------------------------------------------------------------- +# The non-published service is imported as `services.earnings.*` (matching the +# repo-root conftest sys.path convention), so it is copied under /app/services. +COPY services/earnings/ services/earnings/ + +# Cloud Run Job env (infra path): CAPTURE_JOBS_SUBSCRIPTION (per-call spec pulled +# off Pub/Sub) + AUDIO_HANDOFF_BUCKET (private GCS bucket the transient audio is +# uploaded to for the STT handoff). Operator-override manual path: CAPTURE_TICKER / +# CAPTURE_CALL_ID / CAPTURE_WEBCAST_URL. The entrypoint fails loud on a missing var. 
+ENTRYPOINT ["python", "-m", "services.earnings.jobs.capture"] diff --git a/deploy/earnings/rolefact.Dockerfile b/deploy/earnings/rolefact.Dockerfile new file mode 100644 index 0000000..34ccf1f --- /dev/null +++ b/deploy/earnings/rolefact.Dockerfile @@ -0,0 +1,56 @@ +# Earnings ROLEFACT Cloud Run Job — role-attribution + fact-building (28-13). +# +# The POST-audio, CPU-only stage. It reads the persisted transcript TEXT from the +# transcript ledger, role-attributes turns, counts mentions, builds +# `schema.earnings_fact.v1` rows (fail-closed Kalshi filter), writes the fact +# ledger, and OPTIONALLY uploads the derived fact parquet to R2 via the shipped +# write sink. It packages `services/earnings/` + the two SDK packages with the +# `[parquet]` extra + boto3 (R2 write) and runs +# `python -m services.earnings.jobs.rolefact`. +# +# IMAGE-LEVEL AUDIO FIREWALL: this image does NOT install the `[earnings]` extra +# (which pulls the AUDIO toolchain — faster-whisper + av — into a POST-audio CPU +# image, breaking the firewall + bloating the image). rolefact only needs the +# ledger / fact-builder / role-parser code, which lazy-import NOTHING from the +# audio toolchain (verified: `mostlyright.weather.earnings.fact_builder` / +# `role_parser` / `ledger` / `classify_mentions` all import cleanly with +# faster-whisper + av absent). The `[parquet]` extra adds pandas; pyarrow + +# filelock are already base `mostlyrightmd-weather` runtime deps the ledger uses. +# boto3 is added explicitly for the R2 write sink +# (`mostlyright.weather.satellite._r2_sink`, boto3 lazy-imported there, reads the +# write-token creds from the env by NAME). +# +# NO ffmpeg / NO faster-whisper / NO av / NO chromium: entirely post-audio (never +# touches audio bytes — D-27.9). SLIM CPython base. NON-published: COPYs +# `services/`, never a PyPI wheel. 
+ +FROM python:3.12-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /app + +# --- Dependency layer -------------------------------------------------------- +COPY packages/core/ packages/core/ +COPY packages/weather/ packages/weather/ + +# core + weather[parquet] (fact_builder / ledger / role_parser / classify_mentions +# live in the weather earnings module; they need pyarrow + filelock — base weather +# runtime deps — plus pandas from [parquet]; the AUDIO toolchain from [earnings] is +# deliberately NOT installed) + boto3 for the R2 write sink. +RUN pip install \ + ./packages/core \ + "./packages/weather[parquet]" \ + "boto3>=1.34,<2.0" + +# --- App layer --------------------------------------------------------------- +COPY services/earnings/ services/earnings/ + +# Cloud Run Jobs pass the rolefact spec via env (ROLEFACT_TICKER / ROLEFACT_CALL_ID +# / ROLEFACT_TERMS / ROLEFACT_ROSTER; R2_BUCKET — the infra env name — + R2_* write +# creds opt in to the R2 upload; ROLEFACT_R2_BUCKET is a manual-run override). The +# entrypoint fails loud on a missing var. +ENTRYPOINT ["python", "-m", "services.earnings.jobs.rolefact"] diff --git a/deploy/earnings/serving.Dockerfile b/deploy/earnings/serving.Dockerfile new file mode 100644 index 0000000..3dfc077 --- /dev/null +++ b/deploy/earnings/serving.Dockerfile @@ -0,0 +1,67 @@ +# Earnings serving image — the hosted /transcripts + /facts + /capabilities + +# /stream (SSE) REST app (28-12) in mr-serving/eu-west3. +# +# AUDIO FIREWALL (D-27.9, legal — Swatch v. Bloomberg). This SERVING image +# PHYSICALLY OMITS the audio toolchain: it installs mostlyrightmd-weather with +# the [parquet] extra ONLY (pandas for the ledger DataFrame path) — NOT the +# [earnings] extra, so faster-whisper (CTranslate2) and av (PyAV/FFmpeg) are +# ABSENT. There is no ffmpeg, no Chromium, no whisper in this image. 
Audio never +# reaches serving; only text/fact parquet does (via the ledger, and the +# earnings-streaming Pub/Sub bridge which carries a closed text/facts-only +# envelope). The earnings engine's heavy deps are lazy-imported, so importing +# `mostlyright.weather.earnings.{ledger,segment_bus,streaming_transcriber}` here +# needs none of them. +# +# NON-PUBLISHED: this COPYs the `services/earnings/` monorepo service; it is NOT +# a PyPI wheel and MUST NOT enter any published dist (the wheel grep-gate stays +# clean). uvicorn/fastapi/google-cloud-pubsub live in THIS image, never in a dist. +# +# Read-closed by construction: the app fails CLOSED at startup if EARNINGS_API_KEY +# is unset (services/earnings/app.py::_resolve_env_key), so a misconfigured deploy +# crashes loud rather than serving unauthenticated. Cloud Run injects PORT. + +FROM python:3.12-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /app + +# --- Dependency layer -------------------------------------------------------- +# Copy just the package sources the serving app imports (core + weather) first so +# the dep layer caches across app-code edits. +COPY packages/core/ packages/core/ +COPY packages/weather/ packages/weather/ + +# Install the two published distributions with the [parquet] extra ONLY (pandas +# for the ledger read path) — deliberately NOT [earnings] (that pulls +# faster-whisper + av, the audio toolchain the firewall forbids on serving) — plus +# the serving runtime (FastAPI + uvicorn), google-cloud-pubsub (the SSE streaming +# subscriber; only started when EARNINGS_STREAMING_SUBSCRIPTION is set), and boto3 +# for the READ-ONLY R2 source (services/earnings/r2_read.py — a fresh serving +# instance reads the durable transcript/fact parquet the ingest jobs wrote to R2, +# not its empty local disk). All three are lazy-imported, so a local ledger-only +# tier needs none of them at import. 
+RUN pip install \ + ./packages/core \ + "./packages/weather[parquet]" \ + "fastapi>=0.115,<1" \ + "uvicorn[standard]>=0.30" \ + "google-cloud-pubsub>=2.20,<3" \ + "boto3>=1.34,<2.0" + +# --- App layer --------------------------------------------------------------- +# The non-published serving app is imported as `services.earnings.*` (matching the +# repo-root conftest sys.path convention), so it is copied under /app/services. +COPY services/earnings/ services/earnings/ + +ENV PORT=8080 +EXPOSE 8080 + +# One uvicorn worker: the in-process per-key rate limiter + the single-instance +# SSE bus fan-out (H2) are per-process. Cloud Run pins this service at +# min=max=1 (infra/cloud_run.tf) so the shared earnings-streaming subscription is +# consumed by exactly one instance. /healthz is unauthenticated for the probe. +CMD ["sh", "-c", "uvicorn services.earnings.app:app --host 0.0.0.0 --port ${PORT} --workers 1"] diff --git a/deploy/earnings/stt.Dockerfile b/deploy/earnings/stt.Dockerfile new file mode 100644 index 0000000..477f6cf --- /dev/null +++ b/deploy/earnings/stt.Dockerfile @@ -0,0 +1,88 @@ +# Earnings STT Cloud Run GPU service — faster-whisper transcription (28-13). +# +# The GPU transcription stage of the audio-side ingest pipeline. Runs on an L4 GPU +# in us-central1 (there is NO Cloud Run GPU in europe-west3, so this stage is +# region-split from the eu-west3 serving/ingest). It packages `services/earnings/` +# + the two SDK packages with the `[earnings]` extra and serves +# `services.earnings.jobs.stt_server:app` under uvicorn (the one-shot `python -m services.earnings.jobs.stt` CLI stays in the image as the fallback). +# +# STT engine is faster-whisper (CTranslate2) — NO torch (D-27.5). CTranslate2 is +# the CUDA runtime; the cudnn-runtime CUDA base supplies the cuDNN/CUDA libs +# CTranslate2's GPU path links against. faster-whisper + av come from the +# `[earnings]` extra and are LAZY-imported at runtime (never at module load). +# +# Audio is a TRANSIENT input — only the transcript TEXT crosses into the ledger, +# never audio (D-27.9).
NON-published: COPYs `services/`, never a PyPI wheel. + +FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + DEBIAN_FRONTEND=noninteractive + +WORKDIR /app + +# python3.12 on the CUDA base (Ubuntu 22.04 ships 3.10; add the deadsnakes 3.12 +# the SDK floors target). ffmpeg for any codec PyAV's bundled libs defer to. +# +# pip bootstrap: `python3-pip` installs pip for the DISTRO python (3.10), but we +# symlink `python`->3.12 — so `python -m pip` would run under 3.12 with NO pip +# module and the build would fail. Instead bootstrap pip for 3.12 itself via +# `python3.12 -m ensurepip` (needs python3.12-venv, which ships ensurepip's +# wheels), then verify the pip target IS 3.12 before any install. +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + software-properties-common \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + python3.12 \ + python3.12-venv \ + ffmpeg \ + && rm -rf /var/lib/apt/lists/* \ + && ln -sf /usr/bin/python3.12 /usr/local/bin/python \ + && ln -sf /usr/bin/python3.12 /usr/local/bin/python3 \ + && python3.12 -m ensurepip --upgrade \ + && python3.12 -m pip install --upgrade --break-system-packages pip \ + # Fail the build loudly if `python -m pip` is not running under 3.12. + && python -m pip --version \ + && python -m pip --version | grep -q "python 3.12" + +# --- Dependency layer -------------------------------------------------------- +COPY packages/core/ packages/core/ +COPY packages/weather/ packages/weather/ + +# core + weather[earnings] — the [earnings] extra pins faster-whisper>=1.0,<2.0 +# (CTranslate2 Whisper; NO torch) + av. python3.12's pip resolves the wheels. +# fastapi + uvicorn back the HTTP surface: infra declares STT as a Cloud Run +# SERVICE (google_cloud_run_v2_service.stt), so the container MUST serve $PORT. 
+# google-cloud-storage downloads the transient audio HANDOFF object from the +# private AUDIO_HANDOFF_BUCKET (capture + STT are separate Cloud Run resources +# with NO shared disk) — lazy-imported inside stt._resolve_audio_reference. boto3 +# backs the R2 write sink (mostlyright.weather.satellite._r2_sink, lazy-imported): +# STT publishes the durable TEXT transcript parquet to R2 so the SEPARATE role/fact +# Job can read it across containers (Codex R7 P1) — never audio (D-27.9). NO torch +# anywhere (D-27.5): the STT engine is CTranslate2/faster-whisper only. +RUN python -m pip install --break-system-packages \ + ./packages/core \ + "./packages/weather[earnings]" \ + "google-cloud-storage>=2.10,<4" \ + "google-cloud-pubsub>=2.18,<3" \ + "boto3>=1.34,<2.0" \ + "fastapi>=0.115,<1" \ + "uvicorn[standard]>=0.30" + +# --- App layer --------------------------------------------------------------- +COPY services/earnings/ services/earnings/ + +# STT runs as a Cloud Run SERVICE: serve the HTTP transcription surface +# (services.earnings.jobs.stt_server:app — GET /healthz + POST /transcribe) so the +# revision becomes ready and can accept GPU transcription requests. The one-shot +# `python -m services.earnings.jobs.stt` CLI is retained in the image for the GCE +# L4 MIG fallback. faster-whisper is lazy-loaded on the first /transcribe, so +# /healthz answers without the GPU model-load cost. Cloud Run injects $PORT. +ENV PORT=8080 +EXPOSE 8080 +CMD ["sh", "-c", "uvicorn services.earnings.jobs.stt_server:app --host 0.0.0.0 --port ${PORT} --workers 1"] diff --git a/deploy/weather/ingest.Dockerfile b/deploy/weather/ingest.Dockerfile new file mode 100644 index 0000000..ca20b1d --- /dev/null +++ b/deploy/weather/ingest.Dockerfile @@ -0,0 +1,41 @@ +# Weather ingest image — the satellite backfill fleet (Cloud Batch, 28-21) AND +# the daily incremental job (Cloud Run Job, 28-22), both in mostlyright-satellite +# (H1) / us-central1. 
ONE image, two invocations (the ENTRYPOINT is the backfill +# subcommand; the batch/job args select roster/shard/incremental). +# +# BIG-BYTES FIREWALL (§4b): the ~28 TB raw imagery NEVER leaves the US. This image +# runs the Phase-25/26 satellite CLI near the GCS NODD mirror (--mirror gcp), +# reduces in-region, and uploads ONLY tiny derived per-station×date parquet to R2 +# (--r2-bucket). The R2 WRITE token + EUMETSAT creds are injected from Secret +# Manager as env by the deploy layer (infra/batch.tf) — never baked in. +# +# NON-PUBLISHED deploy image: it installs the PUBLISHED mostlyrightmd-weather +# [satellite] extra (boto3/s3fs/gcsfs/h5netcdf/xarray/numpy/eumdac), so unlike the +# earnings services it COPYs NO `services/` tree — it is a thin CLI wrapper around +# shipped wheel code. The heavy HDF5/xarray codecs ship as wheels (no apt). + +FROM python:3.12-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /app + +# Copy sources first so the (heavy) dep layer caches across edits. +COPY packages/core/ packages/core/ +COPY packages/weather/ packages/weather/ + +# Install core + weather[satellite] (the native satellite ring: anonymous NODD +# S3/GCS + keyed EUMETSAT Data Store, parquet reduce). No serving/audio deps. +RUN pip install \ + ./packages/core \ + "./packages/weather[satellite]" + +# The container command is the satellite `backfill` subcommand; Cloud Batch / +# Cloud Run Job append the flags (--mirror gcp --roster kalshi,polymarket +# [--incremental yesterday] --progress-bucket BUCKET --r2-bucket BUCKET). Cloud Batch +# sets BATCH_TASK_INDEX/BATCH_TASK_COUNT so the CLI selects this shard's station +# from the roster (one array-task shard per station, D-28.8).
+ENTRYPOINT ["python", "-m", "mostlyright.weather.satellite", "backfill"] diff --git a/infra/cloud_run.tf b/infra/cloud_run.tf index 53deb7f..75941c2 100644 --- a/infra/cloud_run.tf +++ b/infra/cloud_run.tf @@ -308,6 +308,14 @@ resource "google_cloud_run_v2_service" "stt" { # One request per instance: GPU transcription is not multiplexed. max_instance_request_concurrency = 1 + # /transcribe is SYNCHRONOUS — it transcribes the whole call before responding, + # and the capture Job holds its Pub/Sub lease waiting on that response. The + # default 300s request timeout would cut a real (multi-minute → up-to-~an-hour) + # transcription short (→ 5xx → capture NACK → duplicate recapture, Codex R7 P1), + # so pin it to the Cloud Run maximum. capture's STT_TRIGGER_TIMEOUT_SECONDS + # default (3600s, services/earnings/jobs/capture.py) matches this ceiling. + timeout = "3600s" + node_selector { accelerator = var.stt_gpu_type } @@ -327,12 +335,131 @@ resource "google_cloud_run_v2_service" "stt" { name = "AUDIO_HANDOFF_BUCKET" value = "earnings-audio-handoff-${google_project.ingest.number}" } + + # R2 (text data plane) so STT publishes the durable transcript parquet the + # SEPARATE role/fact Job reads across containers (Codex R7 P1). This is the + # TEXT transcript, never audio — the audio firewall (audio never gets an R2 + # key) is unchanged; STT is on the ingest/write side (write token, D-28.9). 
+ env { + name = "R2_BUCKET" + value = var.r2_bucket + } + env { + name = "R2_REGION" + value = local.r2_region + } + env { + name = "R2_WRITE_ACCESS_KEY_ID" + value_source { + secret_key_ref { + secret = data.google_secret_manager_secret.r2_write_access_key_id.id + version = "latest" + } + } + } + env { + name = "R2_WRITE_SECRET_ACCESS_KEY" + value_source { + secret_key_ref { + secret = data.google_secret_manager_secret.r2_write_secret_access_key.id + version = "latest" + } + } + } + env { + name = "R2_ACCOUNT_ID" + value_source { + secret_key_ref { + secret = data.google_secret_manager_secret.r2_account_id.id + version = "latest" + } + } + } + + # Live SSE publish: the serving app ALWAYS starts the earnings-streaming + # subscriber (EARNINGS_STREAMING_SUBSCRIPTION is set unconditionally on + # earnings_serving), so STT must PUBLISH its transcript segments to that + # topic or /stream has nothing to fan out for hosted calls (Codex R7-6 P2). + # The capture->STT trigger posts no streaming fields, so the service derives + # them from these env vars. The STT SA holds pubsub.publisher on the topic + # (deploy_iam.tf). Set EARNINGS_STREAMING_ENABLED=0 to turn hosted live-publish off. + env { + name = "EARNINGS_STREAMING_ENABLED" + value = "1" + } + env { + name = "EARNINGS_STREAMING_PROJECT" + value = google_project.ingest.project_id + } + env { + name = "EARNINGS_STREAMING_TOPIC" + value = google_pubsub_topic.earnings_streaming.name + } } } + # The STT SA needs its R2-write + account secret bindings (secrets.tf) before the + # revision can mount them.
+ depends_on = [ + google_project_service.enabled, + google_secret_manager_secret_iam_member.access, + ] +} + +# ===================================================================== +# Private in-firewall AUDIO HANDOFF bucket (28-10/28-13) — capture -> STT +# ===================================================================== +# capture and STT are SEPARATE Cloud Run resources with NO shared disk, so the +# transient audio crosses between them via this PRIVATE, in-firewall GCS object +# (never an R2 key, never served — D-27.9). Both workloads (the capture Job and the STT service) reference it by NAME via the +# AUDIO_HANDOFF_BUCKET env; it MUST exist and both SAs must be able to reach it, or +# the first real capture upload / STT download fails with a missing-bucket / 403 +# and the pipeline can never run end-to-end (Codex R7-2 P1). Co-located with STT +# (var.stt_region) so the GPU download is in-region. +# +# Lifecycle: the audio is transient — STT deletes each object right after the +# transcript is durably written (stt._delete_handoff_source), and this 1-day reaper +# is the BACKSTOP the code comments cite for any object a failed run orphans. +resource "google_storage_bucket" "earnings_audio_handoff" { + project = google_project.ingest.project_id + name = "earnings-audio-handoff-${google_project.ingest.number}" + location = upper(var.stt_region) + uniform_bucket_level_access = true + force_destroy = true # transient audio only; safe to empty on destroy + public_access_prevention = "enforced" + + lifecycle_rule { + condition { + age = 1 # days — transient audio; STT deletes post-ledger, this reaps orphans + } + action { + type = "Delete" + } + } + + labels = { + phase = "28" + role = "earnings-audio-handoff" + } + depends_on = [google_project_service.enabled] } +# capture WRITES the transient audio (create + overwrite on an idempotent retry).
+resource "google_storage_bucket_iam_member" "capture_handoff_writer" { + bucket = google_storage_bucket.earnings_audio_handoff.name + role = "roles/storage.objectAdmin" + member = local.sa_earnings_capture +} + +# STT READS the handoff object and DELETES it after the transcript is durable +# (post-ledger cleanup, D-27.9) — so it needs get+delete, not read-only. +resource "google_storage_bucket_iam_member" "stt_handoff_admin" { + bucket = google_storage_bucket.earnings_audio_handoff.name + role = "roles/storage.objectAdmin" + member = local.sa_earnings_stt +} + # ===================================================================== # Capture Job — mr-earnings-ingest / europe-west3 (28-10) # ===================================================================== @@ -341,7 +468,11 @@ resource "google_cloud_run_v2_service" "stt" { # NEVER an R2 key. Egress is pinned to one static IP via the VPC connector → # Cloud NAT (28-10 earnings_network.tf) so the Amazon-IVS session pin holds; # the connector is referenced by name via env so this file stays decoupled from -# the network plan. Long task timeout covers a 90-min call. +# the network plan. The task timeout must cover the capture (60-90 min) PLUS the +# SYNCHRONOUS STT trigger wait (capture blocks on /transcribe until the transcript +# is written, up to STT's 3600s ceiling) — otherwise the job is killed before it +# acks and Pub/Sub redelivers → duplicate recapture (Codex R7-2 P1). The decoupled +# (fire-and-forget) trigger seam removes this coupling; until then, size for both. 
resource "google_cloud_run_v2_job" "capture" { project = google_project.ingest.project_id name = "earnings-capture" @@ -350,7 +481,7 @@ resource "google_cloud_run_v2_job" "capture" { template { template { service_account = google_service_account.earnings_capture.email - timeout = "5400s" # 90 min + timeout = "9000s" # 150 min = ~90 min capture + up to 60 min synchronous STT wait # Scratch disk sized for a 90-min call; audio dies here or in the handoff # bucket (never R2). @@ -374,6 +505,14 @@ resource "google_cloud_run_v2_job" "capture" { name = "CAPTURE_JOBS_SUBSCRIPTION" value = google_pubsub_subscription.capture_jobs.id } + # Capture triggers STT after the handoff upload by POSTing the gs:// ref to + # the STT service's /transcribe. STT is a PRIVATE Cloud Run service, so the + # capture SA holds run.invoker on it (deploy_iam.tf) and the POST carries a + # metadata-server ID token minted for this URL as its audience. + env { + name = "STT_SERVICE_URL" + value = google_cloud_run_v2_service.stt.uri + } } } } diff --git a/infra/deploy_iam.tf b/infra/deploy_iam.tf new file mode 100644 index 0000000..523e1cb --- /dev/null +++ b/infra/deploy_iam.tf @@ -0,0 +1,182 @@ +# Phase 28 — DEPLOY-TIME IAM (the deploy-runtime layer). +# +# infra/{cloud_run,batch,weather_serving,secrets,pubsub}.tf declare the compute + +# RUNTIME identities + the firewall secret bindings. This file adds the four +# deploy-time grants (sections 1-4) a CI `gcloud run deploy` / `gcloud batch jobs submit` / image +# push actually needs — the bindings a Codex review flagged as ABSENT — plus two RUNTIME grants in sections 5-6 (STT pubsub.publisher on earnings-streaming; capture run.invoker on the private STT service): +# +# 1. Public run.invoker (allUsers) on the two internet-facing serving services +# (GATE #2 signed 2026-07-02: public exposure approved; the services stay +# behind the fail-closed MOSTLYRIGHT_API_KEY / EARNINGS_API_KEY auth + the +# global request ceiling — run.invoker=allUsers only allows the request to +# REACH the app, the app's auth middleware is the real gate).
/healthz is +# unauthenticated by design (the Cloud Run probe idiom). +# 2. Deploy SA → roles/run.developer in each target project (roll new Cloud Run +# service/job revisions). +# 3. Deploy SA → roles/iam.serviceAccountUser (ACT-AS) on each RUNTIME SA it +# assigns to a service/job/Batch task — without this `--service-account=` +# is denied. +# 4. Deploy SA → roles/artifactregistry.writer on the REUSED backend repo, so +# the per-service workflows can `docker push`. This AMENDS the reader-only +# posture (artifact_registry.tf, T-28-00-04): images are built+pushed FROM CI +# as the deploy SA (build-and-push-from-Actions), which needs writer on the +# target repo. Scoped to the one repo, additive, WRITER (not admin) — the +# backend project is otherwise untouched (D-28.1). +# +# Least privilege: run.developer (not run.admin); serviceAccountUser scoped to the +# specific runtime SA (not project-wide); writer (not admin) on one repo. + +# ===================================================================== +# 1. 
Public invoker on the two serving services (GATE #2, fail-closed auth) +# ===================================================================== +resource "google_cloud_run_v2_service_iam_member" "earnings_serving_public" { + project = google_cloud_run_v2_service.earnings_serving.project + location = google_cloud_run_v2_service.earnings_serving.location + name = google_cloud_run_v2_service.earnings_serving.name + role = "roles/run.invoker" + member = "allUsers" +} + +resource "google_cloud_run_v2_service_iam_member" "weather_serving_public" { + project = google_cloud_run_v2_service.weather_serving.project + location = google_cloud_run_v2_service.weather_serving.location + name = google_cloud_run_v2_service.weather_serving.name + role = "roles/run.invoker" + member = "allUsers" +} + +# NOTE: the STT service (google_cloud_run_v2_service.stt) is deliberately NOT +# granted public invoker — it is an internal ingest GPU workload, not an +# internet-facing surface (audio firewall). + +# ===================================================================== +# 2. Deploy SA → run.developer in each target project +# ===================================================================== +resource "google_project_iam_member" "deploy_serving_run_developer" { + project = google_project.serving.project_id + role = "roles/run.developer" + member = "serviceAccount:${google_service_account.deploy["serving"].email}" +} + +resource "google_project_iam_member" "deploy_ingest_run_developer" { + project = google_project.ingest.project_id + role = "roles/run.developer" + member = "serviceAccount:${google_service_account.deploy["ingest"].email}" +} + +# Satellite deploy SA rolls the weather-incremental Cloud Run Job (H1). 
+resource "google_project_iam_member" "deploy_satellite_run_developer" { + project = var.satellite_project_id + role = "roles/run.developer" + member = "serviceAccount:${google_service_account.deploy_satellite.email}" +} + +# Satellite deploy SA also SUBMITS the backfill Cloud Batch fleet +# (run-weather-backfill.yml). batch.jobsEditor is the submit/get/delete role. +resource "google_project_iam_member" "deploy_satellite_batch_editor" { + project = var.satellite_project_id + role = "roles/batch.jobsEditor" + member = "serviceAccount:${google_service_account.deploy_satellite.email}" +} + +# ===================================================================== +# 3. Deploy SA → act-as (serviceAccountUser) on each runtime SA it assigns +# ===================================================================== +# Serving deploy SA runs both serving services as the `serving` runtime SA. +resource "google_service_account_iam_member" "deploy_serving_actas_serving" { + service_account_id = google_service_account.serving.name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.deploy["serving"].email}" +} + +# Ingest deploy SA runs capture / stt / rolefact as their dedicated runtime SAs. 
+resource "google_service_account_iam_member" "deploy_ingest_actas_capture" { + service_account_id = google_service_account.earnings_capture.name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.deploy["ingest"].email}" +} + +resource "google_service_account_iam_member" "deploy_ingest_actas_stt" { + service_account_id = google_service_account.earnings_stt.name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.deploy["ingest"].email}" +} + +resource "google_service_account_iam_member" "deploy_ingest_actas_rolefact" { + service_account_id = google_service_account.earnings_rolefact.name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.deploy["ingest"].email}" +} + +# Satellite deploy SA runs the incremental Job + the backfill Batch fleet as +# their dedicated runtime SAs (both need act-as: the Job's service_account and +# the Batch allocation_policy.service_account). +resource "google_service_account_iam_member" "deploy_satellite_actas_incremental" { + service_account_id = google_service_account.weather_incremental.name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.deploy_satellite.email}" +} + +resource "google_service_account_iam_member" "deploy_satellite_actas_backfill" { + service_account_id = google_service_account.weather_backfill.name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.deploy_satellite.email}" +} + +# ===================================================================== +# 4. Deploy SA → artifactregistry.writer on the REUSED backend repo (CI push) +# ===================================================================== +# The created-project deploy SAs (serving + ingest [+ staging when enabled]). 
+resource "google_artifact_registry_repository_iam_member" "writer" { + for_each = google_service_account.deploy + + project = local.ar_project + location = local.ar_location + repository = local.ar_repository + role = "roles/artifactregistry.writer" + member = "serviceAccount:${each.value.email}" +} + +# The satellite deploy SA (EXISTING project, H1) also pushes the weather ingest +# image, so it needs writer too (it already has reader from wif.tf). +resource "google_artifact_registry_repository_iam_member" "writer_satellite" { + project = local.ar_project + location = local.ar_location + repository = local.ar_repository + role = "roles/artifactregistry.writer" + member = "serviceAccount:${google_service_account.deploy_satellite.email}" +} + +# ===================================================================== +# 5. STT runtime SA → pubsub.publisher on earnings-streaming (live SSE path) +# ===================================================================== +# The LIVE path publishes transcript segments to the earnings-streaming topic +# straight from the STT service as they are transcribed (28-GCE-ARCHITECTURE §3: +# "live: STT publishes segments -> in-process bus -> SSE /stream"), gated by the +# opt-in EARNINGS_STREAMING_ENABLED / publish_live. pubsub.tf grants +# roles/pubsub.publisher to the ROLE/FACT SA (the post-call fact publisher) but +# NOT the STT SA, so the live publish would 403. Grant the STT runtime SA +# publisher on the SAME topic (least privilege: publisher, one topic). +resource "google_pubsub_topic_iam_member" "stt_earnings_streaming_publisher" { + project = google_project.ingest.project_id + topic = google_pubsub_topic.earnings_streaming.name + role = "roles/pubsub.publisher" + member = local.sa_earnings_stt +} + +# ===================================================================== +# 6. 
Capture SA → run.invoker on the PRIVATE STT service (capture->STT trigger) +# ===================================================================== +# After the handoff upload, the capture Job POSTs the gs:// audio ref to the STT +# service's /transcribe to schedule transcription (capture->STT end-to-end). STT +# is deliberately NOT public (audio-side, no allUsers invoker), so the capture +# runtime SA needs run.invoker on it — and the capture POST carries a +# metadata-server ID token (audience = the STT URL). Least privilege: invoker on +# the ONE service. +resource "google_cloud_run_v2_service_iam_member" "capture_invokes_stt" { + project = google_cloud_run_v2_service.stt.project + location = google_cloud_run_v2_service.stt.location + name = google_cloud_run_v2_service.stt.name + role = "roles/run.invoker" + member = local.sa_earnings_capture +} diff --git a/infra/outputs.tf b/infra/outputs.tf index e281f65..d69b304 100644 --- a/infra/outputs.tf +++ b/infra/outputs.tf @@ -69,11 +69,11 @@ output "serving_urls" { output "pubsub_topics" { description = "Pub/Sub transport resource IDs — the earnings-streaming SSE bridge (C2) + capture-jobs (+ dead-letter, H7)." 
value = { - earnings_streaming = google_pubsub_topic.earnings_streaming.id - earnings_streaming_sub = google_pubsub_subscription.earnings_streaming.id - capture_jobs = google_pubsub_topic.capture_jobs.id - capture_jobs_sub = google_pubsub_subscription.capture_jobs.id - capture_jobs_deadletter = google_pubsub_topic.capture_jobs_deadletter.id + earnings_streaming = google_pubsub_topic.earnings_streaming.id + earnings_streaming_sub = google_pubsub_subscription.earnings_streaming.id + capture_jobs = google_pubsub_topic.capture_jobs.id + capture_jobs_sub = google_pubsub_subscription.capture_jobs.id + capture_jobs_deadletter = google_pubsub_topic.capture_jobs_deadletter.id } } @@ -84,3 +84,22 @@ output "budget_notification_channels" { pubsub = google_monitoring_notification_channel.budget_pubsub.id } } + +# --- Deploy-runtime layer (28 deploy workflows) --- +# The extra repo Variables the per-service deploy workflows read, beyond the +# project_ids / deploy_service_accounts / wif_provider_name above. + +output "satellite_project_number" { + description = "mostlyright-satellite project number (H1) — general-purpose output (the Batch secrets now resolve via the backend secrets project, not this number)." + value = var.satellite_project_number +} + +output "backfill_progress_bucket" { + description = "Durable GCS completion-marker bucket for the backfill fleet (C4 crash-safe resume). Set as PROGRESS_BUCKET for run-weather-backfill.yml." + value = google_storage_bucket.backfill_progress.name +} + +output "r2_bucket" { + description = "The single platform R2 bucket derived parquet is written to / served from. Set as R2_BUCKET for the ingest deploy workflows." 
+ value = var.r2_bucket +} diff --git a/infra/service_accounts.tf b/infra/service_accounts.tf index 8e46b9b..ae691e4 100644 --- a/infra/service_accounts.tf +++ b/infra/service_accounts.tf @@ -29,7 +29,7 @@ resource "google_service_account" "earnings_stt" { project = google_project.ingest.project_id account_id = "earnings-stt" display_name = "Earnings STT (Cloud Run GPU L4) runtime SA" - description = "Runs the STT GPU workload (28-11). Read-only on the audio handoff bucket; emits transcript segments; no serving grant." + description = "Runs the STT GPU workload (28-11). Reads + deletes (post-ledger cleanup) the audio handoff bucket; publishes the text transcript to R2; no serving grant." depends_on = [google_project_service.enabled] } @@ -72,17 +72,20 @@ resource "google_service_account" "weather_incremental" { locals { # Convenience member strings for the firewall bindings downstream. - sa_earnings_capture = "serviceAccount:${google_service_account.earnings_capture.email}" - sa_earnings_stt = "serviceAccount:${google_service_account.earnings_stt.email}" - sa_earnings_rolefact = "serviceAccount:${google_service_account.earnings_rolefact.email}" - sa_serving = "serviceAccount:${google_service_account.serving.email}" - sa_weather_backfill = "serviceAccount:${google_service_account.weather_backfill.email}" - sa_weather_incremental = "serviceAccount:${google_service_account.weather_incremental.email}" + sa_earnings_capture = "serviceAccount:${google_service_account.earnings_capture.email}" + sa_earnings_stt = "serviceAccount:${google_service_account.earnings_stt.email}" + sa_earnings_rolefact = "serviceAccount:${google_service_account.earnings_rolefact.email}" + sa_serving = "serviceAccount:${google_service_account.serving.email}" + sa_weather_backfill = "serviceAccount:${google_service_account.weather_backfill.email}" + sa_weather_incremental = "serviceAccount:${google_service_account.weather_incremental.email}" - # The SHARED R2 write token members (v1 honest 
posture): ingest role/fact + - # BOTH satellite weather SAs. R2 tokens are bucket-scoped, not prefix-scoped — - # there is NO per-zone write isolation in v1 (Task 4 v1.x hardening splits it). + # The SHARED R2 write token members (v1 honest posture): ingest STT (publishes + # the durable transcript parquet so the SEPARATE role/fact Job can read it across + # containers — text only, never audio, D-27.9) + ingest role/fact + BOTH + # satellite weather SAs. R2 tokens are bucket-scoped, not prefix-scoped — there is + # NO per-zone write isolation in v1 (Task 4 v1.x hardening splits it). r2_write_members = [ + local.sa_earnings_stt, local.sa_earnings_rolefact, local.sa_weather_backfill, local.sa_weather_incremental, diff --git a/packages/weather/src/mostlyright/weather/earnings/ledger.py b/packages/weather/src/mostlyright/weather/earnings/ledger.py index fb7a969..4a4cb7f 100644 --- a/packages/weather/src/mostlyright/weather/earnings/ledger.py +++ b/packages/weather/src/mostlyright/weather/earnings/ledger.py @@ -192,6 +192,39 @@ def append(self, rows: Sequence[Mapping[str, object]], *, ticker: str, call_id: os.replace(tmp, path) return len(merged) + def replace(self, rows: Sequence[Mapping[str, object]], *, ticker: str, call_id: str) -> int: + """Idempotently OVERWRITE the ``(ticker, call_id)`` partition with ``rows``. + + Unlike :meth:`append` (read-modify-write concatenate), this REPLACES the + partition under the same ``FileLock`` — so a producer that writes the + COMPLETE artifact for a call (the batch STT transcript, the role/fact facts) + is idempotent across retries / Pub/Sub redelivery: re-running yields the same + partition, never DOUBLED rows that would make downstream counting + double-count (Codex R7-2 P1). Same audio-free normalization + cross-field + write-guard as :meth:`append`. An empty ``rows`` removes the partition (an + idempotent zero-row write). Returns the row count written. 
+ """ + path = self.path(ticker, call_id) + lock = FileLock(str(path) + ".lock", timeout=LOCK_TIMEOUT_SECONDS) + with lock: + if not rows: + # Idempotent empty write: drop any stale partition so a re-run that + # legitimately produces zero rows does not leave prior rows behind. + if path.exists(): + os.remove(path) + return 0 + normalized = [self._strip_to_schema(r) for r in rows] + self._validate_normalized(normalized) + table = pa.Table.from_pylist( + [{name: r.get(name) for name in self._column_names} for r in normalized], + schema=self._pa_schema, + ) + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(".tmp") + pq.write_table(table, tmp, version="2.6", coerce_timestamps="us") + os.replace(tmp, path) + return len(normalized) + def read(self, ticker: str, call_id: str) -> list[dict[str, object]]: """Return all persisted rows for ``(ticker, call_id)`` (empty on miss).""" path = self.path(ticker, call_id) diff --git a/packages/weather/src/mostlyright/weather/satellite/__main__.py b/packages/weather/src/mostlyright/weather/satellite/__main__.py index c0b3afc..b94c437 100644 --- a/packages/weather/src/mostlyright/weather/satellite/__main__.py +++ b/packages/weather/src/mostlyright/weather/satellite/__main__.py @@ -7,6 +7,19 @@ and the D9 ``--mirror aws|gcp`` transport selector (default ``aws``, validated by argparse ``choices`` so an unknown mirror is rejected BEFORE any run). Dispatches to :func:`_backfill.bulk_backfill`. + + Two invocation modes (28-21): + * **Explicit** — pass ``--satellites/--products/--stations/--year-start/ + --year-end/--out`` (all required). Unchanged from Phase 25. + * **Roster** — pass ``--roster NAME`` (e.g. 
``kalshi,polymarket``) and the + CLI resolves the committed settlement-station roster, selects THIS + array-task's shard (``--shard-index``/``--shard-count``, else the Cloud + Batch ``BATCH_TASK_INDEX``/``BATCH_TASK_COUNT`` env, else the whole + roster), and supplies documented defaults for satellites/products/years/ + out. ``--incremental yesterday`` scopes it to the current UTC year with + resume forced on. A ``--r2-bucket`` in roster mode ENABLES the R2 upload + sink (the fleet's whole purpose). This is the path the shipped + ``infra/batch.tf`` invokes. - ``probe`` — the on-demand / live throughput probe (D10 SAT-25-11). Measures the anonymous-throttle / diminishing-returns knee against the LIVE NOAA buckets and writes the SOURCE-LIMITS findings artifact + satellite section @@ -23,10 +36,33 @@ from __future__ import annotations import argparse +import os import sys +from datetime import UTC, datetime, timedelta from pathlib import Path from ._backfill import bulk_backfill +from ._roster import resolve_roster, shard_roster + +#: Default satellites for roster/incremental mode. batch.tf passes NO +#: ``--satellites``, so roster mode must supply the canonical native-ring +#: default. The ACMC-only MVP (project memory) runs on the GOES East/West pair — +#: the two operational GOES platforms — so the fleet covers the Americas span the +#: settlement stations concentrate in. (Explicit-args mode still requires +#: ``--satellites``; this default ONLY applies when ``--roster`` is given.) +_DEFAULT_ROSTER_SATELLITES: tuple[str, ...] = ("goes16", "goes18") + +#: Default products for roster/incremental mode: the ACMC-only MVP cloud mask +#: (``ABI-L2-ACMC``, the cheap CONUS product — matches the ``_sources`` GOES +#: ``default_product``). batch.tf passes NO ``--products``. +_DEFAULT_ROSTER_PRODUCTS: tuple[str, ...] = ("ABI-L2-ACMC",) + +#: Default historical backfill start year for roster mode (batch.tf passes NO +#: ``--year-start``). 
GOES-16 first light is 2017; the fleet backfills the full +#: modern GOES record. The per-slice ``available_since`` clamp in ``_backfill`` +#: skips any (satellite, month) before a platform's first-light with no I/O, so a +#: conservative floor here is safe. year_end defaults to the current UTC year. +_DEFAULT_ROSTER_YEAR_START: int = 2017 def _split_csv(value: str) -> list[str]: @@ -47,27 +83,86 @@ def _build_parser() -> argparse.ArgumentParser: help="Fleet bulk backfill — per-(satellite,year,month) slices, resume, " "Thread/Process split, --mirror aws|gcp.", ) + # NOTE (28-21): --satellites/--products/--stations/--year-start/--year-end + # are REQUIRED in explicit mode but OPTIONAL when --roster is given (roster + # mode supplies documented defaults — batch.tf passes none of them). The + # required-ness is enforced in _run_backfill, not by argparse ``required=``, + # so the two modes can share one subparser. Explicit mode stays byte-identical. bf.add_argument( "--satellites", type=_split_csv, - required=True, - help="Comma-separated satellites, e.g. goes16,goes19 (GOES-East) or goes18 (GOES-West).", + default=None, + help="Comma-separated satellites, e.g. goes16,goes19 (GOES-East) or goes18 " + "(GOES-West). REQUIRED in explicit mode; in --roster mode defaults to the " + "native-ring default (goes16,goes18 — the operational GOES pair).", ) bf.add_argument( "--products", type=_split_csv, - required=True, - help="Comma-separated ABI L2 products, e.g. ABI-L2-ACMC.", + default=None, + help="Comma-separated ABI L2 products, e.g. ABI-L2-ACMC. REQUIRED in " + "explicit mode; in --roster mode defaults to ABI-L2-ACMC (the ACMC-only MVP).", ) bf.add_argument( "--stations", type=_split_csv, - required=True, - help="Comma-separated ICAO/NWS station codes, e.g. KNYC.", + default=None, + help="Comma-separated ICAO/NWS station codes, e.g. KNYC. 
REQUIRED in " + "explicit mode; MUTUALLY EXCLUSIVE with --roster (the roster supplies the " + "shard's stations).", + ) + bf.add_argument("--year-start", type=int, default=None, dest="year_start") + bf.add_argument("--year-end", type=int, default=None, dest="year_end") + bf.add_argument( + "--out", + type=Path, + default=None, + help="Cache root output dir. REQUIRED in explicit mode; in --roster mode " + "defaults to the resolved home/env cache root (MOSTLYRIGHT_CACHE_DIR).", + ) + # ---- 28-21 roster / shard / progress / incremental -------------------- + bf.add_argument( + "--roster", + default=None, + metavar="NAME", + help="Resolve the committed settlement-station roster NAME (e.g. " + "'kalshi,polymarket') and back-fill only THIS array-task's shard. The " + "shard is selected by --shard-index/--shard-count, else the Cloud Batch " + "env (BATCH_TASK_INDEX/BATCH_TASK_COUNT), else the whole roster " + "(index=0,count=1). Mutually exclusive with --stations.", + ) + bf.add_argument( + "--shard-index", + type=int, + default=None, + dest="shard_index", + help="0-based shard index for --roster (default: env BATCH_TASK_INDEX, else 0).", + ) + bf.add_argument( + "--shard-count", + type=int, + default=None, + dest="shard_count", + help="Total shard count for --roster (default: env BATCH_TASK_COUNT, else 1).", + ) + bf.add_argument( + "--progress-bucket", + default=None, + dest="progress_bucket", + metavar="BUCKET", + help="Durable GCS completion-marker bucket for crash-safe resume (C4). " + "Wired to a durable GcsProgressStore in roster/incremental mode so preempted " + "Spot slices rehydrate markers from GCS and skip completed partitions.", + ) + bf.add_argument( + "--incremental", + choices=["yesterday"], + default=None, + help="Scope the run to a recent window instead of the full historical " + "backfill. 'yesterday' sets year_start=year_end=current UTC year and " + "forces resume=True so only new/missing partitions are fetched. 
" + "(Day-granular incremental is a deferred 28-22 SDK follow-up.)", ) - bf.add_argument("--year-start", type=int, required=True, dest="year_start") - bf.add_argument("--year-end", type=int, required=True, dest="year_end") - bf.add_argument("--out", type=Path, required=True, help="Cache root output dir.") bf.add_argument( "--max-workers", type=int, @@ -151,23 +246,197 @@ def _build_parser() -> argparse.ArgumentParser: return parser +def _default_out() -> Path: + """Resolve the default ``--out`` cache root for roster/incremental mode. + + batch.tf passes no ``--out``, so roster mode defaults to the SAME home/env + cache root the cache tier resolves (honoring ``MOSTLYRIGHT_CACHE_DIR``), so + the fleet's partitions land in the canonical cache layout. + """ + from mostlyright._internal._cache_dir import resolve_cache_root_without_v1 + + return resolve_cache_root_without_v1() + + +def _resolve_shard_index_count(args: argparse.Namespace) -> tuple[int, int]: + """Resolve (shard_index, shard_count) for --roster mode. + + Precedence: explicit ``--shard-index``/``--shard-count`` > the Cloud Batch + env (``BATCH_TASK_INDEX``/``BATCH_TASK_COUNT``) > the default ``(0, 1)`` (the + whole roster — e.g. the incremental Cloud Run Job, which is NOT an array job). + Index and count are resolved INDEPENDENTLY (either can come from its flag or + its env). Non-integer env values raise a clear error. + """ + + def _from(flag: int | None, env_name: str, default: int) -> int: + if flag is not None: + return flag + raw = os.environ.get(env_name) + if raw is None or raw == "": + return default + try: + return int(raw) + except ValueError as exc: + raise ValueError(f"env {env_name}={raw!r} is not an integer") from exc + + index = _from(args.shard_index, "BATCH_TASK_INDEX", 0) + count = _from(args.shard_count, "BATCH_TASK_COUNT", 1) + return index, count + + +#: ICAO first-letter prefixes inside the GOES-East/West footprint (Americas + +#: E-Pacific). 
A roster station outside this set is NOT visible to the GOES-only +#: default satellites and needs Himawari (Asia/Pacific) / Meteosat (Europe/Africa) +#: / VIIRS. Coarse by design — it drives a WARNING, never a silent skip. +_GOES_FOOTPRINT_ICAO_PREFIXES: tuple[str, ...] = ("K", "C", "M", "P", "S", "T") + + +def _filter_to_goes_footprint(stations: list[str]) -> list[str]: + """Drop (and loudly log) shard stations the GOES-only default satellites can't see. + + The default roster satellites are GOES-East/West, whose footprint is the + Americas / E-Pacific. A roster station outside that footprint (EDDM, RJTT, + FACT, ...) resolves to ZERO GOES coverage, so backfilling it under the default + satellites would fetch nothing and then mark the slice ``completed`` — a SILENT + empty "success" that leaves the advertised backfill unpopulated and wastes Spot + (Codex R7-3 P1). So EXCLUDE those stations from the default-satellite run + entirely (no empty slices, no misleading complete markers) and log WHY. Global + coverage is the native-ring path: pass ``--satellites`` (Himawari/Meteosat/VIIRS) + to back-fill them (the 28-26 native-ring roster backfill), which bypasses this + filter. Returns the GOES-coverable subset (possibly empty for a non-GOES shard). + """ + kept = [s for s in stations if s[:1].upper().startswith(_GOES_FOOTPRINT_ICAO_PREFIXES)] + excluded = [s for s in stations if s not in kept] + if excluded: + print( + "NOTE: excluding roster stations OUTSIDE the GOES footprint from this " + f"default-satellite run (no GOES coverage): {', '.join(excluded)}. " + "They are NOT back-filled here — pass --satellites " + "(Himawari/Meteosat/VIIRS) for native-ring global coverage (28-26). 
" + "Excluding them avoids empty slices being marked complete.", + file=sys.stderr, + ) + return kept + + def _run_backfill(args: argparse.Namespace) -> int: + # --- 28-21: reconcile roster mode vs the explicit-args mode -------------- + if args.roster is not None and args.stations is not None: + raise ValueError( + "--roster and --stations are mutually exclusive: the roster supplies " + "this shard's stations, so do not also pass --stations." + ) + + resume = args.resume + year_start = args.year_start + year_end = args.year_end + + # --incremental yesterday: year-granular resume window (28-22 deferred the + # true day-granular incremental). Force resume so only new/missing partitions + # for yesterday's year are fetched. + if args.incremental == "yesterday": + # Key on YESTERDAY's year, not today's: on Jan 1 (UTC) "yesterday" is + # Dec 31 of the PRIOR year, so using today's year would never refresh that + # prior-year December partition (Codex R7-3 P2). + yesterday = datetime.now(UTC).date() - timedelta(days=1) + year_start = yesterday.year + year_end = yesterday.year + resume = True + + if args.roster is not None: + # Roster mode: resolve the committed roster, select this task's shard, + # and supply documented defaults for the params batch.tf does NOT pass. + roster = resolve_roster(args.roster) # raises loud ValueError on unknown + index, count = _resolve_shard_index_count(args) + stations = list(shard_roster(roster, index, count)) + satellites = args.satellites or list(_DEFAULT_ROSTER_SATELLITES) + products = args.products or list(_DEFAULT_ROSTER_PRODUCTS) + # Coverage guard (no SILENT under-coverage): the settlement-station roster + # spans the globe, but the GOES-only default satellites see only the + # Americas / E-Pacific. EXCLUDE any shard station outside the GOES footprint + # from the default-satellite run (rather than fetching nothing and marking + # the empty slice complete — Codex R7-3 P1); the exclusion is logged loudly + # to the Cloud Batch logs. 
Global coverage = pass --satellites + # (Himawari/Meteosat/VIIRS), which bypasses this filter (28-26 native ring). + if args.satellites is None: + stations = _filter_to_goes_footprint(stations) + if not stations: + print( + "no GOES-coverable stations in this shard under the default " + "satellites — nothing to back-fill (pass --satellites for " + "native-ring coverage). Exiting cleanly.", + file=sys.stderr, + ) + return 0 + if year_start is None: + year_start = _DEFAULT_ROSTER_YEAR_START + if year_end is None: + year_end = datetime.now(UTC).year + out = args.out if args.out is not None else _default_out() + else: + # Explicit mode: unchanged contract — all of these are REQUIRED. + missing = [ + name + for name, val in ( + ("--satellites", args.satellites), + ("--products", args.products), + ("--stations", args.stations), + ("--year-start", year_start), + ("--year-end", year_end), + ("--out", args.out), + ) + if val is None + ] + if missing: + raise ValueError( + f"missing required argument(s) {missing} (required in explicit mode; " + f"pass --roster to run the committed settlement-station roster instead)" + ) + satellites = args.satellites + products = args.products + stations = args.stations + out = args.out + kwargs: dict = { - "satellites": args.satellites, - "products": args.products, - "stations": args.stations, - "year_start": args.year_start, - "year_end": args.year_end, - "out": args.out, - "resume": args.resume, + "satellites": satellites, + "products": products, + "stations": stations, + "year_start": year_start, + "year_end": year_end, + "out": out, + "resume": resume, "executor": args.executor, "mirror": args.mirror, } if args.max_workers is not None: kwargs["max_workers"] = args.max_workers - # 28-20: thread the OPT-IN R2 sink target. Off (None) unless --r2-target. - if getattr(args, "r2_target", False): + + # 28-20/28-21: the OPT-IN R2 sink. Explicit mode keeps the pre-28-20 gate + # (--r2-target flag required, backward compatible). 
Roster mode is the fleet + # upload path: batch.tf passes ONLY --r2-bucket and EXPECTS the derived + # parquet to be uploaded, so a --r2-bucket in roster mode ENABLES the sink + # even without the explicit --r2-target flag. + r2_enabled = bool(getattr(args, "r2_target", False)) or ( + args.roster is not None and args.r2_bucket is not None + ) + if r2_enabled: kwargs["r2_target"] = args.r2_bucket + + # C4 (28-21): --progress-bucket is the durable GCS completion-marker bucket for + # crash-safe resume. Wire it to the GCS-backed progress store so a preempted + # Spot slice (or a replacement VM) rehydrates its markers from GCS and SKIPS + # already-uploaded partitions instead of reprocessing under a fresh local disk. + # Each shard writes a DISJOINT marker object (shard-index in the key) so array + # tasks never clobber each other's progress. + if args.progress_bucket is not None: + from ._progress import GcsProgressStore + + shard_index, _shard_count = _resolve_shard_index_count(args) + bucket = args.progress_bucket.rstrip("/") + gcs_uri = f"gs://{bucket}/progress/shard-{shard_index:05d}/progress.json" + kwargs["progress_store"] = GcsProgressStore(gcs_uri) + print(f"using durable GCS progress store for crash-safe resume: {gcs_uri}") + result = bulk_backfill(**kwargs) print( f"backfill done: {result.slices_completed} slices completed, " diff --git a/packages/weather/src/mostlyright/weather/satellite/_r2_sink.py b/packages/weather/src/mostlyright/weather/satellite/_r2_sink.py index 137d5f2..eb931d7 100644 --- a/packages/weather/src/mostlyright/weather/satellite/_r2_sink.py +++ b/packages/weather/src/mostlyright/weather/satellite/_r2_sink.py @@ -110,4 +110,47 @@ def upload(local_path: Path | str, bucket: str, key: str, *, r2_target: str | No return key -__all__ = ["upload"] +def download(bucket: str, key: str, local_path: Path | str, *, r2_target: str | None = None) -> str: + """Download one R2 object (``s3.download_file``) to ``local_path``; return the path. 
+ + The read complement of :func:`upload`, used by the hosted ingest chain to + rehydrate a durable partition into a fresh Cloud Run container's local cache + (e.g. the role/fact Job pulling the STT-written transcript parquet — capture, + STT, and role/fact run in SEPARATE Cloud Run resources with isolated ephemeral + disks, so the transcript crosses between them only via the R2 data plane). Uses + the SAME write-token client as :func:`upload` (an R2 read-and-write token + grants ``GetObject``); the ingest SAs already hold that token. + + ``r2_target`` is accepted for signature symmetry with :func:`upload`; the + effective bucket is the explicit ``bucket`` argument. The parent directory of + ``local_path`` must already exist (the ledger path resolver creates it). + + Raises: + Whatever ``botocore`` raises on a missing key / transport error — the + caller decides whether a miss is fatal (fail loud) or a soft fallback. + """ + client = _get_r2_client() + client.download_file(bucket, key, str(local_path)) + return str(local_path) + + +def delete(bucket: str, key: str, *, r2_target: str | None = None) -> None: + """Delete one R2 object (``s3.delete_object``); idempotent (a missing key is a no-op). + + Used by the hosted ingest chain to TOMBSTONE a durable partition that an + idempotent replace has emptied — e.g. a role/fact rerun that legitimately + produces zero facts must clear the previously-uploaded + ``earnings/facts/{ticker}/{call_id}.parquet`` so serving (which reads R2 as the + durable store) does not keep serving stale facts. ``delete_object`` is + idempotent on S3/R2 (deleting an absent key succeeds), so a first-time + zero-fact call is a safe no-op. Uses the same write-token client as + :func:`upload`. + + ``r2_target`` is accepted for signature symmetry; the effective bucket is the + explicit ``bucket`` argument. 
+ """ + client = _get_r2_client() + client.delete_object(Bucket=bucket, Key=key) + + +__all__ = ["delete", "download", "upload"] diff --git a/packages/weather/src/mostlyright/weather/satellite/_roster.py b/packages/weather/src/mostlyright/weather/satellite/_roster.py new file mode 100644 index 0000000..9eeb1a1 --- /dev/null +++ b/packages/weather/src/mostlyright/weather/satellite/_roster.py @@ -0,0 +1,292 @@ +"""Committed Kalshi/Polymarket settlement-station roster for the fleet backfill. + +Phase 28 (28-21). The hosted weather backfill runs as a Cloud Batch ARRAY JOB: +one shard per SATELLITE-RESOLVABLE settlement station (``task_count = 65``: the +66-station Kalshi∪Polymarket union MINUS the one non-satellite station HKO — see +``_NON_SATELLITE_STATIONS``). Each array task resolves its shard from a STABLE, +DETERMINISTIC roster so that shard +``N`` always maps to the same station across Spot retries — a stable +shard-index→station mapping is load-bearing for crash-safe resume (a retried +shard must re-derive the SAME station's partitions, never a different one). + +**Source of truth (why this is a committed snapshot, NOT a runtime import).** +The canonical roster is the union of two live ``markets``-package catalogs: + + - ``markets.catalog.kalshi_stations.KALSHI_SETTLEMENT_STATIONS`` — the Kalshi + NHIGH/NLOW settlement stations (values are ``StationCitation`` objects whose + ``.station`` is the 4-letter ICAO). + - ``markets.polymarket.load_polymarket_city_stations()`` — the Polymarket + city→role→ICAO map (inner values are the ICAO strings). + +The union of those two — sorted, deduped — is 66 ICAOs; the satellite roster +frozen below is those 66 MINUS ``_NON_SATELLITE_STATIONS`` (HKO), i.e. the 65 +that resolve to a satellite ``StationInfo`` (D-28.8; ``infra/batch.tf`` +``task_count = 65``). We snapshot it here, in ``packages/weather``, rather than +importing ``markets`` at runtime because: + + 1. 
The satellite/weather deploy image MUST NOT pull the ``markets`` package + (dependency + audit isolation — the weather backfill has no business + importing the markets catalogs on the fleet). + 2. A frozen roster is deterministic and reviewable; drift is caught in CI. + +``tests/satellite/test_roster.py`` imports the LIVE ``markets`` catalogs and +asserts this snapshot still equals their sorted union, so any catalog drift +(a station added/removed upstream) fails CI loudly and forces a conscious +re-snapshot here — the roster can never silently diverge from the markets truth. +""" + +from __future__ import annotations + +__all__ = [ + "ROSTERS", + "SETTLEMENT_STATION_ROSTER", + "resolve_roster", + "shard_roster", +] + +#: The canonical Kalshi/Polymarket SATELLITE-backfill roster (D-28.8): the 65 +#: 4-letter ICAO codes that are the union of the live Kalshi + Polymarket +#: settlement catalogs (66) MINUS ``_NON_SATELLITE_STATIONS`` (HKO, which has no +#: satellite StationInfo), SORTED and deduped. Verified against the live +#: ``markets`` union by ``test_roster.py`` (both drift AND non-resolvable-station +#: regressions fail CI). Sorted + frozen so the shard-index→station mapping is +#: stable across array-task retries. +SETTLEMENT_STATION_ROSTER: tuple[str, ...] 
= ( + "CYYZ", + "EDDM", + "EFHK", + "EGLC", + "EHAM", + "EPWA", + "FACT", + "KATL", + "KAUS", + "KBKF", + "KBNA", + "KBOS", + "KCVG", + "KDAL", + "KDCA", + "KDEN", + "KDFW", + "KDTW", + "KHOU", + "KIAH", + "KLAS", + "KLAX", + "KLGA", + "KMDW", + "KMIA", + "KMSP", + "KNYC", + "KORD", + "KPHL", + "KPHX", + "KSEA", + "KSFO", + "KSLC", + "LEMD", + "LFPB", + "LIMC", + "LLBG", + "LTAC", + "LTFM", + "MMMX", + "MPMG", + "NZWN", + "OEJN", + "OPKC", + "RCSS", + "RJTT", + "RKPK", + "RKSI", + "RPLL", + "SAEZ", + "SBGR", + "UUWW", + "VILK", + "WMKK", + "WSSS", + "ZBAA", + "ZGGG", + "ZGSZ", + "ZHCC", + "ZHHH", + "ZSJN", + "ZSPD", + "ZSQD", + "ZUCK", + "ZUUU", +) + + +# Committed sub-snapshot: the Kalshi membership of the union (including the stations it shares with Polymarket). This is NOT a +# second source of truth — ``test_roster.py`` asserts (a) the full union equals +# the live Kalshi/Polymarket catalog and (b) this set equals every live Kalshi +# settlement ICAO, so drift in either fails CI. The two markets overlap (six +# stations appear in both), which is why the union — not the sum — is 66 (the satellite roster is 65: that's 66 minus HKO). The +# split rosters below are derived from this set so they stay in lockstep. +_KALSHI_STATIONS: frozenset[str] = frozenset( + { + "KATL", + "KAUS", + "KBNA", + "KBOS", + "KCVG", + "KDCA", + "KDEN", + "KDFW", + "KDTW", + "KIAH", + "KLAS", + "KLAX", + "KMDW", + "KMIA", + "KMSP", + "KNYC", + "KPHL", + "KPHX", + "KSEA", + "KSFO", + "KSLC", + } +) + + +# Committed sub-snapshot: the Polymarket membership of the union (the inner +# ICAOs of ``load_polymarket_city_stations()``). Like ``_KALSHI_STATIONS`` this is +# NOT a second source of truth — ``test_roster.py`` asserts it equals the live +# Polymarket catalog. The Kalshi/Polymarket union is 66 (the two overlap); the satellite +# roster is those 66 minus HKO (65). Set sizes are 21 (Kalshi) and 51 (Polymarket); minus the 6 shared stations that gives 66. 
+_POLYMARKET_STATIONS: frozenset[str] = frozenset( + { + "CYYZ", + "EDDM", + "EFHK", + "EGLC", + "EHAM", + "EPWA", + "FACT", + "HKO", + "KATL", + "KAUS", + "KBKF", + "KDAL", + "KHOU", + "KLAX", + "KLGA", + "KMIA", + "KORD", + "KSEA", + "KSFO", + "LEMD", + "LFPB", + "LIMC", + "LLBG", + "LTAC", + "LTFM", + "MMMX", + "MPMG", + "NZWN", + "OEJN", + "OPKC", + "RCSS", + "RJTT", + "RKPK", + "RKSI", + "RPLL", + "SAEZ", + "SBGR", + "UUWW", + "VILK", + "WMKK", + "WSSS", + "ZBAA", + "ZGGG", + "ZGSZ", + "ZHCC", + "ZHHH", + "ZSJN", + "ZSPD", + "ZSQD", + "ZUCK", + "ZUUU", + } +) + + +#: Settlement stations that appear in the market catalogs but have NO satellite +#: ``StationInfo`` in the SDK station registry, so a satellite backfill shard for +#: them would resolve to ZERO partitions (a silent data hole in the fleet). They +#: are EXCLUDED from the satellite roster. ``HKO`` is the Hong Kong Observatory +#: pseudo-identifier (NOT a standard ICAO — Hong Kong's airport is ``VHHH``); its +#: markets settle against the HKO Open Data API, not satellite imagery, so it is a +#: deferred non-satellite station (project HKO-unblock). ``test_roster.py`` asserts +#: every ROSTER station DOES resolve, so this list stays honest against the +#: registry — a newly-unresolvable catalog station fails CI rather than adding a +#: silent empty shard. +_NON_SATELLITE_STATIONS: frozenset[str] = frozenset({"HKO"}) + + +#: The CLI ``--roster NAME`` registry. ``batch.tf`` passes the literal +#: ``"kalshi,polymarket"`` (the full union MINUS non-satellite stations). The split +#: names are provided as a convenience; every value is a sorted slice of the +#: committed snapshot. 
+ROSTERS: dict[str, tuple[str, ...]] = { + "kalshi,polymarket": SETTLEMENT_STATION_ROSTER, + "kalshi": tuple(s for s in SETTLEMENT_STATION_ROSTER if s in _KALSHI_STATIONS), + "polymarket": tuple(s for s in SETTLEMENT_STATION_ROSTER if s in _POLYMARKET_STATIONS), +} + + +def resolve_roster(name: str) -> tuple[str, ...]: + """Resolve a ``--roster`` name to its station tuple. + + Fails LOUD and EARLY (before any I/O) on an unknown roster name so a typo in + the Terraform container args surfaces as a clear error at startup rather than + a silent empty run. + + Args: + name: The roster name, e.g. ``"kalshi,polymarket"`` (the batch.tf value). + + Returns: + The sorted station tuple for that roster. + + Raises: + ValueError: ``name`` is not a registered roster. + """ + try: + return ROSTERS[name] + except KeyError: + raise ValueError(f"unknown roster {name!r}; expected one of {sorted(ROSTERS)}") from None + + +def shard_roster(roster: tuple[str, ...], index: int, count: int) -> tuple[str, ...]: + """Return this array-task's deterministic shard of ``roster``. + + Round-robin slice (``roster[index::count]``). Round-robin keeps every shard + NON-EMPTY whenever ``count <= len(roster)`` (each of the first ``len(roster)`` + shards gets ≥1 station), and — because it is a pure function of + ``(index, count)`` — shard ``index`` maps to the SAME stations across retries. + For ``count == len(roster)`` (the batch.tf ``task_count = 65`` case) each shard + is exactly one station. + + Args: + roster: The full station tuple (e.g. from :func:`resolve_roster`). + index: This task's 0-based shard index (``BATCH_TASK_INDEX``). + count: The total number of shards (``BATCH_TASK_COUNT``). + + Returns: + The stations owned by shard ``index`` (possibly empty when + ``count > len(roster)`` and ``index >= len(roster)``). + + Raises: + ValueError: ``count < 1`` or ``index`` is out of ``[0, count)``. 
+ """ + if count < 1: + raise ValueError(f"shard count must be >= 1; got {count}") + if not (0 <= index < count): + raise ValueError( + f"shard index {index} out of range for count {count} (need 0 <= index < count)" + ) + return roster[index::count] diff --git a/packages/weather/tests/earnings/test_ledger_kalshi_validation.py b/packages/weather/tests/earnings/test_ledger_kalshi_validation.py index c3a5e66..b967b1e 100644 --- a/packages/weather/tests/earnings/test_ledger_kalshi_validation.py +++ b/packages/weather/tests/earnings/test_ledger_kalshi_validation.py @@ -108,3 +108,40 @@ def test_each_canonical_compound_type_persists(self, tmp_path) -> None: ledger = FactLedger(root=tmp_path) rows = [_row(compound_type=ct) for ct in COMPOUND_TYPE_VALUES] assert ledger.append(rows, ticker="ORCL", call_id="C1") == len(COMPOUND_TYPE_VALUES) + + +class TestReplaceIsIdempotent: + """``replace`` OVERWRITES a partition so a retried complete-artifact write is + idempotent — STT (transcript) and role/fact (facts) use it so a redelivered + call never doubles rows and double-counts (Codex R7-2 P1).""" + + def test_replace_overwrites_not_appends(self, tmp_path) -> None: + ledger = FactLedger(root=tmp_path) + rows = [_row(matched_surface_form="AI")] + assert ledger.replace(rows, ticker="ORCL", call_id="C1") == 1 + # A second identical replace (a retry) must NOT double the partition. + assert ledger.replace(rows, ticker="ORCL", call_id="C1") == 1 + assert len(ledger.read("ORCL", "C1")) == 1 + + def test_replace_shrinks_partition(self, tmp_path) -> None: + ledger = FactLedger(root=tmp_path) + ledger.replace([_row(), _row(), _row()], ticker="ORCL", call_id="C1") + assert len(ledger.read("ORCL", "C1")) == 3 + # A re-run producing FEWER rows must not leave the stale extras behind. 
+ assert ledger.replace([_row()], ticker="ORCL", call_id="C1") == 1 + assert len(ledger.read("ORCL", "C1")) == 1 + + def test_replace_empty_removes_partition(self, tmp_path) -> None: + ledger = FactLedger(root=tmp_path) + ledger.replace([_row()], ticker="ORCL", call_id="C1") + assert len(ledger.read("ORCL", "C1")) == 1 + assert ledger.replace([], ticker="ORCL", call_id="C1") == 0 + assert ledger.read("ORCL", "C1") == [] + + def test_replace_still_fails_closed_on_bad_kalshi_row(self, tmp_path) -> None: + # The fail-closed Kalshi guard must run on replace, exactly like append. + ledger = FactLedger(root=tmp_path) + bad = _row(role_source="diarization_advisory", kalshi_counted=True) + with pytest.raises(KalshiCountRuleViolation): + ledger.replace([bad], ticker="ORCL", call_id="C1") + assert ledger.read("ORCL", "C1") == [] diff --git a/packages/weather/tests/satellite/test_cli_roster.py b/packages/weather/tests/satellite/test_cli_roster.py new file mode 100644 index 0000000..7004451 --- /dev/null +++ b/packages/weather/tests/satellite/test_cli_roster.py @@ -0,0 +1,354 @@ +"""CLI roster/shard/incremental tests (28-21). + +Argparse-level tests: call ``main([...])`` with the module-level ``bulk_backfill`` +monkeypatched so no network / no cache I/O happens. We assert the CLI resolves +the roster shard, defaults, incremental window, and R2 enablement into the exact +kwargs ``bulk_backfill`` receives — proving the shipped ``infra/batch.tf`` +container args actually run. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +import pytest +from mostlyright.weather.satellite._roster import ( + SETTLEMENT_STATION_ROSTER, + resolve_roster, + shard_roster, +) + +# ``__main__`` imports ``_backfill`` which imports the transport (_goes_s3 → +# boto3/s3fs) at module scope, so importing the CLI requires the [satellite] +# optional extra. 
In the base no-extra CI fast-suite skip this whole module +# cleanly — the dedicated satellite-coverage lane installs the extra and runs +# these. Mirrors test_satellite_backfill.py. +try: + from mostlyright.weather.satellite import __main__ as cli + + _HAVE_SATELLITE_DEPS = True +except ImportError: # pragma: no cover - exercised only without the extra + cli = None # type: ignore[assignment] + _HAVE_SATELLITE_DEPS = False + +pytestmark = pytest.mark.skipif( + not _HAVE_SATELLITE_DEPS, + reason="satellite CLI tests require the [satellite] optional extra (boto3/s3fs)", +) + + +@dataclass +class _FakeResult: + slices_completed: int = 0 + slices_skipped_resume: int = 0 + total_rows_written: int = 0 + duration_s: float = 0.0 + + +@pytest.fixture +def captured(monkeypatch): + """Monkeypatch the module-level bulk_backfill; capture its kwargs.""" + calls: list[dict] = [] + + def _fake(**kwargs): + calls.append(kwargs) + return _FakeResult() + + monkeypatch.setattr(cli, "bulk_backfill", _fake) + return calls + + +def test_batch_tf_backfill_args_run(captured, monkeypatch): + """The exact backfill container args from batch.tf parse + dispatch.""" + # batch.tf appends these to the `backfill` entrypoint; shard via flags here. + rc = cli.main( + [ + "backfill", + "--mirror", + "gcp", + "--roster", + "kalshi,polymarket", + "--r2-bucket", + "b", + "--shard-index", + "0", + "--shard-count", + "66", + ] + ) + assert rc == 0 + assert len(captured) == 1 + kw = captured[0] + expected_shard = list(shard_roster(resolve_roster("kalshi,polymarket"), 0, 66)) + assert kw["stations"] == expected_shard + assert len(kw["stations"]) == 1 # count (66) >= len(roster) (65) -> at most one station/shard + assert kw["mirror"] == "gcp" + # roster mode defaults + assert kw["satellites"] == list(cli._DEFAULT_ROSTER_SATELLITES) + assert kw["products"] == list(cli._DEFAULT_ROSTER_PRODUCTS) + assert kw["year_start"] == cli._DEFAULT_ROSTER_YEAR_START + # --r2-bucket in roster mode ENABLES the sink even without --r2-target. 
+ assert kw["r2_target"] == "b" + + +def test_shard_from_batch_task_env(captured, monkeypatch): + """Shard index/count come from BATCH_TASK_INDEX/COUNT when flags are absent.""" + monkeypatch.setenv("BATCH_TASK_INDEX", "5") + monkeypatch.setenv("BATCH_TASK_COUNT", "66") + # Explicit --satellites so this test isolates env-driven shard resolution from + # the default-satellite GOES-footprint filter (tested separately below). + rc = cli.main( + [ + "backfill", + "--mirror", + "gcp", + "--roster", + "kalshi,polymarket", + "--satellites", + "goes16", + "--r2-bucket", + "b", + ] + ) + assert rc == 0 + kw = captured[0] + expected_shard = list(shard_roster(resolve_roster("kalshi,polymarket"), 5, 66)) + assert kw["stations"] == expected_shard + + +def test_shard_index_flag_overrides_env(captured, monkeypatch): + """Explicit --shard-index/--shard-count win over the env.""" + monkeypatch.setenv("BATCH_TASK_INDEX", "5") + monkeypatch.setenv("BATCH_TASK_COUNT", "66") + rc = cli.main( + [ + "backfill", + "--roster", + "kalshi,polymarket", + "--satellites", + "goes16", + "--shard-index", + "2", + "--shard-count", + "8", + ] + ) + assert rc == 0 + kw = captured[0] + assert kw["stations"] == list(shard_roster(resolve_roster("kalshi,polymarket"), 2, 8)) + + +def test_roster_no_shard_defaults_to_whole_roster(captured, monkeypatch): + """No shard flags + no env -> whole roster (index=0, count=1), e.g. Cloud Run Job.""" + monkeypatch.delenv("BATCH_TASK_INDEX", raising=False) + monkeypatch.delenv("BATCH_TASK_COUNT", raising=False) + # Explicit --satellites so the whole roster is passed through unfiltered (the + # default-satellite GOES footprint filter is tested separately below). 
+ rc = cli.main( + ["backfill", "--roster", "kalshi,polymarket", "--satellites", "goes16", "--r2-bucket", "b"] + ) + assert rc == 0 + kw = captured[0] + assert kw["stations"] == list(SETTLEMENT_STATION_ROSTER) + + +def test_roster_default_satellites_filter_non_goes_stations(captured, monkeypatch): + """Default (GOES-only) satellites EXCLUDE roster stations outside the GOES + footprint — rather than fetching them empty and marking the slice complete + (silent under-coverage, Codex R7-3 P1).""" + monkeypatch.delenv("BATCH_TASK_INDEX", raising=False) + monkeypatch.delenv("BATCH_TASK_COUNT", raising=False) + rc = cli.main(["backfill", "--roster", "kalshi,polymarket", "--r2-bucket", "b"]) + assert rc == 0 + passed = captured[0]["stations"] + assert passed # non-empty — the roster has many GOES-footprint stations + assert all(s[:1].upper().startswith(cli._GOES_FOOTPRINT_ICAO_PREFIXES) for s in passed) + # A GOES station survives; European/Asian stations are dropped. + assert "KNYC" in passed + assert "EDDM" not in passed and "RJTT" not in passed + # Exactly the GOES-footprint subset of the committed roster. + expected = [ + s + for s in SETTLEMENT_STATION_ROSTER + if s[:1].upper().startswith(cli._GOES_FOOTPRINT_ICAO_PREFIXES) + ] + assert passed == expected + + +def test_roster_all_non_goes_shard_noops(captured, monkeypatch): + """A shard whose stations are all outside the GOES footprint cleanly no-ops under + the default satellites — no bulk_backfill, no empty-complete markers.""" + monkeypatch.delenv("BATCH_TASK_INDEX", raising=False) + monkeypatch.delenv("BATCH_TASK_COUNT", raising=False) + # shard 5 of 66 resolves to EPWA (Warsaw) — outside the GOES footprint. 
+ non_goes = list(shard_roster(resolve_roster("kalshi,polymarket"), 5, 66)) + assert non_goes and all( + not s[:1].upper().startswith(cli._GOES_FOOTPRINT_ICAO_PREFIXES) for s in non_goes + ) + rc = cli.main( + [ + "backfill", + "--roster", + "kalshi,polymarket", + "--shard-index", + "5", + "--shard-count", + "66", + "--r2-bucket", + "b", + ] + ) + assert rc == 0 + assert captured == [] # filtered to empty -> nothing dispatched + + +def test_incremental_yesterday_single_year_resume(captured, monkeypatch): + """--incremental yesterday sets a single-year window and forces resume.""" + from datetime import UTC, datetime, timedelta + + # Yesterday's year (== today's except across the Jan-1 UTC boundary). + year = (datetime.now(UTC).date() - timedelta(days=1)).year + monkeypatch.delenv("BATCH_TASK_INDEX", raising=False) + monkeypatch.delenv("BATCH_TASK_COUNT", raising=False) + rc = cli.main( + [ + "backfill", + "--mirror", + "gcp", + "--roster", + "kalshi,polymarket", + "--incremental", + "yesterday", + "--r2-bucket", + "b", + ] + ) + assert rc == 0 + kw = captured[0] + assert kw["year_start"] == year + assert kw["year_end"] == year + assert kw["resume"] is True + + +def test_roster_and_stations_mutually_exclusive(captured): + """--roster + --stations raises a clear error before any dispatch.""" + with pytest.raises(ValueError, match="mutually exclusive"): + cli.main( + [ + "backfill", + "--roster", + "kalshi,polymarket", + "--stations", + "KNYC", + ] + ) + assert captured == [] + + +def test_progress_bucket_wires_gcs_progress_store(captured, monkeypatch): + """--progress-bucket constructs a durable GcsProgressStore and threads it in (C4). + + Each shard gets a DISJOINT marker object (shard-index in the key) so array + tasks never clobber. The store is faked (no real GCS I/O at construction). 
+ """ + built: list[str] = [] + + class _FakeStore: + def __init__(self, gcs_uri, *, fs=None): + built.append(gcs_uri) + + # The CLI does `from ._progress import GcsProgressStore` inside _run_backfill. + # Patch on the module object (the string path would resolve the `satellite` + # public function, not the submodule). + from mostlyright.weather.satellite import _progress + + monkeypatch.setattr(_progress, "GcsProgressStore", _FakeStore) + # Explicit --satellites so this test isolates progress-store wiring from the + # default-satellite GOES-footprint filter (shard 3 is a non-GOES station). + rc = cli.main( + [ + "backfill", + "--mirror", + "gcp", + "--roster", + "kalshi,polymarket", + "--satellites", + "goes16", + "--progress-bucket", + "marker-bkt", + "--r2-bucket", + "b", + "--shard-index", + "3", + "--shard-count", + "65", + ] + ) + assert rc == 0 + assert len(captured) == 1 + # A durable GCS progress store was wired into bulk_backfill's kwargs... + assert isinstance(captured[0]["progress_store"], _FakeStore) + # ...at a shard-disjoint marker URI. + assert built == ["gs://marker-bkt/progress/shard-00003/progress.json"] + + +def test_explicit_mode_unchanged(captured, tmp_path): + """Explicit-args path stays backward compatible (no roster).""" + rc = cli.main( + [ + "backfill", + "--satellites", + "goes16", + "--products", + "ABI-L2-ACMC", + "--stations", + "KNYC", + "--year-start", + "2020", + "--year-end", + "2020", + "--out", + str(tmp_path), + ] + ) + assert rc == 0 + kw = captured[0] + assert kw["satellites"] == ["goes16"] + assert kw["stations"] == ["KNYC"] + assert kw["year_start"] == 2020 + # No --r2-target -> sink stays OFF (byte-identical to pre-28-20). 
+ assert "r2_target" not in kw + + +def test_explicit_mode_missing_required_raises(captured): + """Explicit mode still requires the core args when no --roster.""" + with pytest.raises(ValueError, match="missing required argument"): + cli.main(["backfill", "--satellites", "goes16"]) + assert captured == [] + + +def test_incremental_explicit_mode(captured, tmp_path): + """--incremental works in explicit mode too (year window override + resume).""" + from datetime import UTC, datetime, timedelta + + year = (datetime.now(UTC).date() - timedelta(days=1)).year + rc = cli.main( + [ + "backfill", + "--satellites", + "goes16", + "--products", + "ABI-L2-ACMC", + "--stations", + "KNYC", + "--out", + str(tmp_path), + "--incremental", + "yesterday", + ] + ) + assert rc == 0 + kw = captured[0] + assert kw["year_start"] == year + assert kw["year_end"] == year + assert kw["resume"] is True diff --git a/packages/weather/tests/satellite/test_roster.py b/packages/weather/tests/satellite/test_roster.py new file mode 100644 index 0000000..cca45c8 --- /dev/null +++ b/packages/weather/tests/satellite/test_roster.py @@ -0,0 +1,165 @@ +"""Roster snapshot tests (28-21). + +The committed ``SETTLEMENT_STATION_ROSTER`` MUST stay byte-equal to the live +``markets`` Kalshi/Polymarket union. These tests import the live catalogs and +assert the snapshot matches, so ANY upstream catalog drift (a station added or +removed) fails CI and forces a conscious re-snapshot in ``_roster.py`` — the +weather fleet's roster can never silently diverge from the markets truth. 
+""" + +from __future__ import annotations + +import pytest +from mostlyright.weather.satellite._roster import ( + _NON_SATELLITE_STATIONS, + ROSTERS, + SETTLEMENT_STATION_ROSTER, + resolve_roster, + shard_roster, +) + + +def _live_kalshi_stations() -> set[str]: + from mostlyright.markets.catalog.kalshi_stations import KALSHI_SETTLEMENT_STATIONS + + return {c.station for c in KALSHI_SETTLEMENT_STATIONS.values()} + + +def _live_polymarket_stations() -> set[str]: + from mostlyright.markets.polymarket import load_polymarket_city_stations + + return {icao for roles in load_polymarket_city_stations().values() for icao in roles.values()} + + +def _live_satellite_union() -> list[str]: + """The live Kalshi/Polymarket union MINUS the non-satellite stations (sorted).""" + union = _live_kalshi_stations() | _live_polymarket_stations() + return sorted(union - _NON_SATELLITE_STATIONS) + + +def test_roster_equals_live_satellite_union() -> None: + """The committed roster == live Kalshi/Polymarket union minus non-sat (drift gate).""" + assert list(SETTLEMENT_STATION_ROSTER) == _live_satellite_union() + + +def test_roster_count_is_65() -> None: + """D-28.8 / batch.tf task_count = 65 (66-station union minus the non-satellite HKO).""" + assert len(SETTLEMENT_STATION_ROSTER) == 65 + + +def test_non_satellite_stations_excluded_but_in_live_catalog() -> None: + """HKO is a REAL live settlement station but excluded (no satellite StationInfo).""" + assert "HKO" in _NON_SATELLITE_STATIONS + # It IS a live catalog station (so this is a conscious exclusion, not drift)... + assert "HKO" in _live_polymarket_stations() + # ...but it is NOT in the satellite roster (would be a zero-partition shard). 
+ for station in _NON_SATELLITE_STATIONS: + assert station not in SETTLEMENT_STATION_ROSTER + + +def test_every_roster_station_resolves_to_a_satellite_station() -> None: + """No silent empty shards: every roster station resolves to a StationInfo.""" + from mostlyright.weather.satellite._backfill import _resolve_station_infos + + for station in SETTLEMENT_STATION_ROSTER: + infos = _resolve_station_infos([station]) + assert infos, ( + f"roster station {station!r} resolves to NO satellite StationInfo (empty shard)" + ) + + +def test_every_kalshi_settlement_station_present() -> None: + """Every live Kalshi NHIGH/NLOW settlement station is in the roster (all satellite-visible).""" + for station in _live_kalshi_stations(): + assert station in SETTLEMENT_STATION_ROSTER + + +def test_every_satellite_polymarket_station_present() -> None: + """Every live Polymarket city ICAO EXCEPT the non-satellite ones is in the roster.""" + for station in _live_polymarket_stations() - _NON_SATELLITE_STATIONS: + assert station in SETTLEMENT_STATION_ROSTER + + +def test_roster_sorted_and_deduped() -> None: + """The roster is sorted (stable shard mapping) and has no duplicates.""" + assert list(SETTLEMENT_STATION_ROSTER) == sorted(SETTLEMENT_STATION_ROSTER) + assert len(SETTLEMENT_STATION_ROSTER) == len(set(SETTLEMENT_STATION_ROSTER)) + + +def test_resolve_roster_kalshi_polymarket() -> None: + """The batch.tf literal 'kalshi,polymarket' resolves to the 65 satellite stations.""" + resolved = resolve_roster("kalshi,polymarket") + assert resolved == SETTLEMENT_STATION_ROSTER + assert len(resolved) == 65 + + +def test_resolve_roster_splits_match_live_catalogs() -> None: + """The convenience split rosters match the live per-market catalogs (minus non-sat).""" + assert set(resolve_roster("kalshi")) == _live_kalshi_stations() - _NON_SATELLITE_STATIONS + assert ( + set(resolve_roster("polymarket")) == _live_polymarket_stations() - _NON_SATELLITE_STATIONS + ) + # Split rosters stay sorted slices of 
the union. + for name in ("kalshi", "polymarket"): + r = resolve_roster(name) + assert list(r) == sorted(r) + assert set(r) <= set(SETTLEMENT_STATION_ROSTER) + + +def test_resolve_roster_unknown_raises() -> None: + """An unknown roster name fails loud (before any I/O).""" + with pytest.raises(ValueError, match="unknown roster"): + resolve_roster("nope") + + +@pytest.mark.parametrize("count", [65, 8]) +def test_shard_roster_partitions_with_no_overlap_full_coverage(count: int) -> None: + """Sharding partitions the roster: no overlap, full coverage across all shards.""" + roster = SETTLEMENT_STATION_ROSTER + seen: list[str] = [] + for index in range(count): + shard = shard_roster(roster, index, count) + seen.extend(shard) + # Every station covered exactly once across the shards. + assert sorted(seen) == sorted(roster) + assert len(seen) == len(set(seen)) # no overlap + + +def test_shard_roster_full_count_gives_one_station_each() -> None: + """With count == len(roster) every shard is exactly one station (batch.tf case).""" + roster = SETTLEMENT_STATION_ROSTER + for index in range(len(roster)): + shard = shard_roster(roster, index, len(roster)) + assert len(shard) == 1 + + +def test_shard_roster_deterministic() -> None: + """Shard index maps to the SAME stations across calls (retry-stable).""" + roster = SETTLEMENT_STATION_ROSTER + assert shard_roster(roster, 0, 8) == shard_roster(roster, 0, 8) + assert shard_roster(roster, 3, 8) == shard_roster(roster, 3, 8) + + +def test_shard_roster_nonempty_when_count_le_len() -> None: + """Round-robin keeps every shard non-empty when count <= len(roster).""" + roster = SETTLEMENT_STATION_ROSTER + for count in (8, 65): + for index in range(count): + assert len(shard_roster(roster, index, count)) >= 1 + + +@pytest.mark.parametrize( + "index,count", + [(-1, 8), (8, 8), (0, 0), (1, 0), (5, 3)], +) +def test_shard_roster_invalid_index_count_raises(index: int, count: int) -> None: + """Out-of-range index or non-positive count fails 
loud.""" + with pytest.raises(ValueError): + shard_roster(SETTLEMENT_STATION_ROSTER, index, count) + + +def test_rosters_dict_names() -> None: + """The CLI roster registry exposes the batch.tf name + the cheap splits.""" + assert "kalshi,polymarket" in ROSTERS + assert "kalshi" in ROSTERS + assert "polymarket" in ROSTERS diff --git a/services/earnings/app.py b/services/earnings/app.py index acf1f9f..b6b0831 100644 --- a/services/earnings/app.py +++ b/services/earnings/app.py @@ -15,16 +15,37 @@ from __future__ import annotations +import asyncio +import contextlib +import logging import os import re +import threading from pathlib import Path +from typing import TYPE_CHECKING from fastapi import FastAPI from .deps import ServingState from .middleware.auth import API_KEY_ENV, ApiKeyAuthMiddleware from .middleware.ratelimit import TokenBucketRateLimitMiddleware -from .routes import capabilities, facts, stream, transcripts +from .routes import capabilities, facts, health, stream, transcripts + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + +_LOG = logging.getLogger("services.earnings.app") + +#: Env var naming the cross-project ``earnings-streaming`` Pub/Sub SUBSCRIPTION +#: the serving instance pulls live segments from (28-12 C2). When UNSET (the +#: default — including every test and a ledger-only serving deploy) the app runs +#: WITHOUT a subscriber: ``/stream`` 404s until the live pipeline is wired. When +#: SET, the lifespan starts the :class:`SegmentSubscriber` (H2: single instance). +STREAMING_SUBSCRIPTION_ENV = "EARNINGS_STREAMING_SUBSCRIPTION" + +#: Env var naming the ingest project that owns the earnings-streaming topic +#: (cross-project, C2). Falls back to ``GOOGLE_CLOUD_PROJECT`` (Cloud Run sets it). +INGEST_PROJECT_ENV = "EARNINGS_INGEST_PROJECT" #: Default per-client request budget + window for the public feed. Overridable #: per deploy; a burst beyond this returns 429 (T-27-27). 
class _RegistryBusAdapter:
    """Route :class:`SegmentSubscriber` republishes into the per-call bus registry.

    The subscriber expects a single ``SegmentBus``-shaped object
    (``publish(call_id, item)`` / ``close(call_id)`` returning coroutines). In the
    HOSTED serving topology there is no local capture runner to
    :meth:`BusRegistry.register` a call's bus — the cross-project Pub/Sub
    subscriber IS the bus producer. So this adapter ``get_or_create``s the call's
    bus on first message, after which the ``/stream`` route (which uses
    :meth:`BusRegistry.get`) can find it. Registry mutation is lock-guarded, so
    the StreamingPull transport thread and the serving loop do not race.
    """

    def __init__(self, registry: object) -> None:
        # Typed as ``object`` so this module does not need the registry type at
        # import time; the two methods below duck-type ``get_or_create``.
        self._registry = registry

    def publish(self, call_id: str, item: object) -> object:
        """Forward ``item`` to the bus for ``call_id``, creating the bus if needed."""
        return self._registry.get_or_create(call_id).publish(call_id, item)  # type: ignore[attr-defined]

    def close(self, call_id: str) -> object:
        """Close the bus for ``call_id`` (``get_or_create`` is used here as well)."""
        return self._registry.get_or_create(call_id).close(call_id)  # type: ignore[attr-defined]


def _parse_streaming_subscription(value: str) -> tuple[str, str]:
    """Resolve ``(project, subscription_id)`` from ``EARNINGS_STREAMING_SUBSCRIPTION``.

    The deployed infra sets this to the FULL cross-project resource path
    ``projects/PROJECT/subscriptions/SUBSCRIPTION`` (the
    ``google_pubsub_subscription.id``), so the INGEST project + bare id are parsed
    from it — the serving instance's own ``GOOGLE_CLOUD_PROJECT`` is the serving
    project, the WRONG one for this cross-project subscription. A BARE subscription
    id (manual / test) still works when ``EARNINGS_INGEST_PROJECT`` (or
    ``GOOGLE_CLOUD_PROJECT``) names the owning project.

    Args:
        value: the (already stripped) env value — a full resource path or a bare id.

    Returns:
        ``(project, subscription_id)``.

    Raises:
        RuntimeError: when ``value`` contains ``/`` but is not a well-formed
            resource path, or when it is a bare id and no project is configured.
    """
    match = re.fullmatch(r"projects/([^/]+)/subscriptions/([^/]+)", value)
    if match:
        return match.group(1), match.group(2)
    if "/" in value:
        # A slash means the operator meant a resource path and got it wrong
        # (typo, trailing slash, missing segment). Falling through to the bare-id
        # branch would pair it with the fallback project and only fail later,
        # inside the subscriber thread — fail loud at startup instead.
        raise RuntimeError(
            f"{STREAMING_SUBSCRIPTION_ENV}={value!r} is neither a full "
            "projects/PROJECT/subscriptions/SUBSCRIPTION resource path nor a bare "
            "subscription id (a bare id cannot contain '/')."
        )
    project = (
        os.environ.get(INGEST_PROJECT_ENV) or os.environ.get("GOOGLE_CLOUD_PROJECT") or ""
    ).strip()
    if not project:
        raise RuntimeError(
            f"{STREAMING_SUBSCRIPTION_ENV}={value!r} is a bare subscription id but no "
            f"ingest project is configured — provide the full "
            f"projects/PROJECT/subscriptions/SUBSCRIPTION resource path (the infra "
            f"default), or set {INGEST_PROJECT_ENV}."
        )
    return project, value


@contextlib.asynccontextmanager
async def _streaming_lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Start/stop the cross-project earnings-streaming subscriber (28-12 C2, H2).

    OPT-IN: only when ``EARNINGS_STREAMING_SUBSCRIPTION`` is set. Absent it (the
    default — every test + a ledger-only deploy) this is a no-op and app startup
    is unchanged. When set, it records the serving event loop on the bus registry
    and runs :meth:`SegmentSubscriber.consume` on a daemon thread so live
    segments/facts from the ingest pipeline reach the ``/stream`` fan-out.

    **H2 (load-bearing).** The subscriber shares ONE Pub/Sub subscription; correct
    fan-out requires EXACTLY ONE always-warm instance (``max-instances=1`` +
    affinity, pinned in ``infra/cloud_run.tf``). Teardown relies on daemon-thread
    process exit (Cloud Run always-warm single instance) — a graceful
    StreamingPull cancel is a deferred hardening seam.
    """
    subscription = os.environ.get(STREAMING_SUBSCRIPTION_ENV, "").strip()
    if not subscription:
        yield
        return
    # The deployed infra sets EARNINGS_STREAMING_SUBSCRIPTION to the FULL
    # cross-project resource path and does NOT set EARNINGS_INGEST_PROJECT — the
    # serving instance runs in the SERVING project, so GOOGLE_CLOUD_PROJECT is the
    # WRONG project for this cross-project subscription. Parse the ingest project
    # + bare id from the resource path.
    project, subscription_id = _parse_streaming_subscription(subscription)
    # Lazy-import the GCP-SDK-backed bridge factories so importing this module (and
    # the ledger-only default deploy) never needs google-cloud-pubsub.
    from .pubsub_bridge import (
        SegmentSubscriber,
        build_streaming_pull,
        make_run_coroutine_threadsafe,
    )

    registry = app.state.serving.buses
    registry.serving_loop = asyncio.get_running_loop()
    subscriber = SegmentSubscriber(
        _RegistryBusAdapter(registry),  # type: ignore[arg-type]
        run_on_loop=make_run_coroutine_threadsafe(registry.serving_loop),
    )
    streaming_pull = build_streaming_pull(project, subscription_id)

    def _run() -> None:
        try:
            subscriber.consume(streaming_pull)
        except Exception:  # pragma: no cover - transport-thread crash path
            _LOG.exception("earnings-streaming subscriber thread exited unexpectedly")

    thread = threading.Thread(target=_run, name="earnings-streaming-subscriber", daemon=True)
    thread.start()
    _LOG.info("earnings-streaming subscriber started (subscription=%s)", subscription)
    try:
        yield
    finally:
        # Daemon thread: the always-warm single instance tears down on process
        # exit (Cloud Run). A graceful StreamingPull cancel is a deferred seam.
        _LOG.info("earnings-streaming subscriber shutdown (daemon thread will exit with process)")
+ app.include_router(health.router, tags=["health"]) app.include_router(transcripts.router, tags=["transcripts"]) app.include_router(facts.router, tags=["facts"]) app.include_router(capabilities.router, tags=["capabilities"]) diff --git a/services/earnings/deps.py b/services/earnings/deps.py index 09839eb..852be0e 100644 --- a/services/earnings/deps.py +++ b/services/earnings/deps.py @@ -16,6 +16,8 @@ from mostlyright.weather.earnings.ledger import FactLedger, TranscriptLedger from mostlyright.weather.earnings.segment_bus import SegmentBus +from .r2_read import EarningsR2Reader, R2LedgerSource, r2_read_configured + #: STT tier this deployment runs (RESEARCH-MARKETS §3.4 — large-v3 hosted #: source-of-truth). Reported by /capabilities. DEFAULT_STT_TIER = "large-v3" @@ -226,10 +228,19 @@ def release_and_maybe_evict(self, call_id: str) -> None: @dataclass(slots=True) class ServingState: - """The serving app's read-side state.""" + """The serving app's read-side state. + + ``transcripts`` / ``facts`` are the READ source. In the DEPLOYED hosted topology + they are R2-backed (:class:`R2LedgerSource`) — serving runs in a SEPARATE Cloud + Run container from the ingest jobs, so it reads the durable text/fact parquet + from R2, not a container-local ledger that would be empty (Codex R7-4 P1). In + the local / on-device / test tier they are the engine's local parquet ledgers. + Both expose the same read subset the routes use (``read`` / ``read_ticker`` / + ``list_call_ids`` / ``list_tickers``). 
+ """ - transcripts: TranscriptLedger - facts: FactLedger + transcripts: TranscriptLedger | R2LedgerSource + facts: FactLedger | R2LedgerSource stt_tier: str = DEFAULT_STT_TIER buses: BusRegistry = field(default_factory=BusRegistry) @@ -237,10 +248,23 @@ class ServingState: def build( cls, ledger_root: Path | str | None = None, *, stt_tier: str | None = None ) -> ServingState: - root = Path(ledger_root) if ledger_root is not None else None + # Deployed serving reads the durable R2 corpus the ingest jobs wrote (its + # container has ONLY the R2 READ token). An EXPLICIT ledger_root (tests / + # on-device) always wins → the local ledger; otherwise, when the read + # token is present, read from R2. + transcripts: TranscriptLedger | R2LedgerSource + facts: FactLedger | R2LedgerSource + if ledger_root is None and r2_read_configured(): + reader = EarningsR2Reader() + transcripts = R2LedgerSource(reader, "transcripts") + facts = R2LedgerSource(reader, "facts") + else: + root = Path(ledger_root) if ledger_root is not None else None + transcripts = TranscriptLedger(root=root) + facts = FactLedger(root=root) return cls( - transcripts=TranscriptLedger(root=root), - facts=FactLedger(root=root), + transcripts=transcripts, + facts=facts, stt_tier=stt_tier or DEFAULT_STT_TIER, buses=BusRegistry(), ) diff --git a/services/earnings/jobs/__init__.py b/services/earnings/jobs/__init__.py new file mode 100644 index 0000000..5d36a43 --- /dev/null +++ b/services/earnings/jobs/__init__.py @@ -0,0 +1,29 @@ +"""Cloud Run Jobs entrypoints for the earnings INGEST pipeline (Phase 28, 28-13). + +These three thin ``python -m services.earnings.jobs.`` entrypoints are the +deploy-runtime scaffolding for the audio side of the firewall (the +``mr-earnings-ingest`` project). 
Each reads its job spec from environment +variables (how Cloud Run Jobs pass per-invocation config), invokes the SHIPPED +engine libraries under ``mostlyright.weather.earnings`` (never re-implementing +capture / STT / fact-building), and exits ``0`` on success / non-zero on failure +(fail loud). + +* :mod:`services.earnings.jobs.capture` — cold-fetch a webcast to TRANSIENT audio. +* :mod:`services.earnings.jobs.stt` — faster-whisper transcribe → transcript ledger. +* :mod:`services.earnings.jobs.rolefact` — role-attribute + build facts → fact ledger. + +**Audio firewall (D-27.9, legal).** Audio is a transient ingest artifact that +dies on ephemeral disk. It is NEVER a ledger column, NEVER uploaded to R2, and +NEVER served — the capture job asserts the audio path stays local and the +ledgers structurally refuse an audio-shaped field. + +**Lazy heavy imports.** ffmpeg/PyAV (``av``), faster-whisper (CTranslate2), and +the httpx-heavy capture bits are imported INSIDE ``main`` (never at module load), +so these modules import cleanly — and ``main()`` runs against injected/fake +inputs — with no audio toolchain, no GPU, and no ffmpeg present (mirrors the +sse.py / pubsub_bridge.py lazy-import discipline). +""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/services/earnings/jobs/_env.py b/services/earnings/jobs/_env.py new file mode 100644 index 0000000..b3721ec --- /dev/null +++ b/services/earnings/jobs/_env.py @@ -0,0 +1,39 @@ +"""Shared env-var helpers for the earnings Cloud Run Jobs entrypoints. + +Cloud Run Jobs inject per-invocation config as environment variables, so every +job resolves its inputs through :func:`require_env` (fail loud, naming the +missing var) or :func:`optional_env` (nullable, with a default). A missing +REQUIRED var must crash the job LOUD at startup — a silently-defaulted ticker or +audio path would mis-attribute a settlement-adjacent transcript. 
+""" + +from __future__ import annotations + +import os + +__all__ = ["optional_env", "require_env"] + + +def require_env(name: str) -> str: + """Return ``os.environ[name]`` or raise a loud config error naming the var. + + A Cloud Run Job with a required env var unset is a deploy misconfiguration; + the job must fail loud at startup rather than run against an empty/defaulted + value (which could mis-attribute a settlement-adjacent artifact). + """ + value = os.environ.get(name) + if value is None or value == "": + raise ValueError( + f"required environment variable {name!r} is unset or empty — the " + "Cloud Run Job cannot run without it (fail loud rather than default " + "a settlement-adjacent input)." + ) + return value + + +def optional_env(name: str, default: str | None = None) -> str | None: + """Return ``os.environ[name]`` when set + non-empty, else ``default``.""" + value = os.environ.get(name) + if value is None or value == "": + return default + return value diff --git a/services/earnings/jobs/capture.py b/services/earnings/jobs/capture.py new file mode 100644 index 0000000..91c828f --- /dev/null +++ b/services/earnings/jobs/capture.py @@ -0,0 +1,488 @@ +"""Earnings webcast-capture Cloud Run Job (Phase 28, 28-13). + +The AUDIO side of the firewall (``mr-earnings-ingest``). In the DEPLOYED topology +(``infra/cloud_run.tf`` ``google_cloud_run_v2_job.capture``) this job: + + 1. pulls ONE capture-job spec message off the ``CAPTURE_JOBS_SUBSCRIPTION`` + Pub/Sub subscription (the per-call ticker / call_id / webcast_url), then + 2. starts a background lease-extension loop (the capture can run 60-90 min, far + past the subscription ack deadline; without extension Pub/Sub would redeliver + the job mid-capture → duplicate captures + DLQ exhaustion), then + 3. invokes the SHIPPED capture surface + (:class:`mostlyright.weather.earnings.capture.q4.Q4CaptureAdapter`) to + cold-fetch the webcast media into an EPHEMERAL dir, then stops the lease loop, + then + 4. 
UPLOADS the transient audio to the private, in-firewall GCS handoff bucket + ``AUDIO_HANDOFF_BUCKET`` (``earnings-audio-handoff-``) so the + SEPARATE STT Cloud Run service — which does NOT share this job's ephemeral + filesystem — can fetch it, then + 5. TRIGGERS STT by POSTing the gs:// handoff URI to ``STT_SERVICE_URL/transcribe`` + (the shipped :mod:`services.earnings.jobs.stt_server` surface accepts a gs:// + ref), then + 6. acks the message ONLY after STT was successfully triggered (2xx). On a trigger + failure the message is NOT acked and Pub/Sub redelivers — re-upload to the + same handoff key is idempotent, so no audio is orphaned (captured but never + transcribed). + +**Why the GCS handoff (not a shared local path).** capture and STT are separate +Cloud Run resources with NO shared disk. The audio therefore crosses the two +firewalled stages via a private GCS object in ``AUDIO_HANDOFF_BUCKET`` — an +in-region, in-firewall bucket. It is NEVER an R2 key and NEVER served: only STT +(inside the firewall) reads it, transcribes, and deletes it. The audio bytes +never reach the ledger, the wire, or R2 (D-27.9). + +**Trigger seam (hardening).** The synchronous HTTP trigger to STT is the MVP that +closes capture→STT end-to-end. A decoupled trigger — an ``stt-jobs`` Pub/Sub topic +capture publishes to, or a GCS object-finalize notification on the handoff bucket +fanning out to STT — is a hardening seam that removes capture's dependency on STT +being reachable at that instant. Not wired here (MVP first). + +**Audio firewall (D-27.9, legal).** The captured audio is a TRANSIENT artifact. +This job asserts ``is_transient`` and that the local path stays under the capture +dir before the handoff. The handoff target is the private GCS bucket the infra +provides — audio never gets an R2 key and is never a ledger column. 
+ +**Env contract (DEPLOYED / infra path):** + +* ``CAPTURE_JOBS_SUBSCRIPTION`` (required) — the Pub/Sub subscription id carrying + the per-call capture-job spec (``ticker`` / ``call_id`` / + ``webcast_url``/``media_url``). One message is pulled + acked per run. +* ``AUDIO_HANDOFF_BUCKET`` (required) — the private GCS bucket the transient + audio is uploaded to for the cross-service handoff to STT. +* ``STT_SERVICE_URL`` (required in the deployed subscription path) — the + base URL of the STT Cloud Run service; capture POSTs ``STT_SERVICE_URL/transcribe`` + to trigger transcription. UNSET in the deployed path is a FAIL-LOUD deploy + misconfiguration (raises, non-zero exit, message NOT acked) rather than silently + orphaning the uploaded audio. +* ``CAPTURE_OUT_DIR`` (optional) — the ephemeral dir the transient audio + is written under (default: a fresh ``tempfile`` dir, still ephemeral). + +**Operator-override / manual single-call path.** For a manual single-call run +(no subscription), the per-call spec may instead be supplied directly via env — +``CAPTURE_TICKER`` / ``CAPTURE_CALL_ID`` / ``CAPTURE_WEBCAST_URL`` — which takes +precedence over the subscription pull. This is the operator-gated override (no +lease loop, no ack, and the STT trigger is skipped when no handoff bucket is set); +the DEFAULT deployed path reads ``CAPTURE_JOBS_SUBSCRIPTION`` + +``AUDIO_HANDOFF_BUCKET`` + ``STT_SERVICE_URL``. The handoff upload is skipped ONLY +when ``AUDIO_HANDOFF_BUCKET`` is unset (a bare local operator run), and then the +transient path is emitted on stdout instead. + +**Lazy imports.** ffmpeg/PyAV (``av``) and httpx are pulled in only by the shipped +capture surface (lazy inside its own methods). 
``google-cloud-pubsub`` / +``google-cloud-storage`` are lazy-imported inside :func:`_pull_capture_job` / +:func:`_upload_handoff`, and ``httpx`` inside :func:`_trigger_stt` — so this module +and its ``main`` import nothing heavy at module load (the tests stub the capture +surface + fake pubsub / GCS / httpx). + +**Live IVS capture seam (27-09, OPERATOR-GATED).** The live-during-call path +(:meth:`CaptureAdapter.live` → the Amazon-IVS HLS edge for a real in-progress +call) is OPERATOR-GATED and NOT driven here — this job is the VOD/replay +cold-fetch. WHO triggers a live capture job (a scheduler watching the earnings +calendar vs. an operator hand-enqueuing a capture-job message) is the +operator-gated orchestration decision; the streaming STT hand-off (27-10) follows +the HLS edge and is intentionally out of this deploy-runtime scaffolding. +""" + +from __future__ import annotations + +import contextlib +import json +import logging +import os +import tempfile +import threading + +from services.earnings.jobs._env import optional_env, require_env + +_LOG = logging.getLogger("services.earnings.jobs.capture") + +#: The Pub/Sub ack-deadline (seconds) the lease loop resets the message to on each +#: extension. Pub/Sub caps modify_ack_deadline at 600s; a 60-90 min capture needs +#: many extensions, so we re-lease at 600 and refresh well before it lapses. +_LEASE_DEADLINE_SECONDS = 600 +#: Refresh at 70% of the deadline so a slow extension still lands before expiry. +_LEASE_REFRESH_FRACTION = 0.7 + +#: HTTP timeout (seconds) for the SYNCHRONOUS capture->STT ``/transcribe`` trigger. +#: ``/transcribe`` transcribes the WHOLE call before it responds, so the trigger +#: must wait out a real (multi-minute → up-to-~an-hour) GPU transcription — NOT a +#: 60s guess. A 60s cap made every real earnings call time out mid-transcription → +#: capture raised → the message was NOT acked → Pub/Sub redelivered → duplicate +#: recapture while STT was still running (Codex R7 P1). 
#: Default 3600s = the Cloud Run max request timeout the STT service is pinned to
#: (``infra/cloud_run.tf``); override with ``STT_TRIGGER_TIMEOUT_SECONDS``. A call
#: whose transcription exceeds this needs the decoupled (fire-and-forget) trigger
#: seam documented above.
_DEFAULT_STT_TRIGGER_TIMEOUT_SECONDS = 3600.0


def _stt_trigger_timeout() -> float:
    """Resolve the capture->STT trigger HTTP timeout (seconds).

    Defaults to :data:`_DEFAULT_STT_TRIGGER_TIMEOUT_SECONDS` (cover a full real
    transcription); ``STT_TRIGGER_TIMEOUT_SECONDS`` overrides it.

    Returns:
        The timeout in seconds: a finite float strictly greater than zero.

    Raises:
        ValueError: the override is not a number, is ``nan`` / ``inf``, or is not
            strictly positive. A bad timeout fails loud here (a bad timeout that
            silently fell back to a short default would reintroduce the
            mid-transcription-timeout bug).
    """
    import math

    raw = optional_env("STT_TRIGGER_TIMEOUT_SECONDS")
    if not raw:
        return _DEFAULT_STT_TRIGGER_TIMEOUT_SECONDS
    try:
        value = float(raw)
    except ValueError as exc:
        raise ValueError(f"STT_TRIGGER_TIMEOUT_SECONDS={raw!r} is not a number") from exc
    # float() happily parses "nan" / "inf" / "infinity"; neither compares <= 0, so
    # without this guard they would be handed to the HTTP client as a timeout.
    if not math.isfinite(value):
        raise ValueError(f"STT_TRIGGER_TIMEOUT_SECONDS must be a finite number; got {raw!r}")
    if value <= 0:
        raise ValueError(f"STT_TRIGGER_TIMEOUT_SECONDS must be > 0; got {value}")
    return value


def _assert_audio_local(audio_path: str, out_dir: str) -> None:
    """Fail loud if the captured audio is not a local file under ``out_dir`` (D-27.9).

    The captured audio must stay a transient artifact on ephemeral disk — it must
    NOT be a remote URL / object-store key and must live under the capture dir.
    Anything else means audio has (or could) escape the firewall.

    Both paths are resolved with ``os.path.realpath`` first, so ``..`` segments and
    symlinks cannot be used to point outside the capture dir.

    Args:
        audio_path: the path the capture surface reported for the audio file.
        out_dir: the capture directory the audio must live strictly inside.

    Raises:
        RuntimeError: ``audio_path`` resolves to ``out_dir`` itself or to anything
            outside it.
    """
    real_out = os.path.realpath(out_dir)
    real_audio = os.path.realpath(audio_path)
    try:
        # Component-wise containment: unlike a string-prefix test this is correct
        # when ``out_dir`` is the filesystem root (where ``real_out + os.sep`` would
        # double the separator and reject every file) and never confuses a sibling
        # such as ``cap-evil`` with ``cap``.
        contained = (
            real_audio != real_out
            and os.path.commonpath([real_out, real_audio]) == real_out
        )
    except ValueError:
        # commonpath refuses paths that cannot share a root (e.g. different drives).
        contained = False
    if not contained:
        raise RuntimeError(
            f"captured audio {audio_path!r} is not under the local ephemeral "
            f"capture dir {out_dir!r} — audio must stay a transient local artifact "
            "and NEVER be uploaded or served (D-27.9)."
        )
+ + Wraps the ``SubscriberClient`` + ``(subscription, ack_id)`` so ``main`` can + (a) hold the lease during the whole capture via a background daemon thread and + (b) ack ONLY after the full capture→handoff→STT-trigger succeeds. The + operator-override path uses :class:`_NoopHandle` instead (no message to lease). + """ + + def __init__(self, client: object, subscription: str, ack_id: str) -> None: + self._client = client + self._subscription = subscription + self._ack_id = ack_id + + def ack(self) -> None: + self._client.acknowledge(subscription=self._subscription, ack_ids=[self._ack_id]) + + def _extend_once(self) -> None: + self._client.modify_ack_deadline( + subscription=self._subscription, + ack_ids=[self._ack_id], + ack_deadline_seconds=_LEASE_DEADLINE_SECONDS, + ) + + @contextlib.contextmanager + def hold_lease(self): + """Context manager: keep the message leased for its whole body. + + A daemon thread re-leases the message to ``_LEASE_DEADLINE_SECONDS`` every + ``deadline * _LEASE_REFRESH_FRACTION`` seconds so a 60-90 min capture never + lets the ack deadline lapse (which would trigger redelivery + a duplicate + capture). The thread is stopped BEFORE the caller acks/nacks. The first + extension is issued immediately so a slow capture is protected from second 0. + """ + stop = threading.Event() + interval = _LEASE_DEADLINE_SECONDS * _LEASE_REFRESH_FRACTION + + def _loop() -> None: + # Extend immediately, then every ``interval`` until stopped. Any + # transient extension error is logged but must not kill the capture — + # a lapse only risks a duplicate (idempotent) capture, never data loss. 
class _NoopHandle:
    """The operator-override message handle: no lease, no ack (no Pub/Sub message)."""

    def ack(self) -> None:
        """Do nothing: there is no Pub/Sub message to acknowledge."""
        return None

    @contextlib.contextmanager
    def hold_lease(self):
        """Run the body without any lease extension (nothing is leased)."""
        yield


def _decode_capture_spec(data: bytes, subscription: str) -> dict[str, str]:
    """Decode and validate one capture-job payload into the normalized spec.

    Args:
        data: the raw Pub/Sub message body (expected: a UTF-8 JSON object).
        subscription: the subscription id, used only in error messages.

    Returns:
        ``{"ticker", "call_id", "webcast_url"}``; ``webcast_url`` falls back to the
        payload's ``media_url`` when ``webcast_url`` is absent or empty.

    Raises:
        RuntimeError: the payload is not UTF-8 JSON, is not a JSON object, or is
            missing a required field.
    """
    try:
        spec = json.loads(data.decode("utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError) as exc:
        raise RuntimeError(
            f"capture-job message on {subscription!r} is not valid UTF-8 JSON: {exc}"
        ) from exc
    if not isinstance(spec, dict):
        # A JSON array / string / number has no ``.get`` and would otherwise surface
        # as an AttributeError with no hint about which message was bad.
        raise RuntimeError(
            f"capture-job message on {subscription!r} must be a JSON object; "
            f"got {type(spec).__name__}."
        )
    ticker = spec.get("ticker")
    call_id = spec.get("call_id")
    webcast_url = spec.get("webcast_url") or spec.get("media_url")
    missing = [
        k
        for k, v in (("ticker", ticker), ("call_id", call_id), ("webcast_url", webcast_url))
        if not v
    ]
    if missing:
        raise RuntimeError(
            f"capture-job message on {subscription!r} is missing required field(s) "
            f"{missing}; cannot capture a settlement-adjacent call from a partial spec."
        )
    return {"ticker": ticker, "call_id": call_id, "webcast_url": webcast_url}


def _pull_capture_job(subscription: str) -> tuple[dict[str, str], _MessageHandle]:
    """Pull ONE capture-job spec message off the subscription; return (spec, handle).

    Returns the decoded per-call spec (``ticker`` / ``call_id`` /
    ``webcast_url``/``media_url``) plus a :class:`_MessageHandle` the caller uses to
    hold the lease during the capture and ack AFTER the full pipeline (capture →
    handoff → STT trigger) succeeds (so a crash mid-capture leaves the message
    un-acked and the job is retried). ``google-cloud-pubsub`` is lazy-imported here
    (never at module load).

    Raises:
        RuntimeError: no message is available on the subscription, or the message
            payload is malformed (not UTF-8 JSON, not a JSON object) or is missing
            a required field. The message is left un-acked in every failure case.
    """
    from google.cloud import pubsub_v1

    client = pubsub_v1.SubscriberClient()
    # NOTE(review): ``return_immediately`` is marked deprecated in the Pub/Sub Pull
    # API (it can report an empty pull while messages are still pending) — confirm
    # before treating the empty-pull failure below as proof the queue is drained.
    response = client.pull(subscription=subscription, max_messages=1, return_immediately=True)
    received = list(response.received_messages)
    if not received:
        raise RuntimeError(
            f"no capture-job message available on subscription {subscription!r} — "
            "nothing to capture (fail loud rather than silently no-op)."
        )
    msg = received[0]
    spec = _decode_capture_spec(msg.message.data, subscription)

    handle = _MessageHandle(client, subscription, msg.ack_id)
    return spec, handle


def _upload_handoff(audio_path: str, bucket: str, *, ticker: str, call_id: str) -> str:
    """Upload the transient audio to the private GCS handoff bucket; return the gs:// uri.

    The cross-service handoff to STT (capture and STT do NOT share a disk). The
    object key is ``handoff/{ticker}/{call_id}{ext}``, where ``ext`` is the audio
    file's own extension (``.audio`` when it has none); it lands in the private,
    in-firewall ``AUDIO_HANDOFF_BUCKET`` — NOT R2, NEVER served.
    ``google-cloud-storage`` is lazy-imported here.

    Args:
        audio_path: local path of the captured audio file.
        bucket: name of the handoff bucket.
        ticker: market ticker (first key component).
        call_id: provider call id (second key component).

    Returns:
        The ``gs://bucket/key`` URI of the uploaded object.
    """
    from google.cloud import storage

    ext = os.path.splitext(audio_path)[1] or ".audio"
    blob_name = f"handoff/{ticker}/{call_id}{ext}"
    client = storage.Client()
    blob = client.bucket(bucket).blob(blob_name)
    blob.upload_from_filename(audio_path)
    uri = f"gs://{bucket}/{blob_name}"
    _LOG.info("capture handoff: uploaded transient audio to %s (%s/%s)", uri, ticker, call_id)
    return uri
``httpx`` is lazy-imported here. + + The caller acks the Pub/Sub message ONLY after this returns (2xx). A non-2xx or + connection error raises, so ``main`` leaves the message un-acked → Pub/Sub + redelivers → capture re-runs (re-upload to the same handoff key is idempotent), + never orphaning the audio. + + Raises: + RuntimeError: ``STT_SERVICE_URL`` is unset (deploy misconfiguration — fail + loud rather than upload audio nobody will transcribe). + """ + stt_url = optional_env("STT_SERVICE_URL") + if not stt_url: + raise RuntimeError( + "STT_SERVICE_URL is unset in the deployed subscription path — capture " + "uploaded the audio handoff but cannot trigger STT, which would ORPHAN " + "the audio (captured, never transcribed). Set STT_SERVICE_URL to the STT " + "Cloud Run service base URL (fail loud rather than silently orphan)." + ) + + import httpx + + # Service-to-service auth contract: the STT Cloud Run service is PRIVATE (no + # public invoker), so a bare POST would 403. The capture SA holds + # roles/run.invoker on STT (infra/deploy_iam.tf), and Cloud Run authenticates + # by verifying a Google-signed ID token whose AUDIENCE is the receiving + # service's URL. So mint an ID token with audience = STT_SERVICE_URL and send + # it as `Authorization: Bearer `. + try: + import google.auth.transport.requests + from google.oauth2 import id_token as _id_token + + token = _id_token.fetch_id_token(google.auth.transport.requests.Request(), stt_url) + headers = {"Authorization": f"Bearer {token}"} + except Exception: + # No metadata server (local/test) or token mint failed — POST without a + # token. On Cloud Run the metadata server is always present; a mint failure + # there is a real misconfig that STT will reject 401/403, so we still POST + # (don't silently succeed) and let the non-2xx raise + NACK for redelivery. 
def _resolve_spec() -> tuple[dict[str, str], _MessageHandle | _NoopHandle]:
    """Resolve the per-call capture spec + a message handle.

    Operator-override precedence: if ``CAPTURE_TICKER`` is set, read the whole spec
    from env (the manual single-call path) with a :class:`_NoopHandle` (no lease,
    no ack). Otherwise pull ONE message off ``CAPTURE_JOBS_SUBSCRIPTION`` (the
    DEFAULT deployed path) and return its lease-capable :class:`_MessageHandle`.

    Returns:
        ``(spec, handle)`` where ``spec`` has ``ticker`` / ``call_id`` /
        ``webcast_url`` keys.
    """
    if optional_env("CAPTURE_TICKER"):
        spec = {
            "ticker": require_env("CAPTURE_TICKER"),
            "call_id": require_env("CAPTURE_CALL_ID"),
            "webcast_url": require_env("CAPTURE_WEBCAST_URL"),
        }
        _LOG.info("capture: using operator-override env spec (manual single-call path)")
        return spec, _NoopHandle()

    subscription = require_env("CAPTURE_JOBS_SUBSCRIPTION")
    return _pull_capture_job(subscription)


def main(argv: list[str] | None = None) -> int:
    """Pull a capture job, cold-fetch the webcast media, hand the audio to STT via GCS.

    The DEFAULT deployed path pulls the per-call spec off
    ``CAPTURE_JOBS_SUBSCRIPTION``, runs the shipped Q4 capture, uploads the
    transient audio to ``AUDIO_HANDOFF_BUCKET`` (the cross-service handoff),
    triggers STT synchronously, and only then acks the message. A manual
    single-call run may instead supply the spec via ``CAPTURE_TICKER`` /
    ``CAPTURE_CALL_ID`` / ``CAPTURE_WEBCAST_URL``.

    All handoff configuration (``AUDIO_HANDOFF_BUCKET``, ``STT_SERVICE_URL``,
    ``STT_TRIGGER_TIMEOUT_SECONDS``) is validated BEFORE the capture starts, so a
    deploy misconfiguration costs seconds instead of a full capture plus an
    uploaded-but-never-transcribed audio object.

    Args:
        argv: accepted for entry-point symmetry; not read.

    Returns:
        ``0`` on success. Any failure (missing env, no message, SSRF-rejected URL,
        no HTTP media, extract failure, upload / STT-trigger failure) propagates as
        an exception — a non-zero exit — and leaves the message un-acked.
    """
    logging.basicConfig(level=logging.INFO)

    spec, handle = _resolve_spec()
    ticker = spec["ticker"]
    call_id = spec["call_id"]
    webcast_url = spec["webcast_url"]

    handoff_bucket = optional_env("AUDIO_HANDOFF_BUCKET")

    # A SUBSCRIPTION-pulled capture (real _MessageHandle) MUST have a handoff bucket
    # to hand the audio to STT. Without it the run would capture, take the local
    # branch, print the ephemeral path, and ACK the message — deleting the only
    # audio copy on job exit while Pub/Sub never redelivers → SILENT settlement-audio
    # loss (Codex R7-5 P1). Fail loud EARLY (before the expensive capture), NOT
    # acked, so the message redelivers once AUDIO_HANDOFF_BUCKET is fixed. The
    # no-bucket path is legitimate ONLY for the bare-local operator override
    # (_NoopHandle: no Pub/Sub message, ack is a no-op).
    if not handoff_bucket and not isinstance(handle, _NoopHandle):
        raise RuntimeError(
            "AUDIO_HANDOFF_BUCKET is unset but this capture was pulled from "
            "CAPTURE_JOBS_SUBSCRIPTION — the deployed path MUST hand the transient "
            "audio to STT via the handoff bucket. Refusing to run + ack (which would "
            "ORPHAN the captured audio with no redelivery). Set AUDIO_HANDOFF_BUCKET."
        )

    if handoff_bucket:
        # Same fail-early rule for the STT trigger config: _trigger_stt would reject
        # an unset STT_SERVICE_URL (and _stt_trigger_timeout a bad timeout) only
        # AFTER the 60-90 min capture and the upload, leaving an orphaned handoff
        # object and repeating the whole capture on every redelivery.
        if not optional_env("STT_SERVICE_URL"):
            raise RuntimeError(
                "STT_SERVICE_URL is unset but AUDIO_HANDOFF_BUCKET is set — capture "
                "would upload the audio handoff and then be unable to trigger STT, "
                "ORPHANING the audio (captured, never transcribed). Refusing to start "
                "the capture. Set STT_SERVICE_URL to the STT Cloud Run service base URL."
            )
        _stt_trigger_timeout()  # raises ValueError on a malformed override

    out_dir = optional_env("CAPTURE_OUT_DIR") or tempfile.mkdtemp(prefix="earnings-capture-")
    os.makedirs(out_dir, exist_ok=True)

    # Lazy import: the shipped capture surface lazy-imports httpx/PyAV inside its
    # own methods, so nothing heavy loads at module import (keeps this entrypoint
    # importable with no audio toolchain).
    from mostlyright.weather.earnings.capture.q4 import Q4CaptureAdapter

    _LOG.info(
        "capture job start: ticker=%s call_id=%s out_dir=%s handoff_bucket=%s",
        ticker,
        call_id,
        out_dir,
        handoff_bucket,
    )

    adapter = Q4CaptureAdapter()
    event = {"ticker": ticker, "call_id": call_id, "media_url": webcast_url}

    # Hold the Pub/Sub lease for the WHOLE pipeline — the capture (60-90 min) AND
    # the SYNCHRONOUS STT handoff+trigger, which blocks until STT transcribes the
    # full call. The message is therefore never redelivered mid-capture or
    # mid-transcription (Codex R7 P1), and it is acked ONLY after the whole
    # pipeline succeeds, BELOW the leased block.
    handoff_uri: str | None = None
    with handle.hold_lease():
        artifact = adapter.capture(event, tmp_dir=out_dir)

        if not artifact.is_transient:
            raise RuntimeError(
                f"capture returned a NON-transient artifact for {ticker}/{call_id} — "
                "captured earnings audio must always be transient (D-27.9)."
            )
        _assert_audio_local(artifact.audio_path, out_dir)

        if handoff_bucket:
            # DEFAULT deployed path: hand the transient audio to STT via the private
            # in-firewall GCS bucket and TRIGGER STT. If the trigger fails we do NOT
            # ack — Pub/Sub redelivers and capture re-runs (re-upload to the same
            # handoff key is idempotent), never orphaning the audio.
            handoff_uri = _upload_handoff(
                artifact.audio_path, handoff_bucket, ticker=ticker, call_id=call_id
            )
            _trigger_stt(handoff_uri, ticker=ticker, call_id=call_id)

    if handoff_bucket:
        _LOG.info(
            "capture job done: ticker=%s call_id=%s handoff=%s source=%s",
            artifact.ticker,
            artifact.call_id,
            handoff_uri,
            artifact.source_media_url,
        )
        # The gs:// handoff URI is the STT reference — the transient local file
        # dies with the ephemeral task; only STT (in-firewall) reads the object.
        print(handoff_uri)
    else:
        # Bare local operator run (no handoff bucket): emit the transient path for
        # a co-located STT run off the same disk. Audio is still never served and
        # no STT trigger is sent.
        _LOG.info(
            "capture job done (local, no handoff bucket): ticker=%s call_id=%s transient_audio=%s",
            artifact.ticker,
            artifact.call_id,
            artifact.audio_path,
        )
        print(artifact.audio_path)

    # Single ack point, reached only when everything above succeeded. On the
    # deployed path this is "STT trigger returned 2xx"; on the no-bucket path the
    # early guard guarantees ``handle`` is a _NoopHandle, whose ack does nothing.
    handle.ack()

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
Reads the +persisted transcript for a call from the +:class:`~mostlyright.weather.earnings.ledger.TranscriptLedger`, role-attributes +the turns via :class:`~mostlyright.weather.earnings.role_parser.RoleParser`, +counts per-term mentions with +:func:`~mostlyright.weather.earnings.stt.classify_mentions`, assembles +``schema.earnings_fact.v1`` rows via +:func:`~mostlyright.weather.earnings.fact_builder.build_fact_rows` (which applies +the fail-closed Kalshi filter), and writes them to the +:class:`~mostlyright.weather.earnings.ledger.FactLedger`. It then OPTIONALLY +uploads the derived fact parquet to Cloudflare R2 via the shipped write sink. + +**Env contract:** + +* ``ROLEFACT_TICKER`` (required) — the market ticker (fact/transcript partition). +* ``ROLEFACT_CALL_ID`` (required) — the provider call id. +* ``ROLEFACT_TERMS`` (required) — a JSON array of market-term specs, each with + at least ``term_canonical`` (the counted term). Optional per-term keys: + ``term_match_rule``, ``counting_mode``, ``threshold_n``, ``window_scope``, + ``term_accepted_forms``. +* ``ROLEFACT_ROSTER`` (optional) — a JSON array of ``[speaker_name, label]`` + roster pairs anchoring exec identity for the fail-closed Kalshi rule. +* ``MOSTLYRIGHT_CACHE_DIR`` (optional) — the ledger cache root. +* R2 upload (opt-in; the upload bucket enables it): ``R2_BUCKET`` (the infra env + name — ``infra/cloud_run.tf`` ``google_cloud_run_v2_job.rolefact`` sets it), + with ``ROLEFACT_R2_BUCKET`` accepted as a fallback/override for a manual run, + plus the write-token creds ``R2_ACCOUNT_ID`` / ``R2_WRITE_ACCESS_KEY_ID`` / + ``R2_WRITE_SECRET_ACCESS_KEY`` (read from the env by NAME by the shipped sink). + +**Audio firewall (D-27.9).** This stage is entirely post-audio — it reads TEXT +from the transcript ledger and writes DERIVED FACTS. There is no audio anywhere in +this job; ffmpeg / whisper / chromium are absent from its image. 
def _transcript_text(rows: Sequence[Mapping[str, object]]) -> str:
    """Join the transcript-ledger rows' ``text`` in segment order into one string.

    Rows are ordered by :func:`_segment_index` (a stable sort, so rows sharing an
    index keep their incoming order); rows with a missing or empty ``text`` are
    dropped and the remainder joined with newlines.
    """
    ordered = sorted(rows, key=_segment_index)
    return "\n".join(str(r.get("text", "")) for r in ordered if r.get("text"))


def _segment_index(row: Mapping[str, object]) -> int:
    """Return the row's ``segment_index`` as an ``int``, or ``0`` if it has none.

    Any integer-like value (anything implementing ``__index__``, which covers
    ``int`` and numpy-style integer scalars) is accepted; a missing key, ``None``,
    a float or a string all fall back to ``0``. A strict ``isinstance(..., int)``
    test would send every non-``int`` integer to ``0`` and silently leave the
    transcript in read order.
    """
    import operator

    try:
        return operator.index(row.get("segment_index"))
    except TypeError:
        return 0


def _build_stt_counts(
    transcript: str,
    market_terms: Sequence[Mapping[str, object]],
) -> list[dict[str, object]]:
    """Run ``classify_mentions`` per market term → per-occurrence stt_count records.

    ``classify_mentions`` (NOT ``count_mentions``) is the production counter (D-30):
    it emits one record per occurrence carrying ``compound_type``, which the fact
    builder needs to split a row per ``(term, compound_type)``. Each record is
    stamped with the term + a ``mention_count`` of 1 (one record per occurrence).

    A term spec whose ``term_canonical`` is missing, ``null`` or empty is skipped;
    a missing or ``null`` ``term_match_rule`` uses the
    ``plural_possessive_ok_no_tense`` default.
    """
    from mostlyright.weather.earnings.stt import classify_mentions

    counts: list[dict[str, object]] = []
    for spec in market_terms:
        raw_term = spec.get("term_canonical")
        # A JSON ``null`` must not be stringified into the literal term "None".
        term = "" if raw_term is None else str(raw_term)
        if not term:
            continue
        raw_rule = spec.get("term_match_rule")
        match_rule = "plural_possessive_ok_no_tense" if raw_rule is None else str(raw_rule)
        counts.extend(
            {
                "term": term,
                "matched_surface_form": occ.get("surface", term),
                "mention_count": 1,
                "compound_type": occ.get("compound_type", "standalone"),
                # No turn_index linkage in the batch post-call path — the builder
                # then treats the occurrence as un-anchorable (diarization_advisory)
                # unless a roster-anchored role parser fills turns.
            }
            for occ in classify_mentions(transcript, term, match_rule=match_rule)
        )
    return counts
+ transcript_ledger = TranscriptLedger() + _maybe_download_transcript_r2(transcript_ledger, ticker=ticker, call_id=call_id) + transcript_rows = transcript_ledger.read(ticker, call_id) + if not transcript_rows: + raise RuntimeError( + f"no persisted transcript for {ticker}/{call_id} — the STT job must run " + "(and publish its transcript to R2, or write it to this container's " + "cache) before rolefact (fail loud rather than build zero facts)." + ) + transcript = _transcript_text(transcript_rows) + + # Role-attribute the turns (fail-closed Kalshi rule anchors exec identity to the + # roster). The batch stt_counts carry no turn_index, so the builder scopes each + # occurrence to diarization_advisory (Kalshi-excluded, Polymarket-retained) — + # the conservative default; a full turn↔occurrence join is the operator-gated + # live role-attribution seam (27-04). + from mostlyright.weather.earnings.role_parser import RoleParser + + turns = RoleParser(roster).attribute_turns(transcript) + _LOG.info("rolefact attributed %d turns for %s/%s", len(turns), ticker, call_id) + + from mostlyright.weather.earnings.fact_builder import build_fact_rows + + stt_counts = _build_stt_counts(transcript, market_terms) + fact_rows = build_fact_rows( + stt_counts, + turns, + market_terms, + ticker=ticker, + call_id=call_id, + ) + _LOG.info("rolefact built %d fact rows for %s/%s", len(fact_rows), ticker, call_id) + + fact_ledger = FactLedger() + # Idempotent REPLACE (not append): rolefact rebuilds the COMPLETE fact set for + # the call from the whole transcript, so a retry / redelivery must OVERWRITE — + # appending would double every fact row (Codex R7-2 P1). An empty fact set + # (zero-mention call) removes any stale partition; the R2-upload guard below + # then correctly skips (no parquet to upload). 
def _parse_terms(raw: str) -> list[dict[str, object]]:
    """Parse ``ROLEFACT_TERMS`` (a JSON array of term specs); fail loud if malformed.

    Args:
        raw: the env value — a JSON array whose entries are JSON objects.

    Returns:
        A list of fresh ``dict`` copies of the term specs, in input order.

    Raises:
        ValueError: ``raw`` is not valid JSON, is not a non-empty array, or holds
            an entry that is not a JSON object.
    """
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise ValueError(f"ROLEFACT_TERMS is not valid JSON: {exc}") from exc
    if not isinstance(parsed, list) or not parsed:
        raise ValueError(
            "ROLEFACT_TERMS must be a non-empty JSON array of term specs (each with "
            "at least a 'term_canonical'); an empty term set would build zero facts."
        )
    terms: list[dict[str, object]] = []
    for position, spec in enumerate(parsed):
        # ``dict(spec)`` alone would silently accept a list of pairs and raise an
        # unrelated-looking error for a string or number; name the bad entry.
        if not isinstance(spec, dict):
            raise ValueError(
                f"ROLEFACT_TERMS entry {position} ({spec!r}) is not a JSON object term spec."
            )
        terms.append(dict(spec))
    return terms


def _parse_roster(raw: str | None) -> list[tuple[str, str]]:
    """Parse ``ROLEFACT_ROSTER`` (a JSON array of ``[name, label]`` pairs) or ``[]``.

    Args:
        raw: the env value, or ``None`` / empty when no roster is configured.

    Returns:
        ``(name, label)`` string tuples in input order; ``[]`` when ``raw`` is unset.

    Raises:
        ValueError: ``raw`` is not valid JSON, is not a JSON array, or holds an
            entry that is not a two-element pair.
    """
    if not raw:
        return []
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise ValueError(f"ROLEFACT_ROSTER is not valid JSON: {exc}") from exc
    if not isinstance(parsed, list):
        # Iterating a non-array (null, number, object, string) would raise a
        # TypeError or a misleading per-entry message.
        raise ValueError(
            "ROLEFACT_ROSTER must be a JSON array of [name, label] pairs; "
            f"got {type(parsed).__name__}."
        )
    roster: list[tuple[str, str]] = []
    for entry in parsed:
        if isinstance(entry, (list, tuple)) and len(entry) == 2:
            roster.append((str(entry[0]), str(entry[1])))
        else:
            raise ValueError(f"ROLEFACT_ROSTER entry {entry!r} is not a [name, label] pair.")
    return roster
def _facts_r2_target(ticker: str, call_id: str) -> tuple[str, str] | None:
    """Return ``(bucket, key)`` for the call's R2 fact object, or ``None`` if no bucket.

    ``ROLEFACT_R2_BUCKET`` (the manual-run override) is consulted first and
    ``R2_BUCKET`` (the infra env name) second; the key is
    ``earnings/facts/{ticker}/{call_id}.parquet``.
    """
    r2_bucket = optional_env("ROLEFACT_R2_BUCKET") or optional_env("R2_BUCKET")
    if not r2_bucket:
        return None
    return r2_bucket, f"earnings/facts/{ticker}/{call_id}.parquet"


def _maybe_upload_r2(fact_path: str, *, ticker: str, call_id: str) -> None:
    """Upload the derived fact parquet to R2 when an upload bucket is configured.

    Opt-in: with neither ``ROLEFACT_R2_BUCKET`` nor ``R2_BUCKET`` set this returns
    without importing or calling the sink. Otherwise the file at ``fact_path`` is
    handed to the shipped write sink (lazy-imported here) under the call's fact
    key. ONLY the derived FACT parquet (text/facts, never audio) is uploaded
    (D-27.9). A sink failure propagates to the caller.
    """
    target = _facts_r2_target(ticker, call_id)
    if target is None:
        return
    r2_bucket, object_key = target

    from mostlyright.weather.satellite._r2_sink import upload

    stored_key = upload(fact_path, r2_bucket, object_key, r2_target=r2_bucket)
    _LOG.info(
        "rolefact uploaded derived facts to R2: bucket=%s key=%s (%s/%s)",
        r2_bucket,
        stored_key,
        ticker,
        call_id,
    )


def _maybe_delete_r2_facts(*, ticker: str, call_id: str) -> None:
    """Delete the call's R2 fact object after a rerun that produced ZERO facts.

    A zero-fact rerun clears the LOCAL partition, but an earlier nonzero run may
    have uploaded the call's fact parquet to R2; removing that object keeps the
    durable store from holding facts the replace meant to clear (Codex R7-3 P2).
    Opt-in on the same bucket resolution as the upload: with no bucket configured
    this returns without touching the sink.

    The sink call is deliberately NOT wrapped in a ``try``: any exception it
    raises propagates so the job fails and is retried, rather than exiting
    "success" with the stale object still in place (Codex R7-4 P2).
    """
    target = _facts_r2_target(ticker, call_id)
    if target is None:
        return
    r2_bucket, object_key = target

    from mostlyright.weather.satellite._r2_sink import delete

    delete(r2_bucket, object_key, r2_target=r2_bucket)
    _LOG.info(
        "rolefact: deleted stale R2 fact object bucket=%s key=%s (zero-row replace) for %s/%s",
        r2_bucket,
        object_key,
        ticker,
        call_id,
    )


if __name__ == "__main__":
    raise SystemExit(main())
+* ``STT_CALL_ID`` (required) — the provider call id (ledger partition key). +* ``STT_TIER`` (optional) — model size (default ``large-v3``; on-device + floor is ``small``). +* ``STT_DEVICE`` (optional) — faster-whisper device (default ``cuda`` on the + L4 GPU image; ``cpu`` for a CPU fallback). +* ``STT_COMPUTE_TYPE`` (optional) — CTranslate2 compute type (default ``float16`` + on GPU). +* ``STT_INITIAL_PROMPT`` (optional) — a per-call vocabulary-biasing prompt (the + market strike terms). Threaded straight to ``WhisperModel.transcribe``. +* ``MOSTLYRIGHT_CACHE_DIR`` (optional) — the ledger cache root (else the default + ``$HOME/.mostlyright/cache``). +* Live-publish (the first two are both required to enable; the topic is optional): + ``EARNINGS_STREAMING_ENABLED``, ``EARNINGS_STREAMING_PROJECT``, + and optionally ``EARNINGS_STREAMING_TOPIC`` (default ``earnings-streaming``). + +**Region.** This image runs on an L4 GPU in us-central1 (no Cloud Run GPU in +eu-west3) — see the Dockerfile. + +**Audio firewall (D-27.9).** ONLY the transcript TEXT + derived segments cross +into the ledger; the audio file is a transient input that is never persisted as a +ledger column (the ledger structurally refuses an audio-shaped key). The published +envelopes are text/facts-only (the bridge fails closed on any audio field). + +**Lazy imports.** faster-whisper / CTranslate2 are lazy-imported inside the shipped +:class:`SttTranscriber` (never at module load); ``google-cloud-storage`` is +lazy-imported inside :func:`_resolve_audio_reference` (only when the reference is a +``gs://`` handoff object); and ``google-cloud-pubsub`` is lazy-constructed only +inside :func:`~services.earnings.pubsub_bridge.build_publisher_client` — so this +entrypoint imports cleanly with no GPU / no whisper / no GCP SDK. 
+""" + +from __future__ import annotations + +import contextlib +import logging +import os +import tempfile +from collections.abc import Iterator + +from services.earnings.jobs._env import optional_env, require_env + +_LOG = logging.getLogger("services.earnings.jobs.stt") + + +def _split_gs_uri(reference: str, *, handoff_bucket: str | None) -> tuple[str, str] | None: + """Return (bucket, object_key) if ``reference`` names a GCS handoff object, else None. + + Recognizes a ``gs:///`` URI, OR a bare object key when + ``handoff_bucket`` is configured and the reference is not an existing local + path (the deployed capture→STT handoff passes a gs:// URI; a bare key is the + tolerant fallback). A plain local path returns ``None`` (transcribe it directly). + """ + if reference.startswith("gs://"): + rest = reference[len("gs://") :] + bucket, _, key = rest.partition("/") + if not bucket or not key: + raise ValueError( + f"malformed gs:// audio reference {reference!r} (need gs://bucket/key)" + ) + return bucket, key + # A bare object key resolved against the handoff bucket — ONLY when it is not + # already a real local file (so a local path is never mis-read as a GCS key). + if handoff_bucket and not os.path.exists(reference): + return handoff_bucket, reference.lstrip("/") + return None + + +@contextlib.contextmanager +def _resolve_audio_reference( + reference: str, *, handoff_bucket: str | None +) -> Iterator[tuple[str, tuple[str, str] | None]]: + """Yield ``(local_path, source)``, downloading a GCS handoff object if needed. + + In the deployed topology STT receives a ``gs://`` reference to a private + handoff-bucket object (capture and STT do not share a disk). This downloads it + to an ephemeral temp file, yields ``(local_path, (bucket, key))``, and DELETES + the LOCAL temp on exit (the local copy never outlives transcription; D-27.9). 
+ It does NOT delete the SOURCE object — the caller deletes it only AFTER the + transcript is durably written (see :func:`transcribe_call`), so a ledger-write + failure still leaves the audio retryable. A local path yields + ``(reference, None)`` (operator / test / GCE MIG fallback). + """ + gs = _split_gs_uri(reference, handoff_bucket=handoff_bucket) + if gs is None: + if not os.path.exists(reference): + raise FileNotFoundError( + f"audio path {reference!r} does not exist — the capture job's transient " + "audio must be present on the shared ephemeral disk (local path), or the " + "reference must be a gs:// handoff object in AUDIO_HANDOFF_BUCKET." + ) + yield reference, None + return + + bucket, key = gs + # Lazy import: google-cloud-storage only when a GCS handoff object is fetched. + from google.cloud import storage + + ext = os.path.splitext(key)[1] or ".audio" + fd, local_path = tempfile.mkstemp(prefix="earnings-stt-handoff-", suffix=ext) + os.close(fd) + try: + _LOG.info("stt: downloading handoff audio gs://%s/%s -> %s", bucket, key, local_path) + storage.Client().bucket(bucket).blob(key).download_to_filename(local_path) + yield local_path, (bucket, key) + finally: + # The local temp is transient — delete it once transcription is done + # (regardless of success/failure). The SOURCE object is NOT touched here. + with contextlib.suppress(FileNotFoundError): + os.remove(local_path) + + +def _delete_handoff_source(bucket: str, key: str) -> None: + """Delete the transient SOURCE handoff object after the transcript is durable. + + Called by :func:`transcribe_call` ONLY after ``TranscriptLedger.append`` + succeeds, so raw earnings audio does not accumulate in the private bucket + (D-27.9) yet a failed call still has retryable audio. A delete failure loses no + data (the ledger has the transcript) and is logged loudly (the bucket lifecycle + policy is the backstop) rather than failing the already-successful call. 
+ """ + from google.cloud import storage + + try: + storage.Client().bucket(bucket).blob(key).delete() + _LOG.info("stt: deleted transient handoff object gs://%s/%s (post-ledger)", bucket, key) + except Exception: + _LOG.exception( + "stt: FAILED to delete transient handoff object gs://%s/%s — raw audio must " + "not linger (D-27.9); ensure the AUDIO_HANDOFF_BUCKET lifecycle policy reaps it", + bucket, + key, + ) + + +#: R2 key namespace for the durable transcript parquet the STT stage publishes so +#: the SEPARATE role/fact Cloud Run Job (isolated ephemeral disk) can read it. The +#: role/fact job downloads ``earnings/transcripts//.parquet`` back +#: into its local ledger before role-attributing (see +#: :func:`services.earnings.jobs.rolefact._maybe_download_transcript_r2`). +_TRANSCRIPT_R2_KEY_FMT = "earnings/transcripts/{ticker}/{call_id}.parquet" + + +def _maybe_upload_transcript_r2( + ledger: object, *, ticker: str, call_id: str, r2_bucket: str | None +) -> None: + """Publish the durable transcript parquet to R2 for the cross-container handoff. + + STT, role/fact, and serving run in SEPARATE Cloud Run resources with isolated + ephemeral filesystems, so the local :class:`TranscriptLedger` write above is NOT + visible to the downstream role/fact Job — it would fail with "no persisted + transcript" (Codex R7 P1). When ``R2_BUCKET`` is configured (the deployed + ingest path) the transcript parquet is uploaded to the R2 data plane, the + architecture's durable text/fact store; role/fact then rehydrates it. Opt-in on + the bucket: a bare local / co-located operator run (no bucket) keeps the + prior local-only behavior. ONLY the audio-free transcript parquet crosses + (D-27.9); the write-token creds come from the env by NAME via the shipped sink. + + The ledger partition path is resolved ONLY when a bucket is set — a no-bucket + run never touches ``ledger.path`` (so a fake ledger without it stays supported). 
+ """ + if not r2_bucket: + return + ledger_path = str(ledger.path(ticker, call_id)) # type: ignore[attr-defined] + if not os.path.exists(ledger_path): + # A legitimately empty transcript writes no parquet — nothing to publish. + _LOG.info( + "stt: no transcript parquet at %s for %s/%s — skipping R2 publish", + ledger_path, + ticker, + call_id, + ) + return + from mostlyright.weather.satellite._r2_sink import upload + + key = _TRANSCRIPT_R2_KEY_FMT.format(ticker=ticker, call_id=call_id) + upload(ledger_path, r2_bucket, key, r2_target=r2_bucket) + _LOG.info( + "stt: published durable transcript parquet to R2 bucket=%s key=%s (%s/%s)", + r2_bucket, + key, + ticker, + call_id, + ) + + +#: Default STT tier — the hosted / our-infra source-of-truth model (D-27.5). +_DEFAULT_TIER = "large-v3" +#: Default device/compute for the L4 GPU image. +_DEFAULT_DEVICE = "cuda" +_DEFAULT_COMPUTE_TYPE = "float16" + + +def _segment_rows( + result_segments: list[dict[str, object]], + *, + ticker: str, + call_id: str, +) -> list[dict[str, object]]: + """Project the transcriber's per-segment records onto transcript-ledger rows. + + The ledger projects each row onto its canonical ``COLUMNS`` and DROPS any + non-schema key, so only text + the temporal markers survive. The STT segment + ``start`` (seconds into the call) maps to the ``offset_seconds`` engine-relative + marker (NOT ``spoken_at``, which is a tz-aware wallclock — a float there would + silently persist as 1970-01-01). Audio is never a field here (D-27.9). 
+ """ + rows: list[dict[str, object]] = [] + for idx, seg in enumerate(result_segments): + start = seg.get("start") + rows.append( + { + "ticker": ticker, + "call_id": call_id, + "segment_index": idx, + "segment": "batch", + "text": seg.get("text", ""), + "offset_seconds": float(start) if isinstance(start, (int, float)) else None, + "is_final": True, + "source": "earnings_call", + "delivery": "hosted", + } + ) + return rows + + +def _transcribe_local( + local_audio: str, + *, + ticker: str, + call_id: str, + tier: str, + device: str, + compute_type: str, + initial_prompt: str | None, +) -> object: + """Run the shipped transcriber on a LOCAL audio file; return its result. + + Split from :func:`transcribe_call` so the GCS-download / temp-file lifecycle + (:func:`_resolve_audio_reference`) wraps ONLY the transcription, and the temp + file is deleted the instant the model is done reading it. + """ + # Lazy import: SttTranscriber lazy-imports faster-whisper inside transcribe, + # so nothing heavy loads at module import. 
+ from mostlyright.weather.earnings.stt import SttTranscriber + + _LOG.info("stt start: ticker=%s call_id=%s tier=%s device=%s", ticker, call_id, tier, device) + transcriber = SttTranscriber(tier, device=device, compute_type=compute_type) + result = transcriber.transcribe(local_audio, initial_prompt=initial_prompt) + _LOG.info( + "stt transcribed: ticker=%s call_id=%s segments=%d language=%s duration=%s", + ticker, + call_id, + len(result.segments), + result.language, + result.duration, + ) + return result + + +def transcribe_call( + audio_path: str, + *, + ticker: str, + call_id: str, + tier: str = _DEFAULT_TIER, + device: str = _DEFAULT_DEVICE, + compute_type: str = _DEFAULT_COMPUTE_TYPE, + initial_prompt: str | None = None, + publish_live: bool = False, + streaming_project: str | None = None, + streaming_topic: str = "earnings-streaming", + handoff_bucket: str | None = None, + r2_bucket: str | None = None, +) -> dict[str, object]: + """Transcribe one call's transient audio → transcript ledger (+ optional publish). + + The shared core used by BOTH the one-shot Cloud Run Job entrypoint + (:func:`main`) and the Cloud Run SERVICE HTTP handler + (:mod:`services.earnings.jobs.stt_server`). ``audio_path`` may be a ``gs://`` + handoff-bucket URI (the deployed capture→STT handoff), a bare object key + resolved against ``handoff_bucket``, or a local file path (operator / test) — + a GCS reference is downloaded to an ephemeral temp file that is deleted after + transcription. Returns a small audio-free summary dict (segment/row counts, + language, duration) — NEVER audio. Raises on a transcription / ledger failure + (fail loud). No audio reaches the ledger or wire. 
+ """ + with _resolve_audio_reference(audio_path, handoff_bucket=handoff_bucket) as ( + local_audio, + source, + ): + result = _transcribe_local( + local_audio, + ticker=ticker, + call_id=call_id, + tier=tier, + device=device, + compute_type=compute_type, + initial_prompt=initial_prompt, + ) + + rows = _segment_rows(result.segments, ticker=ticker, call_id=call_id) + + # Lazy import: the ledger pulls pyarrow/filelock; kept out of module load so a + # fake-ledger test can import this module without them. + from mostlyright.weather.earnings.ledger import TranscriptLedger + + ledger = TranscriptLedger() + # Idempotent REPLACE (not append): STT writes the COMPLETE transcript for the + # call in one shot, so a retry / Pub/Sub redelivery must OVERWRITE the partition + # — appending would double the segments and make role/fact double-count the + # mentions (Codex R7-2 P1). + total = ledger.replace(rows, ticker=ticker, call_id=call_id) + _LOG.info( + "stt wrote transcript ledger: ticker=%s call_id=%s rows_now=%d", ticker, call_id, total + ) + + # Publish the durable transcript to R2 so the SEPARATE role/fact Cloud Run Job + # (isolated ephemeral disk) can read it — the local ledger write above never + # crosses containers (Codex R7 P1). Opt-in on R2_BUCKET; no-op for a local / + # co-located operator run. + _maybe_upload_transcript_r2(ledger, ticker=ticker, call_id=call_id, r2_bucket=r2_bucket) + + # DEFERRED ORCHESTRATION SEAM — STT -> role/fact trigger (Codex R7-6 P1). + # STT completes the TRANSCRIPT stage; the derived-FACT stage (earnings-rolefact + # Cloud Run Job) is NOT auto-triggered here, so hosted /facts stays empty until + # role/fact runs for this call. 
def main(argv: list[str] | None = None) -> int:
    """Transcribe the transient audio → transcript ledger (+ optional live publish).

    The one-shot, env-driven entrypoint. Reads the ``STT_*`` env contract, then
    delegates to :func:`transcribe_call`.

    Args:
        argv: Accepted for entrypoint-signature compatibility; not read here
            (all configuration comes from the environment).

    Returns:
        ``0`` on success. A missing required env var, a transcription failure,
        or a ledger write failure propagates as an exception (fail loud).
    """
    logging.basicConfig(level=logging.INFO)

    audio_path = require_env("STT_AUDIO_PATH")
    ticker = require_env("STT_TICKER")
    call_id = require_env("STT_CALL_ID")
    tier = optional_env("STT_TIER", _DEFAULT_TIER) or _DEFAULT_TIER
    device = optional_env("STT_DEVICE", _DEFAULT_DEVICE) or _DEFAULT_DEVICE
    compute_type = optional_env("STT_COMPUTE_TYPE", _DEFAULT_COMPUTE_TYPE) or _DEFAULT_COMPUTE_TYPE
    initial_prompt = optional_env("STT_INITIAL_PROMPT")

    # Normalize before comparing: surrounding whitespace ("false ", " 0") or a
    # whitespace-only value must NOT enable live publish.
    enabled = (optional_env("EARNINGS_STREAMING_ENABLED") or "").strip().lower()
    publish_live = enabled not in ("", "0", "false", "no")

    transcribe_call(
        audio_path,
        ticker=ticker,
        call_id=call_id,
        tier=tier,
        device=device,
        compute_type=compute_type,
        initial_prompt=initial_prompt,
        publish_live=publish_live,
        streaming_project=optional_env("EARNINGS_STREAMING_PROJECT"),
        streaming_topic=optional_env("EARNINGS_STREAMING_TOPIC", "earnings-streaming")
        or "earnings-streaming",
        handoff_bucket=optional_env("AUDIO_HANDOFF_BUCKET"),
        r2_bucket=optional_env("R2_BUCKET"),
    )
    return 0
This is the opt-in, explicitly-non-authoritative live-preview path — + the authoritative ledger write uses ``offset_seconds`` (see ``_segment_rows``). + """ + if not project: + _LOG.warning( + "live publish requested but no streaming project configured — skipping " + "(fail soft: the batch ledger is authoritative)." + ) + return + + from mostlyright.weather.earnings.streaming_transcriber import Segment + + from services.earnings.pubsub_bridge import ( + SegmentPublisher, + build_publisher_client, + ) + + publisher = SegmentPublisher(build_publisher_client(project, topic)) + published = 0 + for seg in segments: + start = seg.get("start") + publisher.publish( + call_id, + Segment( + text=str(seg.get("text", "")), + is_final=True, + spoken_at=float(start) if isinstance(start, (int, float)) else 0.0, + stream_seq=published, + knowledge_time=float(start) if isinstance(start, (int, float)) else 0.0, + ), + ) + published += 1 + publisher.publish_end_of_call(call_id) + _LOG.info( + "stt job published %d live segments to topic=%s for %s/%s", + published, + topic, + ticker, + call_id, + ) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/services/earnings/jobs/stt_server.py b/services/earnings/jobs/stt_server.py new file mode 100644 index 0000000..54c05e9 --- /dev/null +++ b/services/earnings/jobs/stt_server.py @@ -0,0 +1,122 @@ +"""Earnings STT Cloud Run SERVICE — HTTP transcription surface (Phase 28, 28-11). + +The infra declares STT as a Cloud Run **service** (``google_cloud_run_v2_service.stt``, +L4 GPU, scale-to-zero), so its container MUST serve HTTP on ``$PORT`` — a one-shot +CLI would never pass Cloud Run readiness. This is that HTTP surface: a thin FastAPI +app around the SAME shipped transcriber the one-shot job uses +(:func:`services.earnings.jobs.stt.transcribe_call`). The one-shot +``python -m services.earnings.jobs.stt`` entrypoint is kept for the GCE L4 MIG +fallback (28-OPERATOR-INPUTS). 
def _flag_enabled(value: object, *, default: bool) -> bool:
    """Interpret a JSON/env flag; ``None`` falls back to ``default``."""
    if value is None:
        return default
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        # "false"/"0"/"no"/"" (any case, any surrounding whitespace) are OFF;
        # a plain bool() would treat every non-empty string as ON.
        return value.strip().lower() not in ("", "0", "false", "no")
    return bool(value)


@app.post("/transcribe", summary="Transcribe transient audio → transcript ledger (text only)")
def transcribe(payload: Annotated[dict, Body(...)]) -> dict[str, object]:
    """Transcribe one call's transient audio; return an AUDIO-FREE summary.

    Required body fields: ``audio_path``, ``ticker``, ``call_id``. Optional
    fields (``tier``, ``device``, ``compute_type``, ``initial_prompt``,
    ``publish_live``, ``streaming_project``, ``streaming_topic``,
    ``handoff_bucket``, ``r2_bucket``) override the service defaults/env.

    Returns:
        The summary dict produced by :func:`transcribe_call`.

    Raises:
        HTTPException: 400 on a missing required field or a missing audio file;
            500 on any other transcription/ledger error (fail loud).
    """
    audio_path = payload.get("audio_path")
    ticker = payload.get("ticker")
    call_id = payload.get("call_id")
    missing = [
        k
        for k, v in (("audio_path", audio_path), ("ticker", ticker), ("call_id", call_id))
        if not v
    ]
    if missing:
        raise HTTPException(
            status_code=400, detail=f"missing required field(s): {', '.join(missing)}"
        )

    # The deployed STT service has AUDIO_HANDOFF_BUCKET set; a bare object key /
    # gs:// reference in the body is resolved + downloaded against it. A request
    # may also override it explicitly.
    handoff_bucket = payload.get("handoff_bucket") or os.environ.get("AUDIO_HANDOFF_BUCKET")
    # R2_BUCKET enables the durable transcript publish so the SEPARATE role/fact
    # Job can read the transcript; a request may override it explicitly.
    r2_bucket = payload.get("r2_bucket") or os.environ.get("R2_BUCKET")

    # Live-publish config DEFAULTS from the SERVICE env, because the deployed
    # capture->STT trigger posts only {audio_path, ticker, call_id}. A request
    # may still override any field; an absent or null ``publish_live`` falls
    # back to the env default, like the other optional fields.
    env_publish = _flag_enabled(os.environ.get("EARNINGS_STREAMING_ENABLED"), default=False)
    publish_live = _flag_enabled(payload.get("publish_live"), default=env_publish)
    streaming_project = payload.get("streaming_project") or os.environ.get(
        "EARNINGS_STREAMING_PROJECT"
    )
    streaming_topic = str(
        payload.get("streaming_topic")
        or os.environ.get("EARNINGS_STREAMING_TOPIC")
        or "earnings-streaming"
    )

    try:
        return transcribe_call(
            str(audio_path),
            ticker=str(ticker),
            call_id=str(call_id),
            tier=str(payload.get("tier") or "large-v3"),
            device=str(payload.get("device") or "cuda"),
            compute_type=str(payload.get("compute_type") or "float16"),
            initial_prompt=payload.get("initial_prompt"),
            publish_live=publish_live,
            streaming_project=streaming_project,
            streaming_topic=streaming_topic,
            handoff_bucket=handoff_bucket,
            r2_bucket=r2_bucket,
        )
    except FileNotFoundError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except Exception as exc:  # pragma: no cover - transcription/ledger failure path
        _LOG.exception("transcription failed for %s/%s", ticker, call_id)
        raise HTTPException(status_code=500, detail=f"transcription failed: {exc}") from exc
+ if request.url.path.rstrip("/") == "/healthz": + return await call_next(request) if self._expected_key is None: # Keyless local/dev mode — gate open. return await call_next(request) diff --git a/services/earnings/middleware/ratelimit.py b/services/earnings/middleware/ratelimit.py index d6a069c..7d52cfc 100644 --- a/services/earnings/middleware/ratelimit.py +++ b/services/earnings/middleware/ratelimit.py @@ -160,6 +160,10 @@ def _consume(self, key: str) -> bool: async def dispatch( self, request: Request, call_next: Callable[[Request], Awaitable[Response]] ) -> Response: + # /healthz is the Cloud Run liveness probe — never throttle it: a probe + # answered with 429 would make Cloud Run kill a healthy instance. + if request.url.path.rstrip("/") == "/healthz": + return await call_next(request) if not self._consume(self._client_key(request)): return JSONResponse( status_code=429, diff --git a/services/earnings/r2_read.py b/services/earnings/r2_read.py new file mode 100644 index 0000000..969fd06 --- /dev/null +++ b/services/earnings/r2_read.py @@ -0,0 +1,235 @@ +"""Read-only Cloudflare R2 access for the earnings serving app (Phase 28, 28-13). + +The earnings ingest jobs run in SEPARATE Cloud Run resources from serving and +publish their durable text/fact parquet to R2 (bucket ``mostlyright-derived``): + + earnings/transcripts/{ticker}/{call_id}.parquet (STT, jobs/stt.py) + earnings/facts/{ticker}/{call_id}.parquet (role/fact, jobs/rolefact.py) + +A fresh serving Cloud Run instance has an EMPTY ephemeral disk, so it must read +those objects from R2 — the durable data plane — rather than a container-local +ledger (which would return empty even after ingest succeeded; Codex R7-4 P1). +This module is the READ side of the R2 firewall for earnings: it signs with the +READ-ONLY token (list+get) and NEVER holds the write token — mirroring +``services/weather/r2_read.py`` (the satellite serving read path). 
The deployed +``earnings-serving`` container already carries the read-token env +(``infra/cloud_run.tf``: ``R2_ACCESS_KEY_ID`` / ``R2_SECRET_ACCESS_KEY`` / +``R2_ACCOUNT_ID`` / ``R2_BUCKET`` from the ``r2-read-*`` secrets). + +**Audio firewall (D-27.9).** ONLY the text transcript + derived-fact parquet is +ever read here — audio never got an R2 key, so there is nothing audio-shaped to +read. The rows are the canonical ``schema.earnings_transcript.v1`` / +``schema.earnings_fact.v1`` shapes. + +**Read-only by construction.** The client exposes only ``get_object`` / +``list_objects`` — no put/delete. boto3 is lazy-imported (kept off module load so +a local/on-device serving tier that never touches R2 needs no boto3). +""" + +from __future__ import annotations + +import io +import os +import re +from typing import Any + +#: Object-store key prefixes the ingest jobs write under (kept in lockstep with +#: ``jobs/stt.py::_TRANSCRIPT_R2_KEY_FMT`` and ``jobs/rolefact.py``). +_EARNINGS_PREFIX = "earnings/" + +#: Environment-variable NAMES the READ-ONLY token credentials are read from — the +#: GENERIC ``R2_*`` names the deploy layer injects into the serving SA env (the +#: serving SA's ONLY R2 token is the read token; the ingest WRITE path uses the +#: disjoint ``R2_WRITE_*`` names). Mirrors ``services/weather/r2_read.py``. +_ENV_ACCOUNT_ID = "R2_ACCOUNT_ID" +_ENV_ACCESS_KEY_ID = "R2_ACCESS_KEY_ID" +_ENV_SECRET_ACCESS_KEY = "R2_SECRET_ACCESS_KEY" + +_ENV_BUCKET = "R2_BUCKET" +_DEFAULT_BUCKET = "mostlyright-derived" + +#: R2's fixed S3-compat pseudo-region (Cloudflare requires ``"auto"``). +_R2_REGION = "auto" + +#: A ticker / call_id path segment must be a safe single key component (no ``/`` +#: or ``..`` — the ingest write side only ever wrote safe segments, so an unsafe +#: query value cannot match any object and is rejected as "no such call"). 
+_SAFE_SEGMENT = re.compile(r"^[A-Za-z0-9._-]+$") + + +def r2_read_configured() -> bool: + """True when the READ-ONLY R2 token is present (the deployed serving path). + + The deployed ``earnings-serving`` container carries the read-token env; a + local / on-device / test serving tier does not, and reads the local ledger + instead. Gate on the access-key id (the token that unambiguously means + "serve from the durable R2 corpus"). + """ + return bool(os.environ.get(_ENV_ACCESS_KEY_ID)) + + +def _require_env(name: str) -> str: + value = os.environ.get(name) + if not value: + raise ValueError( + f"the earnings R2 read client needs {name} set (the READ-ONLY-token " + "credential is injected into the serving SA env from Secret Manager). " + "It is unset or empty." + ) + return value + + +def _derived_bucket() -> str: + return os.environ.get(_ENV_BUCKET) or _DEFAULT_BUCKET + + +def _validate_segment(value: str, *, field: str) -> str: + if not _SAFE_SEGMENT.match(value): + raise ValueError(f"unsafe {field} segment {value!r} for an R2 key") + return value + + +class EarningsR2Reader: + """Read-only R2 accessor: list + fetch the earnings transcript/fact partitions. + + Constructed lazily against the injected READ-token env. No write surface — the + write side (``satellite/_r2_sink.py``) is a separate module bound to the + disjoint write token. 
+ """ + + def __init__(self, bucket: str | None = None) -> None: + self._bucket = bucket or _derived_bucket() + self._client: Any | None = None + + @property + def bucket(self) -> str: + return self._bucket + + def _get_client(self) -> Any: + if self._client is not None: + return self._client + import boto3 + import botocore.config + + account_id = _require_env(_ENV_ACCOUNT_ID) + access_key_id = _require_env(_ENV_ACCESS_KEY_ID) + secret_access_key = _require_env(_ENV_SECRET_ACCESS_KEY) + + self._client = boto3.client( + "s3", + endpoint_url=f"https://{account_id}.r2.cloudflarestorage.com", + aws_access_key_id=access_key_id, + aws_secret_access_key=secret_access_key, + region_name=_R2_REGION, + config=botocore.config.Config(retries={"max_attempts": 5, "mode": "adaptive"}), + ) + return self._client + + def _list_child_dirs(self, prefix: str) -> list[str]: + """Return the immediate child "directory" names under ``prefix`` (deduped).""" + client = self._get_client() + names: set[str] = set() + token: str | None = None + while True: + kwargs: dict[str, Any] = {"Bucket": self._bucket, "Prefix": prefix, "Delimiter": "/"} + if token is not None: + kwargs["ContinuationToken"] = token + resp = client.list_objects_v2(**kwargs) + for cp in resp.get("CommonPrefixes", []) or []: + # e.g. 
"earnings/facts/GIS/" -> "GIS" + names.add(cp["Prefix"][len(prefix) :].rstrip("/")) + if not resp.get("IsTruncated"): + break + token = resp.get("NextContinuationToken") + return sorted(n for n in names if n) + + def _list_object_stems(self, prefix: str) -> list[str]: + """Return the ``*.parquet`` object stems directly under ``prefix`` (sorted).""" + client = self._get_client() + stems: list[str] = [] + token: str | None = None + while True: + kwargs: dict[str, Any] = {"Bucket": self._bucket, "Prefix": prefix} + if token is not None: + kwargs["ContinuationToken"] = token + resp = client.list_objects_v2(**kwargs) + for obj in resp.get("Contents", []) or []: + key = obj["Key"] + tail = key[len(prefix) :] + if "/" in tail or not tail.endswith(".parquet"): + continue + stems.append(tail[: -len(".parquet")]) + if not resp.get("IsTruncated"): + break + token = resp.get("NextContinuationToken") + return sorted(stems) + + def list_tickers(self, subdir: str) -> list[str]: + """List every ticker with at least one persisted partition under ``subdir``.""" + return self._list_child_dirs(f"{_EARNINGS_PREFIX}{subdir}/") + + def list_call_ids(self, subdir: str, ticker: str) -> list[str]: + """List every persisted ``call_id`` for ``ticker`` under ``subdir``.""" + safe = _validate_segment(ticker, field="ticker") + return self._list_object_stems(f"{_EARNINGS_PREFIX}{subdir}/{safe}/") + + def read_partition(self, subdir: str, ticker: str, call_id: str) -> list[dict[str, object]]: + """Fetch + parse one ``(ticker, call_id)`` partition; ``[]`` on a genuine miss. + + Settlement safety (mirrors ``weather/r2_read``): a MISSING object + (``NoSuchKey`` — nothing ingested for this call yet) is a legitimate empty + (``[]``); a missing/typoed BUCKET or any other error is a CONFIG/read + failure that must propagate (never silently look like "no data"). 
class R2LedgerSource:
    """Read-only, duck-typed ledger view over one subdir of an :class:`EarningsR2Reader`.

    Exposes the read subset the serving routes call on ``ServingState.transcripts``
    / ``.facts`` (``read`` / ``read_ticker`` / ``list_call_ids`` / ``list_tickers``),
    so ``ServingState`` can hold an R2-backed source in place of the local ledger
    with NO route change. There is deliberately no ``append``/``replace`` —
    serving never writes.
    """

    def __init__(self, reader: EarningsR2Reader, subdir: str) -> None:
        # The subdir is fixed per instance and prepended to every reader call.
        self._subdir = subdir
        self._reader = reader

    def read(self, ticker: str, call_id: str) -> list[dict[str, object]]:
        """Return the rows of the ``(ticker, call_id)`` partition in this subdir."""
        rows = self._reader.read_partition(self._subdir, ticker, call_id)
        return rows

    def list_call_ids(self, ticker: str) -> list[str]:
        """Return the call ids the reader lists for ``ticker`` in this subdir."""
        call_ids = self._reader.list_call_ids(self._subdir, ticker)
        return call_ids

    def list_tickers(self) -> list[str]:
        """Return the tickers the reader lists in this subdir."""
        tickers = self._reader.list_tickers(self._subdir)
        return tickers

    def read_ticker(self, ticker: str) -> list[dict[str, object]]:
        """Return every row for ``ticker``: each listed call's rows, in listing order."""
        return [
            row
            for call_id in self.list_call_ids(ticker)
            for row in self.read(ticker, call_id)
        ]
@router.get("/healthz", summary="Liveness probe (unauthenticated, no downstream I/O)")
def healthz() -> dict[str, str]:
    """Answer the liveness probe with a fixed payload.

    The body is a constant: nothing is read from the request and no ledger, R2
    or Pub/Sub call is made here.
    """
    liveness: dict[str, str] = {"status": "ok"}
    return liveness
def test_healthz_not_rate_limited() -> None:
    """``/healthz`` keeps answering 200 even when the per-client budget is one request."""
    # A tiny per-client budget: /capabilities would 429 on the 2nd call, but
    # /healthz bypasses the limiter entirely (a throttled probe would make Cloud
    # Run kill a healthy instance).
    probe = TestClient(create_app(api_key=_KEY, rate_limit=1, rate_window_seconds=60.0))
    statuses = [probe.get("/healthz").status_code for _ in range(5)]
    assert statuses == [200] * 5
def test_registry_bus_adapter_close_routes() -> None:
    """Closing a call through the adapter registers its bus and marks it closed."""
    call_id = "call-2"
    buses = BusRegistry()
    # ``close`` is awaited via asyncio.run, mirroring how ``publish`` is driven.
    asyncio.run(_RegistryBusAdapter(buses).close(call_id))
    closed_bus = buses.get(call_id)
    assert closed_bus is not None
    assert closed_bus.is_closed(call_id)
def test_parse_bare_subscription_requires_project(monkeypatch) -> None:
    """A bare subscription id with no project env var set is rejected loudly."""
    # Clear both env vars so no project is available for the bare id.
    for project_var in ("EARNINGS_INGEST_PROJECT", "GOOGLE_CLOUD_PROJECT"):
        monkeypatch.delenv(project_var, raising=False)
    bare_id = "earnings-streaming-serving"
    with pytest.raises(RuntimeError, match="bare subscription id"):
        _parse_streaming_subscription(bare_id)
+ monkeypatch.setattr( + "services.earnings.pubsub_bridge.build_streaming_pull", _fake_streaming_pull + ) + monkeypatch.setattr( + "services.earnings.pubsub_bridge.make_run_coroutine_threadsafe", _fake_run_on_loop + ) + + app = create_app(api_key=_KEY) + with TestClient(app) as client: + assert consumed.wait(timeout=5.0), "subscriber consume thread did not start" + # The serving loop is recorded so an out-of-loop producer can inject. + assert app.state.serving.buses.serving_loop is not None + assert client.get("/healthz").status_code == 200 diff --git a/services/earnings/tests/test_jobs_entrypoints.py b/services/earnings/tests/test_jobs_entrypoints.py new file mode 100644 index 0000000..d2540df --- /dev/null +++ b/services/earnings/tests/test_jobs_entrypoints.py @@ -0,0 +1,1271 @@ +"""Cloud Run Jobs entrypoint tests (Phase 28, 28-13). + +Proves the three thin ``python -m services.earnings.jobs.`` entrypoints: + +* import CLEANLY with NO audio toolchain — the module top-level must not import + ``av`` / ``faster_whisper`` / ``ctranslate2`` / a headless browser (the heavy + deps are lazy-imported inside ``main`` / the shipped engine surfaces). Proven by + poisoning ``sys.modules`` with a sentinel that raises if imported. +* ``main()`` runs against INJECTED / FAKE engine surfaces (fake capture / + transcribe / fact-build + a tmp ledger root) — no network, no GPU, no ffmpeg — + and returns ``0``, writing the expected artifact. +* the audio firewall holds — no audio path is uploaded / served / persisted as a + ledger column, and a non-local / non-transient capture fails loud. +* each ``main()`` fails LOUD (clear error naming the var) when a required env var + is missing. + +Pure-Python fakes — NO ``faster-whisper``, NO ``av``, NO ``boto3``, NO GCP. 
def test_modules_import_without_audio_toolchain(monkeypatch: pytest.MonkeyPatch) -> None:
    """Re-importing the job modules must not pull in any audio/GPU dep.

    Two guards are armed before the re-import: every name in ``_AUDIO_MODULES``
    is replaced in ``sys.modules`` by a poison object, and ``builtins.__import__``
    is wrapped so importing any of those roots raises. The three job modules are
    then evicted from ``sys.modules`` and imported afresh; the test passes when
    none of those imports trips a guard.
    """

    class _Poison:
        # Placeholder stored in ``sys.modules`` for each audio module: touching
        # any attribute on it fails the test.
        def __getattr__(self, _name: str) -> object:  # pragma: no cover - only if imported
            raise AssertionError("audio toolchain imported at module load — must be lazy")

    # Keep the genuine hook so non-audio imports still resolve normally.
    real_import = builtins.__import__

    def _guarded_import(name: str, *args: object, **kwargs: object) -> object:
        # Only the top-level package decides, so ``av.something`` is caught too.
        root = name.split(".")[0]
        if root in _AUDIO_MODULES:
            raise AssertionError(
                f"{name!r} imported at module load — heavy deps must be lazy (D-27.9 discipline)"
            )
        return real_import(name, *args, **kwargs)  # type: ignore[arg-type]

    # Both patches go through monkeypatch, so they are undone at teardown.
    for mod in _AUDIO_MODULES:
        monkeypatch.setitem(sys.modules, mod, _Poison())
    monkeypatch.setattr(builtins, "__import__", _guarded_import)

    for name in (
        "services.earnings.jobs.capture",
        "services.earnings.jobs.stt",
        "services.earnings.jobs.rolefact",
    ):
        # Evict the cached module so import_module re-executes its top level.
        # NOTE(review): this pop is done directly on sys.modules (not through
        # monkeypatch), so the freshly imported module objects stay registered
        # after the test while the module-level aliases keep the originals.
        sys.modules.pop(name, None)
        importlib.import_module(name)
def test_optional_env_defaults(monkeypatch: pytest.MonkeyPatch) -> None:
    """``optional_env`` falls back when the var is unset or empty, else returns it."""
    var, default = "SOME_OPT_VAR", "fallback"

    # Unset -> default.
    monkeypatch.delenv(var, raising=False)
    assert optional_env(var, default) == default

    # Empty string -> default; a real value -> that value.
    for raw, expected in (("", default), ("set", "set")):
        monkeypatch.setenv(var, raw)
        assert optional_env(var, default) == expected
def test_capture_main_fails_loud_without_subscription_or_override(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With neither override nor subscription configured, ``main`` names the missing var."""
    # Neither the operator-override CAPTURE_TICKER nor CAPTURE_JOBS_SUBSCRIPTION is
    # set: the deployed path resolver fails loud naming the subscription var.
    for unset_var in ("CAPTURE_TICKER", "CAPTURE_JOBS_SUBSCRIPTION"):
        monkeypatch.delenv(unset_var, raising=False)
    with pytest.raises(ValueError, match="CAPTURE_JOBS_SUBSCRIPTION"):
        capture_job.main()
"https://static.events.q4inc.com/x/y.mp4", + } + return _FakePullResponse(json.dumps(spec).encode("utf-8")) + + def modify_ack_deadline(self, *, subscription, ack_ids, ack_deadline_seconds): + acked.setdefault("lease_extensions", []).append(ack_deadline_seconds) + + def acknowledge(self, *, subscription, ack_ids): + # Record the ORDER: ack must come AFTER the STT trigger. + acked["ack_subscription"] = subscription + acked["ack_ids"] = list(ack_ids) + acked["ack_after_trigger"] = "stt_endpoint" in triggered + + fake_pubsub = types.SimpleNamespace(SubscriberClient=lambda: _FakeSubscriber()) + + # --- fake httpx: capture the STT trigger POST (2xx) --------------------------- + triggered: dict[str, object] = {} + + class _FakeResponse: + status_code = 200 + + def raise_for_status(self) -> None: + return None + + def _fake_post(url, *, json, headers, timeout): + triggered["stt_endpoint"] = url + triggered["stt_payload"] = json + return _FakeResponse() + + fake_httpx = types.SimpleNamespace(post=_fake_post) + monkeypatch.setitem(sys.modules, "httpx", fake_httpx) + + # --- fake google-cloud-storage: capture the uploaded handoff object ---------- + uploaded: dict[str, object] = {} + + class _FakeBlob: + def __init__(self, name: str) -> None: + self._name = name + + def upload_from_filename(self, filename: str) -> None: + uploaded["blob"] = self._name + uploaded["local"] = filename + + class _FakeBucket: + def __init__(self, name: str) -> None: + self._name = name + + def blob(self, name: str) -> _FakeBlob: + uploaded["bucket"] = self._name + return _FakeBlob(name) + + class _FakeStorageClient: + def bucket(self, name: str) -> _FakeBucket: + return _FakeBucket(name) + + fake_storage = types.SimpleNamespace(Client=lambda: _FakeStorageClient()) + + # google.cloud.{pubsub_v1,storage} are lazy-imported inside capture.py; inject + # the fakes into sys.modules so the `from google.cloud import ...` picks them up. 
+ monkeypatch.setitem(sys.modules, "google.cloud.pubsub_v1", fake_pubsub) + monkeypatch.setitem(sys.modules, "google.cloud.storage", fake_storage) + + class _FakeArtifactLocal: + def __init__(self) -> None: + self.audio_path = audio_path + self.ticker = "CHWY" + self.call_id = "evt-1" + self.source_media_url = "https://static.events.q4inc.com/x/y.mp4" + self.is_transient = True + + captured: dict[str, object] = {} + + class _FakeAdapter: + def capture(self, event, *, tmp_dir=None, **_): + captured["event"] = event + return _FakeArtifactLocal() + + import mostlyright.weather.earnings.capture.q4 as q4mod + + monkeypatch.setattr(q4mod, "Q4CaptureAdapter", _FakeAdapter) + # DEPLOYED env: no CAPTURE_TICKER override; subscription + handoff + STT URL set. + monkeypatch.delenv("CAPTURE_TICKER", raising=False) + monkeypatch.setenv("CAPTURE_JOBS_SUBSCRIPTION", "projects/p/subscriptions/capture-jobs") + monkeypatch.setenv("AUDIO_HANDOFF_BUCKET", "earnings-audio-handoff-123") + monkeypatch.setenv("STT_SERVICE_URL", "https://earnings-stt-abc.a.run.app") + monkeypatch.setenv("CAPTURE_OUT_DIR", str(out_dir)) + + assert capture_job.main() == 0 + # The per-call spec was pulled off the subscription and rode into capture. + assert acked["subscription_pulled"] == "projects/p/subscriptions/capture-jobs" + assert captured["event"]["media_url"] == "https://static.events.q4inc.com/x/y.mp4" + # The transient audio was uploaded to the PRIVATE handoff bucket (never R2). + assert uploaded["bucket"] == "earnings-audio-handoff-123" + assert uploaded["blob"] == "handoff/CHWY/evt-1.wav" + assert uploaded["local"] == audio_path + # STT was triggered with the gs:// handoff URI (closes capture→STT). 
def test_capture_main_missing_message_fails_loud(monkeypatch: pytest.MonkeyPatch) -> None:
    """An empty subscription pull fails loud rather than silently no-op'ing."""

    class _EmptySubscriber:
        # The pull succeeds but delivers nothing.
        def pull(self, *, subscription, max_messages, return_immediately):
            return types.SimpleNamespace(received_messages=[])

    # capture.py resolves the Pub/Sub module from sys.modules, so the fake goes there.
    monkeypatch.setitem(
        sys.modules,
        "google.cloud.pubsub_v1",
        types.SimpleNamespace(SubscriberClient=lambda: _EmptySubscriber()),
    )

    # Deployed-path env: no operator override, subscription + handoff bucket set.
    monkeypatch.delenv("CAPTURE_TICKER", raising=False)
    deployed_env = {
        "CAPTURE_JOBS_SUBSCRIPTION": "projects/p/subscriptions/capture-jobs",
        "AUDIO_HANDOFF_BUCKET": "earnings-audio-handoff-123",
    }
    for var, value in deployed_env.items():
        monkeypatch.setenv(var, value)

    with pytest.raises(RuntimeError, match="no capture-job message"):
        capture_job.main()
+ """ + record: dict[str, object] = {"lease_extensions": [], "acked": False} + audio_path = str(out_dir / "audio.wav") + (out_dir / "audio.wav").write_bytes(b"fake-audio") + + class _FakeSubscriber: + def pull(self, *, subscription, max_messages, return_immediately): + spec = { + "ticker": "CHWY", + "call_id": "evt-1", + "webcast_url": "https://static.events.q4inc.com/x/y.mp4", + } + data = json.dumps(spec).encode("utf-8") + msg = types.SimpleNamespace(ack_id="ack-xyz", message=types.SimpleNamespace(data=data)) + return types.SimpleNamespace(received_messages=[msg]) + + def modify_ack_deadline(self, *, subscription, ack_ids, ack_deadline_seconds): + record["lease_extensions"].append(ack_deadline_seconds) # type: ignore[union-attr] + + def acknowledge(self, *, subscription, ack_ids): + record["acked"] = True + record["ack_ids"] = list(ack_ids) + + class _FakeBlob: + def __init__(self, name): + self._name = name + + def upload_from_filename(self, filename): + record["uploaded_blob"] = self._name + + class _FakeStorageClient: + def bucket(self, name): + return types.SimpleNamespace(blob=lambda n: _FakeBlob(n)) + + class _FakeArtifactLocal: + def __init__(self) -> None: + self.audio_path = audio_path + self.ticker = "CHWY" + self.call_id = "evt-1" + self.source_media_url = "https://static.events.q4inc.com/x/y.mp4" + self.is_transient = True + + class _FakeAdapter: + def capture(self, event, *, tmp_dir=None, **_): + return _FakeArtifactLocal() + + import mostlyright.weather.earnings.capture.q4 as q4mod + + monkeypatch.setattr(q4mod, "Q4CaptureAdapter", _FakeAdapter) + monkeypatch.setitem( + sys.modules, + "google.cloud.pubsub_v1", + types.SimpleNamespace(SubscriberClient=lambda: _FakeSubscriber()), + ) + monkeypatch.setitem( + sys.modules, + "google.cloud.storage", + types.SimpleNamespace(Client=lambda: _FakeStorageClient()), + ) + monkeypatch.setitem(sys.modules, "httpx", types.SimpleNamespace(post=stt_post)) + monkeypatch.delenv("CAPTURE_TICKER", raising=False) + 
def test_capture_main_extends_lease_for_long_capture(
    monkeypatch: pytest.MonkeyPatch, tmp_path
) -> None:
    """A long capture keeps the Pub/Sub lease alive (modify_ack_deadline is called).

    Runs the deployed capture path against the shared fakes with a capture that
    sleeps, then checks that at least one lease extension was recorded, that
    every extension used a 600s deadline, and that the message was acked.
    """

    class _OkResponse:
        status_code = 200

        def raise_for_status(self):
            return None

    def _ok_post(url, *, json, headers, timeout):
        # STT trigger stand-in: always succeeds, so the run proceeds to the ack.
        return _OkResponse()

    out_dir = tmp_path / "cap"
    out_dir.mkdir()
    record = _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_ok_post)

    # Force the lease loop to tick during capture: a small (non-zero) refresh
    # interval + a capture that blocks long enough for at least one extension.
    # The re-lease deadline stays 600 (Pub/Sub's cap) so the assertion is realistic.
    # NOTE(review): presumably interval = deadline * fraction (~0.05s here) —
    # confirm against capture_job.
    monkeypatch.setattr(capture_job, "_LEASE_DEADLINE_SECONDS", 600)
    monkeypatch.setattr(capture_job, "_LEASE_REFRESH_FRACTION", 0.05 / 600)

    import time as _time

    import mostlyright.weather.earnings.capture.q4 as q4mod

    class _SlowArtifact:
        audio_path = str(out_dir / "audio.wav")
        ticker = "CHWY"
        call_id = "evt-1"
        source_media_url = "https://static.events.q4inc.com/x/y.mp4"
        is_transient = True

    class _SlowAdapter:
        def capture(self, event, *, tmp_dir=None, **_):
            # Block for 0.15s — longer than the small patched refresh interval —
            # so the lease loop fires >=1 extension while the capture runs.
            _time.sleep(0.15)
            return _SlowArtifact()

    # Replaces the adapter installed by _install_deployed_capture_fakes above.
    monkeypatch.setattr(q4mod, "Q4CaptureAdapter", _SlowAdapter)
    monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app")

    assert capture_job.main() == 0
    # The lease was extended at least once during the long capture, at 600s each.
    assert len(record["lease_extensions"]) >= 1  # type: ignore[arg-type]
    assert all(d == 600 for d in record["lease_extensions"])  # type: ignore[union-attr]
    assert record["acked"] is True
class _FakeTranscriptResult:
    """Canned transcript result: two segments of English text, four seconds long."""

    def __init__(self) -> None:
        # (text, start, end) for each segment, expanded into dicts below.
        spans = (
            ("we grew RPO", 0.0, 2.0),
            ("and tariffs this quarter", 2.0, 4.0),
        )
        self.text = "we grew RPO and tariffs this quarter"
        self.segments = [
            {"text": text, "start": start, "end": end} for text, start, end in spans
        ]
        self.language = "en"
        self.duration = 4.0
audio.write_bytes(b"RIFF-fake-not-real-audio") + + seen: dict[str, object] = {} + + class _FakeTranscriber: + def __init__(self, model_size, *, device, compute_type): + seen["model_size"] = model_size + seen["device"] = device + + def transcribe(self, audio_path, *, initial_prompt=None): + seen["audio_path"] = audio_path + seen["initial_prompt"] = initial_prompt + return _FakeTranscriptResult() + + import mostlyright.weather.earnings.stt as sttmod + + monkeypatch.setattr(sttmod, "SttTranscriber", _FakeTranscriber) + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + monkeypatch.setenv("STT_AUDIO_PATH", str(audio)) + monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + monkeypatch.setenv("STT_TIER", "small") + monkeypatch.setenv("STT_DEVICE", "cpu") + monkeypatch.setenv("STT_INITIAL_PROMPT", "RPO tariffs") + monkeypatch.delenv("EARNINGS_STREAMING_ENABLED", raising=False) + + assert stt_job.main() == 0 + assert seen["model_size"] == "small" + assert seen["initial_prompt"] == "RPO tariffs" + + # The transcript is persisted (text only) and carries NO audio column. + from mostlyright.weather.earnings.ledger import TranscriptLedger + + ledger = TranscriptLedger() + rows = ledger.read("CHWY", "evt-1") + assert len(rows) == 2 + assert rows[0]["text"] == "we grew RPO" + assert not any("audio" in col.lower() for col in ledger.column_names()) + # No row carries an audio-shaped key. 
def test_stt_main_fails_loud_when_audio_missing(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None:
    """``main`` raises ``FileNotFoundError`` when STT_AUDIO_PATH points at no file."""
    job_env = {
        "STT_AUDIO_PATH": str(tmp_path / "nope.wav"),  # never created
        "STT_TICKER": "CHWY",
        "STT_CALL_ID": "evt-1",
    }
    for var, value in job_env.items():
        monkeypatch.setenv(var, value)
    # transcribe_call raises a generic "audio path ... does not exist" (shared by
    # the one-shot main() and the HTTP server); the fail-loud property is what matters.
    with pytest.raises(FileNotFoundError, match="does not exist"):
        stt_job.main()
+ with open(local_path, "wb") as fh: + fh.write(handoff_bytes) + + class _FakeBucket: + def __init__(self, name: str) -> None: + self._name = name + + def blob(self, name: str) -> _FakeBlob: + downloaded["bucket"] = self._name + return _FakeBlob(name) + + class _FakeStorageClient: + def bucket(self, name: str) -> _FakeBucket: + return _FakeBucket(name) + + fake_storage = types.SimpleNamespace(Client=lambda: _FakeStorageClient()) + monkeypatch.setitem(sys.modules, "google.cloud.storage", fake_storage) + + seen: dict[str, object] = {} + + class _FakeTranscriber: + def __init__(self, *a, **k): + pass + + def transcribe(self, audio_path, *, initial_prompt=None): + # The transcriber must receive the LOCAL downloaded temp path (which + # must EXIST at transcription time), never the gs:// reference. + seen["audio_path"] = audio_path + seen["existed_during_transcribe"] = os.path.exists(audio_path) + return _FakeTranscriptResult() + + import mostlyright.weather.earnings.stt as sttmod + + monkeypatch.setattr(sttmod, "SttTranscriber", _FakeTranscriber) + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + monkeypatch.setenv("STT_AUDIO_PATH", "gs://earnings-audio-handoff-123/handoff/CHWY/evt-1.wav") + monkeypatch.setenv("AUDIO_HANDOFF_BUCKET", "earnings-audio-handoff-123") + monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + monkeypatch.setenv("STT_TIER", "small") + monkeypatch.setenv("STT_DEVICE", "cpu") + monkeypatch.delenv("EARNINGS_STREAMING_ENABLED", raising=False) + + assert stt_job.main() == 0 + assert downloaded["bucket"] == "earnings-audio-handoff-123" + assert downloaded["blob"] == "handoff/CHWY/evt-1.wav" + # The transcriber saw a LOCAL path (not the gs:// ref) and it existed. + assert not str(seen["audio_path"]).startswith("gs://") + assert seen["existed_during_transcribe"] is True + # The transient temp file is cleaned up after transcription (D-27.9). 
+ assert not os.path.exists(downloaded["local_path"]) + + from mostlyright.weather.earnings.ledger import TranscriptLedger + + rows = TranscriptLedger().read("CHWY", "evt-1") + assert len(rows) == 2 + + +def test_stt_transcribe_call_bare_key_resolves_against_handoff_bucket( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """A bare object key (not gs://) resolves against handoff_bucket + downloads.""" + cache = tmp_path / "cache" + + class _FakeBlob: + def __init__(self, name: str) -> None: + self._name = name + + def download_to_filename(self, local_path: str) -> None: + with open(local_path, "wb") as fh: + fh.write(b"audio") + + seen: dict[str, object] = {} + + class _FakeBucket: + def __init__(self, name: str) -> None: + seen["bucket"] = name + + def blob(self, name: str) -> _FakeBlob: + seen["blob"] = name + return _FakeBlob(name) + + fake_storage = types.SimpleNamespace( + Client=lambda: types.SimpleNamespace(bucket=lambda name: _FakeBucket(name)) + ) + monkeypatch.setitem(sys.modules, "google.cloud.storage", fake_storage) + + import mostlyright.weather.earnings.stt as sttmod + + monkeypatch.setattr( + sttmod, + "SttTranscriber", + lambda *a, **k: types.SimpleNamespace( + transcribe=lambda audio_path, *, initial_prompt=None: _FakeTranscriptResult() + ), + ) + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + + out = stt_job.transcribe_call( + "handoff/CHWY/evt-2.wav", + ticker="CHWY", + call_id="evt-2", + tier="small", + device="cpu", + handoff_bucket="earnings-audio-handoff-123", + ) + assert seen["bucket"] == "earnings-audio-handoff-123" + assert seen["blob"] == "handoff/CHWY/evt-2.wav" + assert out["ticker"] == "CHWY" + + +def test_stt_main_live_publish_opt_in(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + cache = tmp_path / "cache" + audio = tmp_path / "audio.wav" + audio.write_bytes(b"fake") + + class _FakeTranscriber: + def __init__(self, *a, **k): + pass + + def transcribe(self, audio_path, *, initial_prompt=None): + return 
_FakeTranscriptResult() + + published: list[object] = [] + + class _FakePublisher: + def __init__(self, _callable): + pass + + def publish(self, call_id, item): + published.append(item) + + def publish_end_of_call(self, call_id): + published.append("EOC") + + import mostlyright.weather.earnings.stt as sttmod + + import services.earnings.pubsub_bridge as bridge + + monkeypatch.setattr(sttmod, "SttTranscriber", _FakeTranscriber) + monkeypatch.setattr(bridge, "SegmentPublisher", _FakePublisher) + monkeypatch.setattr(bridge, "build_publisher_client", lambda project, topic: object()) + + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + monkeypatch.setenv("STT_AUDIO_PATH", str(audio)) + monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + monkeypatch.setenv("EARNINGS_STREAMING_ENABLED", "true") + monkeypatch.setenv("EARNINGS_STREAMING_PROJECT", "mr-earnings-ingest") + + assert stt_job.main() == 0 + # Two segments + one end-of-call marker. + assert len(published) == 3 + assert published[-1] == "EOC" + + +# --------------------------------------------------------------------------- +# rolefact.main +# --------------------------------------------------------------------------- +def _seed_transcript(cache_dir, ticker: str, call_id: str, texts: list[str]) -> None: + from mostlyright.weather.earnings.ledger import TranscriptLedger + + rows = [ + { + "ticker": ticker, + "call_id": call_id, + "segment_index": i, + "segment": "batch", + "text": t, + "is_final": True, + "source": "earnings_call", + "delivery": "hosted", + } + for i, t in enumerate(texts) + ] + TranscriptLedger().append(rows, ticker=ticker, call_id=call_id) + + +def test_rolefact_main_builds_facts_no_audio(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + _seed_transcript(cache, "CHWY", "evt-1", ["we mentioned tariff twice", "tariff again here"]) + + 
monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-1") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + monkeypatch.delenv("ROLEFACT_R2_BUCKET", raising=False) + + assert rolefact_job.main() == 0 + + from mostlyright.weather.earnings.ledger import FactLedger + + fl = FactLedger() + facts = fl.read("CHWY", "evt-1") + # 'tariff' appears once per segment -> two occurrences across the two segments. + assert len(facts) == 2 + assert all(f["term_canonical"] == "tariff" for f in facts) + # No audio field on the fact ledger. + assert not any("audio" in col.lower() for col in fl.column_names()) + assert all(not any("audio" in str(k).lower() for k in f) for f in facts) + + +def test_rolefact_main_fails_loud_on_missing_transcript( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(tmp_path / "cache")) + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-missing") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + with pytest.raises(RuntimeError, match="no persisted transcript"): + rolefact_job.main() + + +def test_rolefact_main_fails_loud_on_missing_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("ROLEFACT_TICKER", raising=False) + with pytest.raises(ValueError, match="ROLEFACT_TICKER"): + rolefact_job.main() + + +def test_rolefact_main_bad_terms_json(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(tmp_path / "cache")) + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-1") + monkeypatch.setenv("ROLEFACT_TERMS", "not-json") + with pytest.raises(ValueError, match="ROLEFACT_TERMS is not valid JSON"): + rolefact_job.main() + + +def test_rolefact_main_r2_upload_opt_in_uploads_facts_only( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + cache = tmp_path / "cache" + 
monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + _seed_transcript(cache, "CHWY", "evt-1", ["tariff here"]) + + uploaded: dict[str, object] = {} + + # Resolve the sink module via importlib (the dotted-path resolver) rather than + # an ``import a.b.c as x`` statement — the latter trips a namespace-package + # resolution quirk on ``mostlyright.weather.satellite``. rolefact imports + # ``upload`` from this module at call time, so patching the attribute here is + # what the job's lazy ``from ... import upload`` picks up. + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _fake_upload(local_path, bucket, key, *, r2_target=None): + uploaded["local_path"] = str(local_path) + uploaded["bucket"] = bucket + uploaded["key"] = key + return key + + monkeypatch.setattr(sink, "upload", _fake_upload) + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-1") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + monkeypatch.setenv("ROLEFACT_R2_BUCKET", "mostlyright-derived") + + assert rolefact_job.main() == 0 + # Only the derived FACT parquet is uploaded (never audio). 
+ assert uploaded["key"] == "earnings/facts/CHWY/evt-1.parquet" + assert uploaded["local_path"].endswith("evt-1.parquet") + assert "audio" not in uploaded["local_path"].lower() + + +def test_rolefact_main_uploads_via_infra_r2_bucket_env( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """The infra env name R2_BUCKET (not ROLEFACT_R2_BUCKET) enables + targets the upload.""" + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + _seed_transcript(cache, "CHWY", "evt-9", ["tariff here"]) + + uploaded: dict[str, object] = {} + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _fake_upload(local_path, bucket, key, *, r2_target=None): + uploaded["bucket"] = bucket + uploaded["key"] = key + return key + + monkeypatch.setattr(sink, "upload", _fake_upload) + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-9") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + # infra sets R2_BUCKET (NOT ROLEFACT_R2_BUCKET) on the rolefact Cloud Run Job. + monkeypatch.delenv("ROLEFACT_R2_BUCKET", raising=False) + monkeypatch.setenv("R2_BUCKET", "mostlyright-derived") + + assert rolefact_job.main() == 0 + assert uploaded["bucket"] == "mostlyright-derived" + assert uploaded["key"] == "earnings/facts/CHWY/evt-9.parquet" + + +# --------------------------------------------------------------------------- +# Round-7 P1 fixes: +# (1) capture->STT trigger must wait out a full transcription (not 60s) AND +# stay under the Pub/Sub lease. +# (2) the transcript must be durable across the SEPARATE STT / role-fact +# Cloud Run containers (via the R2 data plane), not just local disk. 
+# --------------------------------------------------------------------------- +class _OkPost: + status_code = 200 + + def raise_for_status(self): + return None + + +def test_capture_stt_trigger_uses_long_default_timeout( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """The synchronous STT trigger waits out a full transcription (default 3600s) — + the old 60s cap timed out mid-transcription → NACK → duplicate recapture.""" + seen: dict[str, object] = {} + + def _record_post(url, *, json, headers, timeout): + seen["timeout"] = timeout + return _OkPost() + + out_dir = tmp_path / "cap" + out_dir.mkdir() + _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_record_post) + monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app") + monkeypatch.delenv("STT_TRIGGER_TIMEOUT_SECONDS", raising=False) + + assert capture_job.main() == 0 + assert seen["timeout"] == 3600.0 + + +def test_capture_stt_trigger_timeout_env_override( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + seen: dict[str, object] = {} + + def _record_post(url, *, json, headers, timeout): + seen["timeout"] = timeout + return _OkPost() + + out_dir = tmp_path / "cap" + out_dir.mkdir() + _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_record_post) + monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app") + monkeypatch.setenv("STT_TRIGGER_TIMEOUT_SECONDS", "1200") + + assert capture_job.main() == 0 + assert seen["timeout"] == 1200.0 + + +def test_capture_stt_trigger_bad_timeout_fails_loud( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + def _unused_post(url, *, json, headers, timeout): # pragma: no cover - never reached + raise AssertionError("must not POST with an invalid timeout") + + out_dir = tmp_path / "cap" + out_dir.mkdir() + record = _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_unused_post) + monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app") + monkeypatch.setenv("STT_TRIGGER_TIMEOUT_SECONDS", "notanumber") + + 
with pytest.raises(ValueError, match="STT_TRIGGER_TIMEOUT_SECONDS"): + capture_job.main() + assert record["acked"] is False + + +def test_capture_holds_lease_during_stt_trigger(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + """The lease is extended WHILE the (slow) STT trigger runs. Capture is instant + here, so extensions beyond the immediate one prove the lease covers the + trigger — previously the trigger ran AFTER the lease released (Codex R7 P1).""" + import time as _time + + def _slow_post(url, *, json, headers, timeout): + # Block long enough for the tiny-interval lease loop to tick during the + # trigger (capture itself returns immediately in the helper's fake adapter). + _time.sleep(0.25) + return _OkPost() + + out_dir = tmp_path / "cap" + out_dir.mkdir() + record = _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_slow_post) + # interval = 600 * (0.01/600) = 0.01s — many ticks across the 0.25s trigger. + monkeypatch.setattr(capture_job, "_LEASE_REFRESH_FRACTION", 0.01 / 600) + monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app") + + assert capture_job.main() == 0 + # >=2: the immediate extension at lease-start PLUS at least one during the + # trigger (instant capture leaves no other window for repeated extensions). 
+ assert len(record["lease_extensions"]) >= 2 # type: ignore[arg-type] + assert record["acked"] is True + + +def test_stt_publishes_transcript_to_r2_when_bucket_set( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """STT publishes the durable TEXT transcript parquet to R2 (never audio) so the + SEPARATE role/fact Job can read it across containers (Codex R7 P1).""" + cache = tmp_path / "cache" + audio = tmp_path / "audio.wav" + audio.write_bytes(b"RIFF-fake-not-real-audio") + + uploaded: dict[str, object] = {} + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _fake_upload(local_path, bucket, key, *, r2_target=None): + uploaded["local_path"] = str(local_path) + uploaded["bucket"] = bucket + uploaded["key"] = key + return key + + monkeypatch.setattr(sink, "upload", _fake_upload) + + class _FakeTranscriber: + def __init__(self, *a, **k): + pass + + def transcribe(self, audio_path, *, initial_prompt=None): + return _FakeTranscriptResult() + + import mostlyright.weather.earnings.stt as sttmod + + monkeypatch.setattr(sttmod, "SttTranscriber", _FakeTranscriber) + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + monkeypatch.setenv("STT_AUDIO_PATH", str(audio)) + monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + monkeypatch.setenv("STT_TIER", "small") + monkeypatch.setenv("STT_DEVICE", "cpu") + monkeypatch.delenv("EARNINGS_STREAMING_ENABLED", raising=False) + monkeypatch.setenv("R2_BUCKET", "mostlyright-derived") + + assert stt_job.main() == 0 + assert uploaded["bucket"] == "mostlyright-derived" + assert uploaded["key"] == "earnings/transcripts/CHWY/evt-1.parquet" + assert uploaded["local_path"].endswith("evt-1.parquet") + # Only the TEXT transcript parquet crosses to R2 — never an audio path. 
+ assert "audio" not in uploaded["local_path"].lower() + + +def test_stt_no_r2_publish_when_bucket_unset(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + """No R2 bucket → the transcript stays local (byte-identical to pre-R7).""" + cache = tmp_path / "cache" + audio = tmp_path / "audio.wav" + audio.write_bytes(b"RIFF-fake-not-real-audio") + + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _boom_upload(*a, **k): # pragma: no cover - must not be called + raise AssertionError("no R2 publish when R2_BUCKET is unset") + + monkeypatch.setattr(sink, "upload", _boom_upload) + + class _FakeTranscriber: + def __init__(self, *a, **k): + pass + + def transcribe(self, audio_path, *, initial_prompt=None): + return _FakeTranscriptResult() + + import mostlyright.weather.earnings.stt as sttmod + + monkeypatch.setattr(sttmod, "SttTranscriber", _FakeTranscriber) + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + monkeypatch.setenv("STT_AUDIO_PATH", str(audio)) + monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + monkeypatch.setenv("STT_TIER", "small") + monkeypatch.setenv("STT_DEVICE", "cpu") + monkeypatch.delenv("EARNINGS_STREAMING_ENABLED", raising=False) + monkeypatch.delenv("R2_BUCKET", raising=False) + + assert stt_job.main() == 0 + + +def test_rolefact_downloads_transcript_from_r2_on_local_miss( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """rolefact rehydrates STT's transcript from R2 when THIS container's local + cache is empty (separate Cloud Run containers, Codex R7 P1).""" + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + # NB: NO local seed — the transcript lives only in R2 (a fresh container). 
+ + downloaded: dict[str, object] = {} + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _fake_download(bucket, key, local_path, *, r2_target=None): + downloaded["bucket"] = bucket + downloaded["key"] = key + # Simulate the object landing in this container's ledger cache. + _seed_transcript(cache, "CHWY", "evt-2", ["tariff here"]) + return str(local_path) + + def _fake_upload(local_path, bucket, key, *, r2_target=None): + downloaded.setdefault("fact_uploads", []).append(key) # type: ignore[union-attr] + return key + + monkeypatch.setattr(sink, "download", _fake_download) + monkeypatch.setattr(sink, "upload", _fake_upload) # R2_BUCKET also enables fact upload + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-2") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + monkeypatch.delenv("ROLEFACT_R2_BUCKET", raising=False) + monkeypatch.setenv("R2_BUCKET", "mostlyright-derived") + + assert rolefact_job.main() == 0 + assert downloaded["bucket"] == "mostlyright-derived" + assert downloaded["key"] == "earnings/transcripts/CHWY/evt-2.parquet" + + from mostlyright.weather.earnings.ledger import FactLedger + + facts = FactLedger().read("CHWY", "evt-2") + assert len(facts) == 1 + assert facts[0]["term_canonical"] == "tariff" + + +def test_rolefact_r2_download_miss_still_fails_loud( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """A missing R2 transcript is logged, not swallowed — rolefact still fails loud + on the resulting empty read (the correct 'STT must run first' signal).""" + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _raise_download(bucket, key, local_path, *, r2_target=None): + raise RuntimeError("NoSuchKey") + + monkeypatch.setattr(sink, "download", _raise_download) + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + 
monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-absent") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + monkeypatch.delenv("ROLEFACT_R2_BUCKET", raising=False) + monkeypatch.setenv("R2_BUCKET", "mostlyright-derived") + + with pytest.raises(RuntimeError, match="no persisted transcript"): + rolefact_job.main() + + +def test_stt_rerun_is_idempotent_not_doubled(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + """Re-running STT for the SAME call overwrites (replaces) the transcript — a + redelivery/retry must not double the segments (Codex R7-2 P1).""" + cache = tmp_path / "cache" + audio = tmp_path / "audio.wav" + audio.write_bytes(b"RIFF-fake-not-real-audio") + + class _FakeTranscriber: + def __init__(self, *a, **k): + pass + + def transcribe(self, audio_path, *, initial_prompt=None): + return _FakeTranscriptResult() # two segments + + import mostlyright.weather.earnings.stt as sttmod + + monkeypatch.setattr(sttmod, "SttTranscriber", _FakeTranscriber) + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + monkeypatch.setenv("STT_AUDIO_PATH", str(audio)) + monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + monkeypatch.setenv("STT_TIER", "small") + monkeypatch.setenv("STT_DEVICE", "cpu") + monkeypatch.delenv("EARNINGS_STREAMING_ENABLED", raising=False) + monkeypatch.delenv("R2_BUCKET", raising=False) + + assert stt_job.main() == 0 + assert stt_job.main() == 0 # retry / redelivery + + from mostlyright.weather.earnings.ledger import TranscriptLedger + + rows = TranscriptLedger().read("CHWY", "evt-1") + assert len(rows) == 2 # NOT 4 — the second run replaced, not appended + + +def test_rolefact_rerun_is_idempotent_not_doubled( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """Re-running role/fact for the SAME call replaces the fact partition — a + redelivery/retry must not double the fact rows (Codex R7-2 P1).""" + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", 
str(cache)) + _seed_transcript(cache, "CHWY", "evt-1", ["we mentioned tariff twice", "tariff again here"]) + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-1") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + monkeypatch.delenv("ROLEFACT_R2_BUCKET", raising=False) + monkeypatch.delenv("R2_BUCKET", raising=False) + + assert rolefact_job.main() == 0 + assert rolefact_job.main() == 0 # retry / redelivery + + from mostlyright.weather.earnings.ledger import FactLedger + + facts = FactLedger().read("CHWY", "evt-1") + # 'tariff' occurs once per segment -> 2 facts; a second run must not make it 4. + assert len(facts) == 2 + + +def test_rolefact_zero_facts_deletes_stale_r2_object( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """A zero-mention rerun clears the local partition AND deletes the stale R2 fact + object, so serving stops serving facts the replace cleared (Codex R7-3 P2).""" + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + _seed_transcript(cache, "CHWY", "evt-3", ["nothing relevant is said here"]) + + deleted: dict[str, object] = {} + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _fake_delete(bucket, key, *, r2_target=None): + deleted["bucket"] = bucket + deleted["key"] = key + + def _boom_upload(*a, **k): # pragma: no cover - zero facts => must not upload + raise AssertionError("must not upload when there are zero facts") + + monkeypatch.setattr(sink, "delete", _fake_delete) + monkeypatch.setattr(sink, "upload", _boom_upload) + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-3") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') # never mentioned + monkeypatch.delenv("ROLEFACT_R2_BUCKET", raising=False) + monkeypatch.setenv("R2_BUCKET", "mostlyright-derived") + + assert rolefact_job.main() == 0 + # The stale R2 fact object was tombstoned 
(not left behind). + assert deleted["bucket"] == "mostlyright-derived" + assert deleted["key"] == "earnings/facts/CHWY/evt-3.parquet" + + from mostlyright.weather.earnings.ledger import FactLedger + + assert FactLedger().read("CHWY", "evt-3") == [] + + +def test_capture_subscription_without_handoff_bucket_fails_loud( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """A SUBSCRIPTION-pulled capture with AUDIO_HANDOFF_BUCKET unset fails loud and + does NOT ack — else the audio is orphaned with no redelivery (Codex R7-5 P1).""" + + def _unused_post(url, *, json, headers, timeout): # pragma: no cover - never reached + raise AssertionError("must not reach the STT trigger") + + out_dir = tmp_path / "cap" + out_dir.mkdir() + record = _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_unused_post) + # Deploy misconfiguration: a real subscription pull but no handoff bucket. + monkeypatch.delenv("AUDIO_HANDOFF_BUCKET", raising=False) + monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app") + + with pytest.raises(RuntimeError, match="AUDIO_HANDOFF_BUCKET is unset"): + capture_job.main() + # The message was NOT acked (Pub/Sub redelivers once the env is fixed). + assert record["acked"] is False diff --git a/services/earnings/tests/test_serving_r2_read.py b/services/earnings/tests/test_serving_r2_read.py new file mode 100644 index 0000000..c66faa1 --- /dev/null +++ b/services/earnings/tests/test_serving_r2_read.py @@ -0,0 +1,168 @@ +"""Serving reads the durable R2 corpus the ingest jobs wrote (Codex R7-4 P1). + +STT / role-fact publish transcript + fact parquet to R2; the serving app runs in a +SEPARATE Cloud Run container with an empty local disk, so it must read those from +R2 (else /transcripts and /facts return empty after ingest succeeded). 
These tests +prove: the ``EarningsR2Reader`` parses R2 objects + distinguishes a genuine miss +from a real error; ``ServingState`` picks the R2 source only when the read token is +present and no explicit ledger_root is given; and the routes serve R2 rows. +""" + +from __future__ import annotations + +import io + +import pyarrow as pa +import pyarrow.parquet as pq +import pytest +from fastapi.testclient import TestClient + +from services.earnings.app import create_app +from services.earnings.deps import ServingState +from services.earnings.r2_read import EarningsR2Reader, R2LedgerSource + + +def _parquet_bytes(rows: list[dict]) -> bytes: + buf = io.BytesIO() + pq.write_table(pa.Table.from_pylist(rows), buf) + return buf.getvalue() + + +class _FakeBody: + def __init__(self, data: bytes) -> None: + self._data = data + + def read(self) -> bytes: + return self._data + + +class _FakeS3: + """A minimal in-memory S3/R2 stand-in: {key: parquet-bytes}.""" + + def __init__(self, objects: dict[str, bytes]) -> None: + self._objects = objects + + def get_object(self, *, Bucket, Key): + if Key not in self._objects: + from botocore.exceptions import ClientError + + raise ClientError({"Error": {"Code": "NoSuchKey"}}, "GetObject") + return {"Body": _FakeBody(self._objects[Key])} + + def list_objects_v2(self, *, Bucket, Prefix, Delimiter=None, ContinuationToken=None): + keys = [k for k in self._objects if k.startswith(Prefix)] + if Delimiter: + common = set() + for k in keys: + tail = k[len(Prefix) :] + if Delimiter in tail: + common.add(Prefix + tail.split(Delimiter, 1)[0] + Delimiter) + return {"CommonPrefixes": [{"Prefix": p} for p in sorted(common)], "IsTruncated": False} + return {"Contents": [{"Key": k} for k in sorted(keys)], "IsTruncated": False} + + +def _reader_over(objects: dict[str, bytes]) -> EarningsR2Reader: + reader = EarningsR2Reader(bucket="mostlyright-derived") + reader._client = _FakeS3(objects) # bypass boto3 (no creds needed) + return reader + + +def 
test_reader_read_partition_parses_rows() -> None: + rows = [{"ticker": "GIS", "call_id": "GIS-Q3", "segment_index": 0, "text": "hello"}] + reader = _reader_over({"earnings/transcripts/GIS/GIS-Q3.parquet": _parquet_bytes(rows)}) + got = reader.read_partition("transcripts", "GIS", "GIS-Q3") + assert got == rows + + +def test_reader_missing_object_is_empty_not_error() -> None: + reader = _reader_over({}) + # NoSuchKey -> [] (nothing ingested yet), never a raise. + assert reader.read_partition("facts", "GIS", "NOPE") == [] + + +def test_reader_real_error_propagates() -> None: + class _BoomS3: + def get_object(self, *, Bucket, Key): + from botocore.exceptions import ClientError + + raise ClientError({"Error": {"Code": "AccessDenied"}}, "GetObject") + + reader = EarningsR2Reader() + reader._client = _BoomS3() + with pytest.raises(Exception): # noqa: B017 - a non-NoSuchKey error must NOT be silently empty + reader.read_partition("facts", "GIS", "GIS-Q3") + + +def test_reader_lists_tickers_and_call_ids() -> None: + objects = { + "earnings/facts/GIS/GIS-Q3.parquet": _parquet_bytes([{"ticker": "GIS"}]), + "earnings/facts/GIS/GIS-Q2.parquet": _parquet_bytes([{"ticker": "GIS"}]), + "earnings/facts/ORCL/ORCL-Q1.parquet": _parquet_bytes([{"ticker": "ORCL"}]), + } + reader = _reader_over(objects) + assert reader.list_tickers("facts") == ["GIS", "ORCL"] + assert reader.list_call_ids("facts", "GIS") == ["GIS-Q2", "GIS-Q3"] + + +def test_reader_rejects_unsafe_segment() -> None: + reader = _reader_over({}) + with pytest.raises(ValueError, match="unsafe"): + reader.read_partition("facts", "../secret", "x") + + +def test_serving_state_uses_r2_when_read_token_present(monkeypatch) -> None: + monkeypatch.setenv("R2_ACCESS_KEY_ID", "read-key") + monkeypatch.setenv("R2_SECRET_ACCESS_KEY", "read-secret") + monkeypatch.setenv("R2_ACCOUNT_ID", "acct") + state = ServingState.build(ledger_root=None) + assert isinstance(state.transcripts, R2LedgerSource) + assert isinstance(state.facts, 
R2LedgerSource) + + +def test_serving_state_local_when_ledger_root_given(monkeypatch, tmp_path) -> None: + # An explicit ledger_root ALWAYS wins (tests / on-device), even with R2 env set. + monkeypatch.setenv("R2_ACCESS_KEY_ID", "read-key") + state = ServingState.build(ledger_root=tmp_path) + assert not isinstance(state.transcripts, R2LedgerSource) + assert not isinstance(state.facts, R2LedgerSource) + + +def test_serving_state_local_when_no_r2_token(monkeypatch) -> None: + monkeypatch.delenv("R2_ACCESS_KEY_ID", raising=False) + state = ServingState.build(ledger_root=None) + assert not isinstance(state.transcripts, R2LedgerSource) + + +def test_routes_serve_r2_data(monkeypatch) -> None: + """/transcripts + /facts return the rows read from R2 (deployed serving path).""" + transcript_rows = [ + { + "ticker": "GIS", + "call_id": "GIS-Q3", + "segment_index": 0, + "text": "hi", + "delivery": "hosted", + } + ] + fact_rows = [{"ticker": "GIS", "call_id": "GIS-Q3", "term_canonical": "AI", "mention_count": 1}] + objects = { + "earnings/transcripts/GIS/GIS-Q3.parquet": _parquet_bytes(transcript_rows), + "earnings/facts/GIS/GIS-Q3.parquet": _parquet_bytes(fact_rows), + } + + import services.earnings.deps as deps + + # Force the R2 branch (no ledger_root) with a fake reader over the objects. 
+ monkeypatch.setenv("R2_ACCESS_KEY_ID", "read-key") + monkeypatch.setattr(deps, "EarningsR2Reader", lambda *a, **k: _reader_over(objects)) + + app = create_app(api_key=None) # keyless in-process; no ledger_root -> R2 path + client = TestClient(app) + + r = client.get("/transcripts", params={"ticker": "GIS", "call_id": "GIS-Q3"}) + assert r.status_code == 200 + assert r.json() == transcript_rows + + r = client.get("/facts", params={"ticker": "GIS"}) + assert r.status_code == 200 + assert r.json() == fact_rows diff --git a/services/earnings/tests/test_stt_handoff_delete.py b/services/earnings/tests/test_stt_handoff_delete.py new file mode 100644 index 0000000..9825dbb --- /dev/null +++ b/services/earnings/tests/test_stt_handoff_delete.py @@ -0,0 +1,106 @@ +"""Phase 28 (28-11) — the STT handoff-object delete ordering (Codex R4 P1). + +The transient source audio object in AUDIO_HANDOFF_BUCKET must be deleted ONLY +AFTER the transcript is durably written to the ledger — never before. Deleting it +at transcription time would strand a call whose ledger write then fails with no +retryable audio. These tests fake google-cloud-storage + the transcriber + the +ledger to assert the ordering and the skip-on-failure behavior. 
+""" + +from __future__ import annotations + +import sys +import types + +import pytest + + +class _FakeSegments: + def __init__(self) -> None: + self.segments = [{"text": "hello", "start": 0.0}] + self.language = "en" + self.duration = 1.0 + + +def _install_fakes(monkeypatch, events: list[str], *, ledger_raises: bool) -> dict: + """Wire fake storage + transcriber + ledger that record an ordered event log.""" + state: dict = {"deleted": False} + + class _FakeBlob: + def __init__(self, name: str) -> None: + self._name = name + + def download_to_filename(self, local_path: str) -> None: + events.append("download") + with open(local_path, "wb") as fh: + fh.write(b"fake-audio") + + def delete(self) -> None: + events.append("delete") + state["deleted"] = True + + class _FakeBucket: + def blob(self, name: str) -> _FakeBlob: + return _FakeBlob(name) + + class _FakeClient: + def bucket(self, name: str) -> _FakeBucket: + return _FakeBucket() + + monkeypatch.setitem( + sys.modules, "google.cloud.storage", types.SimpleNamespace(Client=lambda: _FakeClient()) + ) + + import mostlyright.weather.earnings.stt as engine_stt + + class _FakeTranscriber: + def __init__(self, *a, **k) -> None: ... + def transcribe(self, path, *, initial_prompt=None): + events.append("transcribe") + return _FakeSegments() + + monkeypatch.setattr(engine_stt, "SttTranscriber", _FakeTranscriber) + + import mostlyright.weather.earnings.ledger as engine_ledger + + class _FakeLedger: + # STT writes the COMPLETE transcript via the idempotent ``replace`` (not + # ``append``) so a retry does not double rows (Codex R7-2 P1). 
+ def replace(self, rows, *, ticker, call_id): + events.append("ledger") + if ledger_raises: + raise RuntimeError("ledger write failed") + return len(rows) + + monkeypatch.setattr(engine_ledger, "TranscriptLedger", _FakeLedger) + return state + + +def test_handoff_deleted_after_ledger_write(monkeypatch) -> None: + from services.earnings.jobs.stt import transcribe_call + + events: list[str] = [] + state = _install_fakes(monkeypatch, events, ledger_raises=False) + + out = transcribe_call( + "gs://handoff-bkt/handoff/CHWY/evt-1.wav", ticker="CHWY", call_id="evt-1", device="cpu" + ) + assert out["segments"] == 1 + # The source object was deleted, and ONLY after the ledger write. + assert state["deleted"] is True + assert events == ["download", "transcribe", "ledger", "delete"] + + +def test_handoff_kept_when_ledger_write_fails(monkeypatch) -> None: + from services.earnings.jobs.stt import transcribe_call + + events: list[str] = [] + state = _install_fakes(monkeypatch, events, ledger_raises=True) + + with pytest.raises(RuntimeError, match="ledger write failed"): + transcribe_call( + "gs://handoff-bkt/handoff/CHWY/evt-2.wav", ticker="CHWY", call_id="evt-2", device="cpu" + ) + # The ledger write failed, so the source audio is KEPT for a retry (NOT deleted). + assert state["deleted"] is False + assert "delete" not in events diff --git a/services/earnings/tests/test_stt_server.py b/services/earnings/tests/test_stt_server.py new file mode 100644 index 0000000..12854df --- /dev/null +++ b/services/earnings/tests/test_stt_server.py @@ -0,0 +1,130 @@ +"""Phase 28 (28-11): the STT Cloud Run SERVICE HTTP surface. + +The STT image is deployed as a Cloud Run service (must serve $PORT), so it exposes +GET /healthz (ready without a GPU model load) + POST /transcribe (wraps the shipped +transcriber). Audio is never returned. faster-whisper stays lazy — importing the +server + answering /healthz must not load it. 
+""" + +from __future__ import annotations + +import sys + +from fastapi.testclient import TestClient + + +def test_import_and_healthz_do_not_load_whisper(monkeypatch) -> None: + # Poison the heavy audio deps: importing the server + hitting /healthz must not + # touch them (they are lazy-loaded only inside transcribe_call on a real call). + for mod in ("faster_whisper", "ctranslate2", "av"): + monkeypatch.setitem(sys.modules, mod, None) + from services.earnings.jobs.stt_server import app + + client = TestClient(app) + resp = client.get("/healthz") + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + + +def test_transcribe_missing_field_returns_400() -> None: + from services.earnings.jobs.stt_server import app + + client = TestClient(app) + resp = client.post("/transcribe", json={"ticker": "GIS"}) # no audio_path/call_id + assert resp.status_code == 400 + assert "missing required field" in resp.json()["detail"] + + +def test_transcribe_delegates_to_transcribe_call(monkeypatch) -> None: + import services.earnings.jobs.stt_server as server + + captured: dict = {} + + def _fake_transcribe_call(audio_path, **kwargs): + captured["audio_path"] = audio_path + captured.update(kwargs) + return {"ticker": kwargs["ticker"], "call_id": kwargs["call_id"], "segments": 3} + + monkeypatch.setattr(server, "transcribe_call", _fake_transcribe_call) + client = TestClient(server.app) + resp = client.post( + "/transcribe", + json={"audio_path": "/tmp/a.wav", "ticker": "GIS", "call_id": "c1", "tier": "small"}, + ) + assert resp.status_code == 200 + body = resp.json() + assert body == {"ticker": "GIS", "call_id": "c1", "segments": 3} + # No audio field leaks in the response (D-27.9). 
+ assert not any("audio" in k.lower() for k in body) + assert captured["audio_path"] == "/tmp/a.wav" + assert captured["tier"] == "small" + + +def test_transcribe_derives_streaming_from_env(monkeypatch) -> None: + """The STT SERVICE derives publish_live/project/topic from its env — the deployed + capture->STT trigger posts none of them, so without this the serving /stream + subscriber never gets events for hosted calls (Codex R7-6 P2).""" + import services.earnings.jobs.stt_server as server + + captured: dict = {} + + def _fake_transcribe_call(audio_path, **kwargs): + captured.update(kwargs) + return {"ticker": kwargs["ticker"], "call_id": kwargs["call_id"], "segments": 0} + + monkeypatch.setattr(server, "transcribe_call", _fake_transcribe_call) + monkeypatch.setenv("EARNINGS_STREAMING_ENABLED", "1") + monkeypatch.setenv("EARNINGS_STREAMING_PROJECT", "mr-ingest") + monkeypatch.setenv("EARNINGS_STREAMING_TOPIC", "earnings-streaming") + + client = TestClient(server.app) + resp = client.post( + "/transcribe", json={"audio_path": "/tmp/a.wav", "ticker": "GIS", "call_id": "c1"} + ) + assert resp.status_code == 200 + assert captured["publish_live"] is True + assert captured["streaming_project"] == "mr-ingest" + assert captured["streaming_topic"] == "earnings-streaming" + + +def test_transcribe_no_streaming_env_defaults_off(monkeypatch) -> None: + """No streaming env -> publish_live stays off (unchanged local/default behavior).""" + import services.earnings.jobs.stt_server as server + + captured: dict = {} + + def _fake_transcribe_call(audio_path, **kwargs): + captured.update(kwargs) + return {"ticker": "GIS", "call_id": "c1", "segments": 0} + + monkeypatch.setattr(server, "transcribe_call", _fake_transcribe_call) + monkeypatch.delenv("EARNINGS_STREAMING_ENABLED", raising=False) + + client = TestClient(server.app) + resp = client.post( + "/transcribe", json={"audio_path": "/tmp/a.wav", "ticker": "GIS", "call_id": "c1"} + ) + assert resp.status_code == 200 + assert 
captured["publish_live"] is False + + +def test_transcribe_request_overrides_env_streaming(monkeypatch) -> None: + """An explicit request field still overrides the env-derived default.""" + import services.earnings.jobs.stt_server as server + + captured: dict = {} + + def _fake_transcribe_call(audio_path, **kwargs): + captured.update(kwargs) + return {"ticker": "GIS", "call_id": "c1", "segments": 0} + + monkeypatch.setattr(server, "transcribe_call", _fake_transcribe_call) + monkeypatch.setenv("EARNINGS_STREAMING_ENABLED", "1") + + client = TestClient(server.app) + resp = client.post( + "/transcribe", + json={"audio_path": "/tmp/a.wav", "ticker": "GIS", "call_id": "c1", "publish_live": False}, + ) + assert resp.status_code == 200 + assert captured["publish_live"] is False diff --git a/services/weather/app.py b/services/weather/app.py index 7450e28..3507cc0 100644 --- a/services/weather/app.py +++ b/services/weather/app.py @@ -34,7 +34,7 @@ from fastapi import FastAPI from starlette.middleware.cors import CORSMiddleware -from . import routes +from . import health, routes from .deps import SatelliteReadSource, ServingState from .middleware.auth import API_KEY_ENV, ApiKeyAuthMiddleware from .middleware.ceiling import ( @@ -187,6 +187,9 @@ def create_app( ) app.state.serving = ServingState.build(source=source) + # /healthz is unauthenticated (exempted in every middleware) — the Cloud Run + # HTTP probe idiom. Registered first so it is always present. + app.include_router(health.router, tags=["health"]) app.include_router(routes.router, tags=["satellite"]) resolved_key = _resolve_env_key() if api_key is _UNSET else api_key diff --git a/services/weather/health.py b/services/weather/health.py new file mode 100644 index 0000000..7b11414 --- /dev/null +++ b/services/weather/health.py @@ -0,0 +1,28 @@ +"""``GET /healthz`` — the unauthenticated container health probe (Phase 28, 28-30). 
+ +Cloud Run (and any HTTP uptime check) polls this to decide whether the +weather-serving instance is live. It MUST NOT be gated by the API-key auth (a +probe cannot present the key) nor consume a rate-limit / global-ceiling token (a +probe throttled to 429 would make Cloud Run kill a healthy instance). The path +``/healthz`` is therefore exempted at the TOP of every middleware's ``dispatch`` +(auth + ratelimit + ceiling) — see ``middleware/``. + +The response is a static ``{"status": "ok"}`` — it touches NO R2 state (a health +probe must not depend on the read token / a bucket round-trip that could make a +serving-capable instance report unhealthy). +""" + +from __future__ import annotations + +from fastapi import APIRouter + +router = APIRouter() + + +@router.get("/healthz", summary="Liveness probe (unauthenticated, no R2 touch)") +def healthz() -> dict[str, str]: + """Return a static liveness token — no auth, no R2 round-trip.""" + return {"status": "ok"} + + +__all__ = ["router"] diff --git a/services/weather/middleware/auth.py b/services/weather/middleware/auth.py index 6a89976..0327572 100644 --- a/services/weather/middleware/auth.py +++ b/services/weather/middleware/auth.py @@ -65,6 +65,10 @@ def __init__(self, app: object, *, expected_key: str | None) -> None: async def dispatch( self, request: Request, call_next: Callable[[Request], Awaitable[Response]] ) -> Response: + # /healthz is the unauthenticated Cloud Run liveness probe — it cannot + # present the API key, so it bypasses the gate before the key check. + if request.url.path.rstrip("/") == "/healthz": + return await call_next(request) if self._expected_key is None: # Keyless local/dev mode — gate open. 
return await call_next(request) diff --git a/services/weather/middleware/ceiling.py b/services/weather/middleware/ceiling.py index e6f2e9f..6e9ad1e 100644 --- a/services/weather/middleware/ceiling.py +++ b/services/weather/middleware/ceiling.py @@ -87,6 +87,11 @@ def _consume(self) -> bool: async def dispatch( self, request: Request, call_next: Callable[[Request], Awaitable[Response]] ) -> Response: + # /healthz is the Cloud Run liveness probe — it must not consume a global + # ceiling token, or a probe answered 429 would make Cloud Run kill a + # healthy instance. + if request.url.path.rstrip("/") == "/healthz": + return await call_next(request) if not self._consume(): return JSONResponse( status_code=429, diff --git a/services/weather/middleware/ratelimit.py b/services/weather/middleware/ratelimit.py index 0258af6..fd23ebc 100644 --- a/services/weather/middleware/ratelimit.py +++ b/services/weather/middleware/ratelimit.py @@ -149,6 +149,9 @@ def _consume(self, key: str) -> bool: async def dispatch( self, request: Request, call_next: Callable[[Request], Awaitable[Response]] ) -> Response: + # /healthz is the Cloud Run liveness probe — never throttle it. + if request.url.path.rstrip("/") == "/healthz": + return await call_next(request) if not self._consume(self._client_key(request)): return JSONResponse( status_code=429, diff --git a/services/weather/tests/test_healthz.py b/services/weather/tests/test_healthz.py new file mode 100644 index 0000000..fc4e7cb --- /dev/null +++ b/services/weather/tests/test_healthz.py @@ -0,0 +1,45 @@ +"""Phase 28 (28-30): /healthz probe on the weather serving app. + +/healthz is served UNAUTHENTICATED and is exempt from BOTH the per-key rate +limiter and the H4 global request ceiling (the Cloud Run probe idiom — a probe +answered 401/429 would make Cloud Run kill a healthy instance), while every +other route stays key-gated + ceiling-bounded. It touches no R2 state. 
+""" + +from __future__ import annotations + +from fastapi.testclient import TestClient + +from services.weather.app import create_app + +_KEY = "test-key-weather" + + +def test_healthz_ok_without_auth() -> None: + app = create_app(api_key=_KEY) + client = TestClient(app) + resp = client.get("/healthz") # no key header + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + + +def test_other_routes_still_401_without_key() -> None: + app = create_app(api_key=_KEY) + client = TestClient(app) + assert client.get("/capabilities").status_code == 401 + + +def test_healthz_exempt_from_per_key_rate_limit() -> None: + app = create_app(api_key=_KEY, rate_limit=1, rate_window_seconds=60.0) + client = TestClient(app) + for _ in range(5): + assert client.get("/healthz").status_code == 200 + + +def test_healthz_exempt_from_global_ceiling() -> None: + # A global ceiling of 1 request per 60 s window would 429 the 2nd request; /healthz + # must bypass it (H4 ceiling is for the public data surface, not the probe). + app = create_app(api_key=_KEY, global_limit=1, global_window_seconds=60.0) + client = TestClient(app) + for _ in range(5): + assert client.get("/healthz").status_code == 200