From 0c0788e20426e4bd395fceb5e61aa585b349c2f7 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 19:15:18 +0200 Subject: [PATCH 01/17] feat(28): /healthz probes + SegmentSubscriber lifespan wiring on the serving apps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unauthenticated /healthz on earnings + weather serving (exempt from auth + ratelimit + the weather global ceiling — the Cloud Run probe idiom). Wire the cross-project earnings-streaming SegmentSubscriber into the earnings app lifespan (opt-in via EARNINGS_STREAMING_SUBSCRIPTION; no-op default, H2 single-instance) via a registry-backed bus adapter so /stream can find the bus. --- services/earnings/app.py | 112 ++++++++++++++- services/earnings/middleware/auth.py | 5 + services/earnings/middleware/ratelimit.py | 4 + services/earnings/routes/health.py | 30 ++++ .../tests/test_healthz_and_lifespan.py | 130 ++++++++++++++++++ services/weather/app.py | 5 +- services/weather/health.py | 28 ++++ services/weather/middleware/auth.py | 4 + services/weather/middleware/ceiling.py | 5 + services/weather/middleware/ratelimit.py | 3 + services/weather/tests/test_healthz.py | 45 ++++++ 11 files changed, 369 insertions(+), 2 deletions(-) create mode 100644 services/earnings/routes/health.py create mode 100644 services/earnings/tests/test_healthz_and_lifespan.py create mode 100644 services/weather/health.py create mode 100644 services/weather/tests/test_healthz.py diff --git a/services/earnings/app.py b/services/earnings/app.py index acf1f9f..52dd40a 100644 --- a/services/earnings/app.py +++ b/services/earnings/app.py @@ -15,16 +15,37 @@ from __future__ import annotations +import asyncio +import contextlib +import logging import os import re +import threading from pathlib import Path +from typing import TYPE_CHECKING from fastapi import FastAPI from .deps import ServingState from .middleware.auth import API_KEY_ENV, ApiKeyAuthMiddleware from .middleware.ratelimit import 
TokenBucketRateLimitMiddleware -from .routes import capabilities, facts, stream, transcripts +from .routes import capabilities, facts, health, stream, transcripts + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + +_LOG = logging.getLogger("services.earnings.app") + +#: Env var naming the cross-project ``earnings-streaming`` Pub/Sub SUBSCRIPTION +#: the serving instance pulls live segments from (28-12 C2). When UNSET (the +#: default — including every test and a ledger-only serving deploy) the app runs +#: WITHOUT a subscriber: ``/stream`` 404s until the live pipeline is wired. When +#: SET, the lifespan starts the :class:`SegmentSubscriber` (H2: single instance). +STREAMING_SUBSCRIPTION_ENV = "EARNINGS_STREAMING_SUBSCRIPTION" + +#: Env var naming the ingest project that owns the earnings-streaming topic +#: (cross-project, C2). Falls back to ``GOOGLE_CLOUD_PROJECT`` (Cloud Run sets it). +INGEST_PROJECT_ENV = "EARNINGS_INGEST_PROJECT" #: Default per-client request budget + window for the public feed. Overridable #: per deploy; a burst beyond this returns 429 (T-27-27). @@ -105,6 +126,91 @@ def assert_no_audio_surface(app: FastAPI) -> None: ) +class _RegistryBusAdapter: + """Route :class:`SegmentSubscriber` republishes into the per-call bus registry. + + The subscriber expects a single ``SegmentBus``-shaped object + (``publish(call_id, item)`` / ``close(call_id)`` returning coroutines). In the + HOSTED serving topology there is no local capture runner to + :meth:`BusRegistry.register` a call's bus — the cross-project Pub/Sub + subscriber IS the bus producer. So this adapter ``get_or_create``s the call's + bus on first message, after which the ``/stream`` route (which uses + :meth:`BusRegistry.get`) can find it. Registry mutation is lock-guarded, so + the StreamingPull transport thread and the serving loop do not race. 
+ """ + + def __init__(self, registry: object) -> None: + self._registry = registry + + def publish(self, call_id: str, item: object) -> object: + return self._registry.get_or_create(call_id).publish(call_id, item) # type: ignore[attr-defined] + + def close(self, call_id: str) -> object: + return self._registry.get_or_create(call_id).close(call_id) # type: ignore[attr-defined] + + +@contextlib.asynccontextmanager +async def _streaming_lifespan(app: FastAPI) -> AsyncIterator[None]: + """Start/stop the cross-project earnings-streaming subscriber (28-12 C2, H2). + + OPT-IN: only when ``EARNINGS_STREAMING_SUBSCRIPTION`` is set. Absent it (the + default — every test + a ledger-only deploy) this is a no-op and app startup + is unchanged. When set, it records the serving event loop on the bus registry + and runs :meth:`SegmentSubscriber.consume` on a daemon thread so live + segments/facts from the ingest pipeline reach the ``/stream`` fan-out. + + **H2 (load-bearing).** The subscriber shares ONE Pub/Sub subscription; correct + fan-out requires EXACTLY ONE always-warm instance (``max-instances=1`` + + affinity, pinned in ``infra/cloud_run.tf``). Teardown relies on daemon-thread + process exit (Cloud Run always-warm single instance) — a graceful + StreamingPull cancel is a deferred hardening seam. + """ + subscription = os.environ.get(STREAMING_SUBSCRIPTION_ENV, "").strip() + if not subscription: + yield + return + project = ( + os.environ.get(INGEST_PROJECT_ENV) or os.environ.get("GOOGLE_CLOUD_PROJECT") or "" + ).strip() + if not project: + raise RuntimeError( + f"{STREAMING_SUBSCRIPTION_ENV} is set but no ingest project is configured — " + f"set {INGEST_PROJECT_ENV} (or GOOGLE_CLOUD_PROJECT) to the project that owns " + "the earnings-streaming topic (cross-project, C2)." + ) + # Lazy-import the GCP-SDK-backed bridge factories so importing this module (and + # the ledger-only default deploy) never needs google-cloud-pubsub. 
+ from .pubsub_bridge import ( + SegmentSubscriber, + build_streaming_pull, + make_run_coroutine_threadsafe, + ) + + registry = app.state.serving.buses + registry.serving_loop = asyncio.get_running_loop() + subscriber = SegmentSubscriber( + _RegistryBusAdapter(registry), # type: ignore[arg-type] + run_on_loop=make_run_coroutine_threadsafe(registry.serving_loop), + ) + streaming_pull = build_streaming_pull(project, subscription) + + def _run() -> None: + try: + subscriber.consume(streaming_pull) + except Exception: # pragma: no cover - transport-thread crash path + _LOG.exception("earnings-streaming subscriber thread exited unexpectedly") + + thread = threading.Thread(target=_run, name="earnings-streaming-subscriber", daemon=True) + thread.start() + _LOG.info("earnings-streaming subscriber started (subscription=%s)", subscription) + try: + yield + finally: + # Daemon thread: the always-warm single instance tears down on process + # exit (Cloud Run). A graceful StreamingPull cancel is a deferred seam. + _LOG.info("earnings-streaming subscriber shutdown (daemon thread will exit with process)") + + #: Sentinel so ``api_key`` can distinguish "not passed → read env" from an #: explicit ``None`` (keyless local/dev mode). _UNSET = object() @@ -184,9 +290,13 @@ def create_app( title="mostlyright earnings serving API", summary="Transcript + derived-fact serving (text/facts only — never audio).", version="0.1.0", + lifespan=_streaming_lifespan, ) app.state.serving = ServingState.build(ledger_root, stt_tier=stt_tier) + # /healthz is unauthenticated (exempted in the auth + ratelimit middleware) — + # the Cloud Run HTTP probe idiom. Registered first so it is always present. 
+ app.include_router(health.router, tags=["health"]) app.include_router(transcripts.router, tags=["transcripts"]) app.include_router(facts.router, tags=["facts"]) app.include_router(capabilities.router, tags=["capabilities"]) diff --git a/services/earnings/middleware/auth.py b/services/earnings/middleware/auth.py index 633e28e..251789d 100644 --- a/services/earnings/middleware/auth.py +++ b/services/earnings/middleware/auth.py @@ -86,6 +86,11 @@ def _stream_token_ok(self, request: Request) -> bool: async def dispatch( self, request: Request, call_next: Callable[[Request], Awaitable[Response]] ) -> Response: + # /healthz is the unauthenticated Cloud Run liveness probe — a health check + # cannot present the API key, so it bypasses the gate BEFORE the key check + # (mirrors the /stream signed-token exemption below). + if request.url.path == "/healthz": + return await call_next(request) if self._expected_key is None: # Keyless local/dev mode — gate open. return await call_next(request) diff --git a/services/earnings/middleware/ratelimit.py b/services/earnings/middleware/ratelimit.py index d6a069c..1127846 100644 --- a/services/earnings/middleware/ratelimit.py +++ b/services/earnings/middleware/ratelimit.py @@ -160,6 +160,10 @@ def _consume(self, key: str) -> bool: async def dispatch( self, request: Request, call_next: Callable[[Request], Awaitable[Response]] ) -> Response: + # /healthz is the Cloud Run liveness probe — never throttle it: a probe + # answered with 429 would make Cloud Run kill a healthy instance. + if request.url.path == "/healthz": + return await call_next(request) if not self._consume(self._client_key(request)): return JSONResponse( status_code=429, diff --git a/services/earnings/routes/health.py b/services/earnings/routes/health.py new file mode 100644 index 0000000..54e515d --- /dev/null +++ b/services/earnings/routes/health.py @@ -0,0 +1,30 @@ +"""``GET /healthz`` — the unauthenticated container health probe (Phase 28, 28-12). 
+ +Cloud Run (and any HTTP uptime check) needs a cheap, dependency-free endpoint it +can poll to decide whether an instance is live. It MUST NOT be gated by the +API-key auth (a probe cannot present the key) nor consume a rate-limit/ceiling +token (a probe throttled to 429 would make Cloud Run kill a healthy instance). +The path ``/healthz`` is therefore exempted at the TOP of every middleware's +``dispatch`` (auth + ratelimit), mirroring the existing ``/stream`` token +exemption — see ``middleware/auth.py`` and ``middleware/ratelimit.py``. + +The response is a static ``{"status": "ok"}`` — it deliberately touches NO +ledger, R2, or Pub/Sub state (a health probe must not depend on downstream I/O +that could make a serving-capable instance report unhealthy). It carries no +audio surface (D-27.9): the path/schema is a plain status string. +""" + +from __future__ import annotations + +from fastapi import APIRouter + +router = APIRouter() + + +@router.get("/healthz", summary="Liveness probe (unauthenticated, no downstream I/O)") +def healthz() -> dict[str, str]: + """Return a static liveness token — no auth, no ledger/R2/Pub-Sub touch.""" + return {"status": "ok"} + + +__all__ = ["router"] diff --git a/services/earnings/tests/test_healthz_and_lifespan.py b/services/earnings/tests/test_healthz_and_lifespan.py new file mode 100644 index 0000000..6d535b7 --- /dev/null +++ b/services/earnings/tests/test_healthz_and_lifespan.py @@ -0,0 +1,130 @@ +"""Phase 28 (28-12): /healthz probe + the SegmentSubscriber lifespan wiring. + +Covers the deploy-runtime additions to the earnings serving app: + * /healthz is served UNAUTHENTICATED and is exempt from the rate limiter + (the Cloud Run probe idiom) while every other route stays key-gated. + * The streaming lifespan is a NO-OP by default (EARNINGS_STREAMING_SUBSCRIPTION + unset) — behaviour identical to the ledger-only deploy. 
+ * The _RegistryBusAdapter routes a subscriber republish into the per-call + BusRegistry so /stream (which uses BusRegistry.get) can find the bus. + * When the subscription env IS set, the lifespan starts the subscriber thread + and records the serving event loop. +""" + +from __future__ import annotations + +import asyncio +import threading + +from fastapi.testclient import TestClient +from mostlyright.weather.earnings.streaming_transcriber import Segment + +from services.earnings.app import _RegistryBusAdapter, create_app +from services.earnings.deps import BusRegistry + +_KEY = "test-key-abc" + + +# --------------------------------------------------------------------------- +# /healthz — unauthenticated + unthrottled +# --------------------------------------------------------------------------- +def test_healthz_ok_without_auth() -> None: + app = create_app(api_key=_KEY) + client = TestClient(app) + resp = client.get("/healthz") # no key header + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + + +def test_other_routes_still_401_without_key() -> None: + # /healthz being open must not open the rest of the surface. + app = create_app(api_key=_KEY) + client = TestClient(app) + assert client.get("/capabilities").status_code == 401 + + +def test_healthz_not_rate_limited() -> None: + # A tiny per-client budget: /capabilities would 429 on the 2nd call, but + # /healthz bypasses the limiter entirely (a throttled probe would make Cloud + # Run kill a healthy instance). 
+ app = create_app(api_key=_KEY, rate_limit=1, rate_window_seconds=60.0) + client = TestClient(app) + for _ in range(5): + assert client.get("/healthz").status_code == 200 + + +# --------------------------------------------------------------------------- +# Lifespan — no-op by default +# --------------------------------------------------------------------------- +def test_lifespan_noop_without_subscription_env(monkeypatch) -> None: + monkeypatch.delenv("EARNINGS_STREAMING_SUBSCRIPTION", raising=False) + app = create_app(api_key=_KEY) + # Entering the context triggers lifespan startup/shutdown; it must not raise + # and must not start a subscriber thread. + with TestClient(app) as client: + assert client.get("/healthz").status_code == 200 + assert not any(t.name == "earnings-streaming-subscriber" for t in threading.enumerate()) + + +# --------------------------------------------------------------------------- +# _RegistryBusAdapter — republish routes into the registry +# --------------------------------------------------------------------------- +def test_registry_bus_adapter_publish_creates_and_routes() -> None: + registry = BusRegistry() + adapter = _RegistryBusAdapter(registry) + seg = Segment(text="hi", is_final=True, spoken_at=1.0, stream_seq=1, knowledge_time=1.0) + # publish returns a coroutine (SegmentBus.publish is async) — await it. + asyncio.run(adapter.publish("call-1", seg)) + # /stream uses BusRegistry.get — the adapter must have registered the bus. 
+ assert registry.get("call-1") is not None + + +def test_registry_bus_adapter_close_routes() -> None: + registry = BusRegistry() + adapter = _RegistryBusAdapter(registry) + asyncio.run(adapter.close("call-2")) + bus = registry.get("call-2") + assert bus is not None + assert bus.is_closed("call-2") + + +# --------------------------------------------------------------------------- +# Lifespan — starts the subscriber when the subscription env is set +# --------------------------------------------------------------------------- +def test_lifespan_starts_subscriber_when_env_set(monkeypatch) -> None: + monkeypatch.setenv("EARNINGS_STREAMING_SUBSCRIPTION", "earnings-streaming-serving") + monkeypatch.setenv("EARNINGS_INGEST_PROJECT", "mr-earnings-ingest") + + consumed = threading.Event() + + def _fake_streaming_pull(project: str, subscription: str): + assert project == "mr-earnings-ingest" + assert subscription == "earnings-streaming-serving" + + def _pull(callback) -> None: + # The subscriber thread reaches here — signal, then return (the fake + # feed is exhausted; the real client would block on future.result()). + consumed.set() + + return _pull + + def _fake_run_on_loop(loop): + def _run(coro): + coro.close() # nothing fed, so no coroutine actually runs + + return _run + + # The lifespan lazy-imports these from pubsub_bridge; patch there. + monkeypatch.setattr( + "services.earnings.pubsub_bridge.build_streaming_pull", _fake_streaming_pull + ) + monkeypatch.setattr( + "services.earnings.pubsub_bridge.make_run_coroutine_threadsafe", _fake_run_on_loop + ) + + app = create_app(api_key=_KEY) + with TestClient(app) as client: + assert consumed.wait(timeout=5.0), "subscriber consume thread did not start" + # The serving loop is recorded so an out-of-loop producer can inject. 
+ assert app.state.serving.buses.serving_loop is not None + assert client.get("/healthz").status_code == 200 diff --git a/services/weather/app.py b/services/weather/app.py index 7450e28..3507cc0 100644 --- a/services/weather/app.py +++ b/services/weather/app.py @@ -34,7 +34,7 @@ from fastapi import FastAPI from starlette.middleware.cors import CORSMiddleware -from . import routes +from . import health, routes from .deps import SatelliteReadSource, ServingState from .middleware.auth import API_KEY_ENV, ApiKeyAuthMiddleware from .middleware.ceiling import ( @@ -187,6 +187,9 @@ def create_app( ) app.state.serving = ServingState.build(source=source) + # /healthz is unauthenticated (exempted in every middleware) — the Cloud Run + # HTTP probe idiom. Registered first so it is always present. + app.include_router(health.router, tags=["health"]) app.include_router(routes.router, tags=["satellite"]) resolved_key = _resolve_env_key() if api_key is _UNSET else api_key diff --git a/services/weather/health.py b/services/weather/health.py new file mode 100644 index 0000000..7b11414 --- /dev/null +++ b/services/weather/health.py @@ -0,0 +1,28 @@ +"""``GET /healthz`` — the unauthenticated container health probe (Phase 28, 28-30). + +Cloud Run (and any HTTP uptime check) polls this to decide whether the +weather-serving instance is live. It MUST NOT be gated by the API-key auth (a +probe cannot present the key) nor consume a rate-limit / global-ceiling token (a +probe throttled to 429 would make Cloud Run kill a healthy instance). The path +``/healthz`` is therefore exempted at the TOP of every middleware's ``dispatch`` +(auth + ratelimit + ceiling) — see ``middleware/``. + +The response is a static ``{"status": "ok"}`` — it touches NO R2 state (a health +probe must not depend on the read token / a bucket round-trip that could make a +serving-capable instance report unhealthy). 
+""" + +from __future__ import annotations + +from fastapi import APIRouter + +router = APIRouter() + + +@router.get("/healthz", summary="Liveness probe (unauthenticated, no R2 touch)") +def healthz() -> dict[str, str]: + """Return a static liveness token — no auth, no R2 round-trip.""" + return {"status": "ok"} + + +__all__ = ["router"] diff --git a/services/weather/middleware/auth.py b/services/weather/middleware/auth.py index 6a89976..bb43489 100644 --- a/services/weather/middleware/auth.py +++ b/services/weather/middleware/auth.py @@ -65,6 +65,10 @@ def __init__(self, app: object, *, expected_key: str | None) -> None: async def dispatch( self, request: Request, call_next: Callable[[Request], Awaitable[Response]] ) -> Response: + # /healthz is the unauthenticated Cloud Run liveness probe — it cannot + # present the API key, so it bypasses the gate before the key check. + if request.url.path == "/healthz": + return await call_next(request) if self._expected_key is None: # Keyless local/dev mode — gate open. return await call_next(request) diff --git a/services/weather/middleware/ceiling.py b/services/weather/middleware/ceiling.py index e6f2e9f..0e4b99b 100644 --- a/services/weather/middleware/ceiling.py +++ b/services/weather/middleware/ceiling.py @@ -87,6 +87,11 @@ def _consume(self) -> bool: async def dispatch( self, request: Request, call_next: Callable[[Request], Awaitable[Response]] ) -> Response: + # /healthz is the Cloud Run liveness probe — it must not consume a global + # ceiling token, or a probe answered 429 would make Cloud Run kill a + # healthy instance. 
+ if request.url.path == "/healthz": + return await call_next(request) if not self._consume(): return JSONResponse( status_code=429, diff --git a/services/weather/middleware/ratelimit.py b/services/weather/middleware/ratelimit.py index 0258af6..05a1fe6 100644 --- a/services/weather/middleware/ratelimit.py +++ b/services/weather/middleware/ratelimit.py @@ -149,6 +149,9 @@ def _consume(self, key: str) -> bool: async def dispatch( self, request: Request, call_next: Callable[[Request], Awaitable[Response]] ) -> Response: + # /healthz is the Cloud Run liveness probe — never throttle it. + if request.url.path == "/healthz": + return await call_next(request) if not self._consume(self._client_key(request)): return JSONResponse( status_code=429, diff --git a/services/weather/tests/test_healthz.py b/services/weather/tests/test_healthz.py new file mode 100644 index 0000000..fc4e7cb --- /dev/null +++ b/services/weather/tests/test_healthz.py @@ -0,0 +1,45 @@ +"""Phase 28 (28-30): /healthz probe on the weather serving app. + +/healthz is served UNAUTHENTICATED and is exempt from BOTH the per-key rate +limiter and the H4 global request ceiling (the Cloud Run probe idiom — a probe +answered 401/429 would make Cloud Run kill a healthy instance), while every +other route stays key-gated + ceiling-bounded. It touches no R2 state. 
+""" + +from __future__ import annotations + +from fastapi.testclient import TestClient + +from services.weather.app import create_app + +_KEY = "test-key-weather" + + +def test_healthz_ok_without_auth() -> None: + app = create_app(api_key=_KEY) + client = TestClient(app) + resp = client.get("/healthz") # no key header + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + + +def test_other_routes_still_401_without_key() -> None: + app = create_app(api_key=_KEY) + client = TestClient(app) + assert client.get("/capabilities").status_code == 401 + + +def test_healthz_exempt_from_per_key_rate_limit() -> None: + app = create_app(api_key=_KEY, rate_limit=1, rate_window_seconds=60.0) + client = TestClient(app) + for _ in range(5): + assert client.get("/healthz").status_code == 200 + + +def test_healthz_exempt_from_global_ceiling() -> None: + # A global ceiling of 1 request per 60 s would 429 the 2nd request; /healthz + # must bypass it (H4 ceiling is for the public data surface, not the probe). + app = create_app(api_key=_KEY, global_limit=1, global_window_seconds=60.0) + client = TestClient(app) + for _ in range(5): + assert client.get("/healthz").status_code == 200 From dc6070ea825a7e22431a824377f2fc16e843628e Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 19:15:18 +0200 Subject: [PATCH 02/17] feat(28): 6 service Dockerfiles + earnings ingest entrypoints Audio-free earnings-serving image (parquet extra only, no whisper/av/ffmpeg, firewall a); shared weather ingest image (satellite CLI); capture/stt/rolefact CUDA/CPU images + thin python -m services.earnings.jobs.* entrypoints that drive the shipped engine libraries (lazy audio imports; audio dies on ephemeral disk). 
--- deploy/earnings/capture.Dockerfile | 54 +++ deploy/earnings/rolefact.Dockerfile | 46 ++ deploy/earnings/serving.Dockerfile | 63 +++ deploy/earnings/stt.Dockerfile | 60 +++ deploy/weather/ingest.Dockerfile | 41 ++ services/earnings/jobs/__init__.py | 29 ++ services/earnings/jobs/_env.py | 39 ++ services/earnings/jobs/capture.py | 121 ++++++ services/earnings/jobs/rolefact.py | 221 ++++++++++ services/earnings/jobs/stt.py | 213 ++++++++++ .../earnings/tests/test_jobs_entrypoints.py | 401 ++++++++++++++++++ 11 files changed, 1288 insertions(+) create mode 100644 deploy/earnings/capture.Dockerfile create mode 100644 deploy/earnings/rolefact.Dockerfile create mode 100644 deploy/earnings/serving.Dockerfile create mode 100644 deploy/earnings/stt.Dockerfile create mode 100644 deploy/weather/ingest.Dockerfile create mode 100644 services/earnings/jobs/__init__.py create mode 100644 services/earnings/jobs/_env.py create mode 100644 services/earnings/jobs/capture.py create mode 100644 services/earnings/jobs/rolefact.py create mode 100644 services/earnings/jobs/stt.py create mode 100644 services/earnings/tests/test_jobs_entrypoints.py diff --git a/deploy/earnings/capture.Dockerfile b/deploy/earnings/capture.Dockerfile new file mode 100644 index 0000000..93f7a1f --- /dev/null +++ b/deploy/earnings/capture.Dockerfile @@ -0,0 +1,54 @@ +# Earnings CAPTURE Cloud Run Job — the AUDIO side of the firewall (28-13). +# +# The VOD/replay cold-fetch stage (mr-earnings-ingest). It packages +# `services/earnings/` + the two SDK packages with the `[earnings]` extra +# (faster-whisper + `av`/PyAV) and runs `python -m services.earnings.jobs.capture`. +# +# On the AUDIO side by design (unlike the SLIM weather serving image, which OMITS +# any audio toolchain). The shipped Q4 capture surface extracts the transient +# audio track via PyAV (`av`), which ships FFmpeg's libraries as binary wheels — +# so a system `ffmpeg` is NOT strictly required. 
We still `apt-get install ffmpeg` +# as belt-and-suspenders for any codec PyAV's bundled libs defer to. +# +# NO chromium: the 27-03 Q4 static-MP4 capture is a cold ranged-GET over httpx +# (the guest form gates the PAGE, not the asset — RESEARCH §2). There is NO +# headless-browser navigation on this path, so no Chromium is installed. +# +# The captured audio is a TRANSIENT artifact on the task's ephemeral disk — it is +# NEVER uploaded, NEVER served, NEVER a ledger column (D-27.9). NON-published: +# this image COPYs `services/` (a monorepo service), never a PyPI wheel. + +FROM python:3.12-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /app + +# ffmpeg for any codec PyAV's bundled FFmpeg libs defer to (audio side — expected +# here, unlike serving). No chromium: the Q4 static-MP4 path is a cold httpx GET. +RUN apt-get update \ + && apt-get install -y --no-install-recommends ffmpeg \ + && rm -rf /var/lib/apt/lists/* + +# --- Dependency layer -------------------------------------------------------- +# Copy the package sources first so the dep layer caches across app-code edits. +COPY packages/core/ packages/core/ +COPY packages/weather/ packages/weather/ + +# Install core + weather[earnings] (the [earnings] extra pulls faster-whisper + +# av/PyAV — the audio extract + STT engine deps, lazy-imported at runtime). +RUN pip install \ + ./packages/core \ + "./packages/weather[earnings]" + +# --- App layer --------------------------------------------------------------- +# The non-published service is imported as `services.earnings.*` (matching the +# repo-root conftest sys.path convention), so it is copied under /app/services. +COPY services/earnings/ services/earnings/ + +# Cloud Run Jobs pass the capture spec via env (CAPTURE_TICKER / CAPTURE_CALL_ID / +# CAPTURE_WEBCAST_URL / CAPTURE_OUT_DIR). The entrypoint fails loud on a missing var. 
+ENTRYPOINT ["python", "-m", "services.earnings.jobs.capture"] diff --git a/deploy/earnings/rolefact.Dockerfile b/deploy/earnings/rolefact.Dockerfile new file mode 100644 index 0000000..94e4e9d --- /dev/null +++ b/deploy/earnings/rolefact.Dockerfile @@ -0,0 +1,46 @@ +# Earnings ROLEFACT Cloud Run Job — role-attribution + fact-building (28-13). +# +# The POST-audio, CPU-only stage. It reads the persisted transcript TEXT from the +# transcript ledger, role-attributes turns, counts mentions, builds +# `schema.earnings_fact.v1` rows (fail-closed Kalshi filter), writes the fact +# ledger, and OPTIONALLY uploads the derived fact parquet to R2 via the shipped +# write sink. It packages `services/earnings/` + the two SDK packages with the +# `[earnings]` extra + boto3 (R2 write) and runs +# `python -m services.earnings.jobs.rolefact`. +# +# NO ffmpeg / NO faster-whisper GPU / NO chromium: this stage is entirely +# post-audio (it never touches audio bytes — D-27.9). SLIM CPython base. The +# `[earnings]` extra still pulls faster-whisper/av transitively, but they are +# lazy-imported and never exercised on this CPU path. boto3 is added explicitly +# for the R2 write sink (`mostlyright.weather.satellite._r2_sink`, which imports +# boto3 lazily and reads the write-token creds from the env by NAME). +# +# NON-published: COPYs `services/`, never a PyPI wheel. + +FROM python:3.12-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /app + +# --- Dependency layer -------------------------------------------------------- +COPY packages/core/ packages/core/ +COPY packages/weather/ packages/weather/ + +# core + weather[earnings] (fact_builder / ledger / role_parser / classify_mentions +# live in the weather earnings module) + boto3 for the R2 write sink. 
+RUN pip install \ + ./packages/core \ + "./packages/weather[earnings]" \ + "boto3>=1.34,<2.0" + +# --- App layer --------------------------------------------------------------- +COPY services/earnings/ services/earnings/ + +# Cloud Run Jobs pass the rolefact spec via env (ROLEFACT_TICKER / ROLEFACT_CALL_ID +# / ROLEFACT_TERMS / ROLEFACT_ROSTER; ROLEFACT_R2_BUCKET + R2_* write creds opt in +# to the R2 upload). The entrypoint fails loud on a missing var. +ENTRYPOINT ["python", "-m", "services.earnings.jobs.rolefact"] diff --git a/deploy/earnings/serving.Dockerfile b/deploy/earnings/serving.Dockerfile new file mode 100644 index 0000000..23a4aaa --- /dev/null +++ b/deploy/earnings/serving.Dockerfile @@ -0,0 +1,63 @@ +# Earnings serving image — the hosted /transcripts + /facts + /capabilities + +# /stream (SSE) REST app (28-12) in mr-serving/eu-west3. +# +# AUDIO FIREWALL (D-27.9, legal — Swatch v. Bloomberg). This SERVING image +# PHYSICALLY OMITS the audio toolchain: it installs mostlyrightmd-weather with +# the [parquet] extra ONLY (pandas for the ledger DataFrame path) — NOT the +# [earnings] extra, so faster-whisper (CTranslate2) and av (PyAV/FFmpeg) are +# ABSENT. There is no ffmpeg, no Chromium, no whisper in this image. Audio never +# reaches serving; only text/fact parquet does (via the ledger, and the +# earnings-streaming Pub/Sub bridge which carries a closed text/facts-only +# envelope). The earnings engine's heavy deps are lazy-imported, so importing +# `mostlyright.weather.earnings.{ledger,segment_bus,streaming_transcriber}` here +# needs none of them. +# +# NON-PUBLISHED: this COPYs the `services/earnings/` monorepo service; it is NOT +# a PyPI wheel and MUST NOT enter any published dist (the wheel grep-gate stays +# clean). uvicorn/fastapi/google-cloud-pubsub live in THIS image, never in a dist. 
+# +# Read-closed by construction: the app fails CLOSED at startup if EARNINGS_API_KEY +# is unset (services/earnings/app.py::_resolve_env_key), so a misconfigured deploy +# crashes loud rather than serving unauthenticated. Cloud Run injects PORT. + +FROM python:3.12-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /app + +# --- Dependency layer -------------------------------------------------------- +# Copy just the package sources the serving app imports (core + weather) first so +# the dep layer caches across app-code edits. +COPY packages/core/ packages/core/ +COPY packages/weather/ packages/weather/ + +# Install the two published distributions with the [parquet] extra ONLY (pandas +# for the ledger read path) — deliberately NOT [earnings] (that pulls +# faster-whisper + av, the audio toolchain the firewall forbids on serving) — plus +# the serving runtime (FastAPI + uvicorn) and google-cloud-pubsub (the SSE +# streaming subscriber; only started when EARNINGS_STREAMING_SUBSCRIPTION is set, +# lazy-imported so the ledger-only default deploy needs no Pub/Sub call). +RUN pip install \ + ./packages/core \ + "./packages/weather[parquet]" \ + "fastapi>=0.115,<1" \ + "uvicorn[standard]>=0.30" \ + "google-cloud-pubsub>=2.20,<3" + +# --- App layer --------------------------------------------------------------- +# The non-published serving app is imported as `services.earnings.*` (matching the +# repo-root conftest sys.path convention), so it is copied under /app/services. +COPY services/earnings/ services/earnings/ + +ENV PORT=8080 +EXPOSE 8080 + +# One uvicorn worker: the in-process per-key rate limiter + the single-instance +# SSE bus fan-out (H2) are per-process. Cloud Run pins this service at +# min=max=1 (infra/cloud_run.tf) so the shared earnings-streaming subscription is +# consumed by exactly one instance. /healthz is unauthenticated for the probe. 
+CMD ["sh", "-c", "uvicorn services.earnings.app:app --host 0.0.0.0 --port ${PORT} --workers 1"] diff --git a/deploy/earnings/stt.Dockerfile b/deploy/earnings/stt.Dockerfile new file mode 100644 index 0000000..fed2a82 --- /dev/null +++ b/deploy/earnings/stt.Dockerfile @@ -0,0 +1,60 @@ +# Earnings STT Cloud Run GPU Job — faster-whisper transcription (28-13). +# +# The GPU transcription stage of the audio-side ingest pipeline. Runs on an L4 GPU +# in us-central1 (there is NO Cloud Run GPU in europe-west3, so this stage is +# region-split from the eu-west3 serving/ingest). It packages `services/earnings/` +# + the two SDK packages with the `[earnings]` extra and runs +# `python -m services.earnings.jobs.stt`. +# +# STT engine is faster-whisper (CTranslate2) — NO torch (D-27.5). CTranslate2 is +# the CUDA runtime; the cudnn-runtime CUDA base supplies the cuDNN/CUDA libs +# CTranslate2's GPU path links against. faster-whisper + av come from the +# `[earnings]` extra and are LAZY-imported at runtime (never at module load). +# +# Audio is a TRANSIENT input — only the transcript TEXT crosses into the ledger, +# never audio (D-27.9). NON-published: COPYs `services/`, never a PyPI wheel. + +FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + DEBIAN_FRONTEND=noninteractive + +WORKDIR /app + +# python3.12 + pip on the CUDA base (Ubuntu 22.04 ships 3.10; add the deadsnakes +# 3.12 the SDK floors target). ffmpeg for any codec PyAV's bundled libs defer to. 
+RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + software-properties-common \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + python3.12 \ + python3.12-venv \ + python3-pip \ + ffmpeg \ + && rm -rf /var/lib/apt/lists/* \ + && ln -sf /usr/bin/python3.12 /usr/local/bin/python \ + && ln -sf /usr/bin/python3.12 /usr/local/bin/python3 + +# --- Dependency layer -------------------------------------------------------- +COPY packages/core/ packages/core/ +COPY packages/weather/ packages/weather/ + +# core + weather[earnings] — the [earnings] extra pins faster-whisper>=1.0,<2.0 +# (CTranslate2 Whisper; NO torch) + av. python3.12's pip resolves the wheels. +RUN python -m pip install --break-system-packages \ + ./packages/core \ + "./packages/weather[earnings]" + +# --- App layer --------------------------------------------------------------- +COPY services/earnings/ services/earnings/ + +# Cloud Run Jobs pass the STT spec via env (STT_AUDIO_PATH / STT_TICKER / +# STT_CALL_ID / STT_TIER / STT_DEVICE / STT_COMPUTE_TYPE / STT_INITIAL_PROMPT; +# EARNINGS_STREAMING_* opt in to the live publish). The entrypoint fails loud on +# a missing var. +ENTRYPOINT ["python", "-m", "services.earnings.jobs.stt"] diff --git a/deploy/weather/ingest.Dockerfile b/deploy/weather/ingest.Dockerfile new file mode 100644 index 0000000..ca20b1d --- /dev/null +++ b/deploy/weather/ingest.Dockerfile @@ -0,0 +1,41 @@ +# Weather ingest image — the satellite backfill fleet (Cloud Batch, 28-21) AND +# the daily incremental job (Cloud Run Job, 28-22), both in mostlyright-satellite +# (H1) / us-central1. ONE image, two invocations (the ENTRYPOINT is the backfill +# subcommand; the batch/job args select roster/shard/incremental). +# +# BIG-BYTES FIREWALL (§4b): the ~28 TB raw imagery NEVER leaves the US. 
This image +# runs the Phase-25/26 satellite CLI near the GCS NODD mirror (--mirror gcp), +# reduces in-region, and uploads ONLY tiny derived per-station×date parquet to R2 +# (--r2-bucket). The R2 WRITE token + EUMETSAT creds are injected from Secret +# Manager as env by the deploy layer (infra/batch.tf) — never baked in. +# +# NON-PUBLISHED deploy image: it installs the PUBLISHED mostlyrightmd-weather +# [satellite] extra (boto3/s3fs/gcsfs/h5netcdf/xarray/numpy/eumdac), so unlike the +# earnings services it COPYs NO `services/` tree — it is a thin CLI wrapper around +# shipped wheel code. The heavy HDF5/xarray codecs ship as wheels (no apt). + +FROM python:3.12-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /app + +# Copy sources first so the (heavy) dep layer caches across edits. +COPY packages/core/ packages/core/ +COPY packages/weather/ packages/weather/ + +# Install core + weather[satellite] (the native satellite ring: anonymous NODD +# S3/GCS + keyed EUMETSAT Data Store, parquet reduce). No serving/audio deps. +RUN pip install \ + ./packages/core \ + "./packages/weather[satellite]" + +# The container command is the satellite `backfill` subcommand; Cloud Batch / +# Cloud Run Job append the flags (--mirror gcp --roster kalshi,polymarket +# [--incremental yesterday] --progress-bucket --r2-bucket ). Cloud Batch +# sets BATCH_TASK_INDEX/BATCH_TASK_COUNT so the CLI selects this shard's station +# from the roster (one array-task shard per station, D-28.8). +ENTRYPOINT ["python", "-m", "mostlyright.weather.satellite", "backfill"] diff --git a/services/earnings/jobs/__init__.py b/services/earnings/jobs/__init__.py new file mode 100644 index 0000000..5d36a43 --- /dev/null +++ b/services/earnings/jobs/__init__.py @@ -0,0 +1,29 @@ +"""Cloud Run Jobs entrypoints for the earnings INGEST pipeline (Phase 28, 28-13). 
+ +These three thin ``python -m services.earnings.jobs.<job>`` entrypoints are the +deploy-runtime scaffolding for the audio side of the firewall (the +``mr-earnings-ingest`` project). Each reads its job spec from environment +variables (how Cloud Run Jobs pass per-invocation config), invokes the SHIPPED +engine libraries under ``mostlyright.weather.earnings`` (never re-implementing +capture / STT / fact-building), and exits ``0`` on success / non-zero on failure +(fail loud). + +* :mod:`services.earnings.jobs.capture` — cold-fetch a webcast to TRANSIENT audio. +* :mod:`services.earnings.jobs.stt` — faster-whisper transcribe → transcript ledger. +* :mod:`services.earnings.jobs.rolefact` — role-attribute + build facts → fact ledger. + +**Audio firewall (D-27.9, legal).** Audio is a transient ingest artifact that +dies on ephemeral disk. It is NEVER a ledger column, NEVER uploaded to R2, and +NEVER served — the capture job asserts the audio path stays local and the +ledgers structurally refuse an audio-shaped field. + +**Lazy heavy imports.** ffmpeg/PyAV (``av``), faster-whisper (CTranslate2), and +the httpx-heavy capture bits are imported INSIDE ``main`` (never at module load), +so these modules import cleanly — and ``main()`` runs against injected/fake +inputs — with no audio toolchain, no GPU, and no ffmpeg present (mirrors the +sse.py / pubsub_bridge.py lazy-import discipline). +""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/services/earnings/jobs/_env.py b/services/earnings/jobs/_env.py new file mode 100644 index 0000000..b3721ec --- /dev/null +++ b/services/earnings/jobs/_env.py @@ -0,0 +1,39 @@ +"""Shared env-var helpers for the earnings Cloud Run Jobs entrypoints. + +Cloud Run Jobs inject per-invocation config as environment variables, so every +job resolves its inputs through :func:`require_env` (fail loud, naming the +missing var) or :func:`optional_env` (nullable, with a default).
A missing +REQUIRED var must crash the job LOUD at startup — a silently-defaulted ticker or +audio path would mis-attribute a settlement-adjacent transcript. +""" + +from __future__ import annotations + +import os + +__all__ = ["optional_env", "require_env"] + + +def require_env(name: str) -> str: + """Return ``os.environ[name]`` or raise a loud config error naming the var. + + A Cloud Run Job with a required env var unset is a deploy misconfiguration; + the job must fail loud at startup rather than run against an empty/defaulted + value (which could mis-attribute a settlement-adjacent artifact). + """ + value = os.environ.get(name) + if value is None or value == "": + raise ValueError( + f"required environment variable {name!r} is unset or empty — the " + "Cloud Run Job cannot run without it (fail loud rather than default " + "a settlement-adjacent input)." + ) + return value + + +def optional_env(name: str, default: str | None = None) -> str | None: + """Return ``os.environ[name]`` when set + non-empty, else ``default``.""" + value = os.environ.get(name) + if value is None or value == "": + return default + return value diff --git a/services/earnings/jobs/capture.py b/services/earnings/jobs/capture.py new file mode 100644 index 0000000..307e1f2 --- /dev/null +++ b/services/earnings/jobs/capture.py @@ -0,0 +1,121 @@ +"""Earnings webcast-capture Cloud Run Job (Phase 28, 28-13). + +The AUDIO side of the firewall (``mr-earnings-ingest``). Reads a capture job spec +from the environment (Cloud Run Jobs pass config via env), invokes the SHIPPED +capture surface (:class:`mostlyright.weather.earnings.capture.q4.Q4CaptureAdapter`) +to cold-fetch the webcast media into an EPHEMERAL dir, and hands the resulting +transient :class:`~mostlyright.weather.earnings.capture.base.AudioArtifact` to the +downstream STT job (via the shared ephemeral disk within the same Cloud Run task, +or by leaving the artifact for an operator-gated orchestration seam — see below). 
+ +**Audio firewall (D-27.9, legal).** The captured audio is a TRANSIENT artifact on +ephemeral disk. This job NEVER uploads it, NEVER writes it to a ledger, and NEVER +serves it — it asserts the artifact's ``is_transient`` flag and that its path +stays under the local capture dir. The bytes die with the ephemeral task. + +**Env contract:** + +* ``CAPTURE_TICKER`` (required) — the market ticker (e.g. ``CHWY``). +* ``CAPTURE_CALL_ID`` (required) — the provider event/call id. +* ``CAPTURE_WEBCAST_URL`` (required) — the sniffed static media URL + (``static.events.q4inc.com/.../{uuid}.mp4``). The shipped SSRF guard rejects a + non-Q4/non-https URL BEFORE any fetch. +* ``CAPTURE_OUT_DIR`` (optional) — the ephemeral dir the transient audio is + written under (default: a fresh ``tempfile`` dir, still ephemeral). + +**Lazy imports.** ffmpeg/PyAV (``av``) and httpx are pulled in only by the shipped +capture surface, which lazy-imports them inside its own methods — this module and +its ``main`` import nothing heavy at module load, so the entrypoint imports cleanly +with no audio toolchain (the test stubs the capture surface). + +**Operator-gated live seam.** The live-during-call path +(:meth:`CaptureAdapter.live` → Amazon-IVS HLS) is OPERATOR-GATED (27-09) and is +NOT driven here — this job is the VOD/replay cold-fetch. A live orchestration +would follow the HLS edge into the streaming STT (27-10); that is out of scope +for the deploy-runtime scaffolding and intentionally not wired. +""" + +from __future__ import annotations + +import logging +import os +import tempfile + +from services.earnings.jobs._env import optional_env, require_env + +_LOG = logging.getLogger("services.earnings.jobs.capture") + + +def _assert_audio_local(audio_path: str, out_dir: str) -> None: + """Fail loud if the captured audio is not a local file under ``out_dir`` (D-27.9). 
+ + The captured audio must stay a transient artifact on ephemeral disk — it must + NOT be a remote URL / object-store key and must live under the capture dir. + Anything else means audio has (or could) escape the firewall. + """ + real_out = os.path.realpath(out_dir) + real_audio = os.path.realpath(audio_path) + if not real_audio.startswith(real_out + os.sep): + raise RuntimeError( + f"captured audio {audio_path!r} is not under the local ephemeral " + f"capture dir {out_dir!r} — audio must stay a transient local artifact " + "and NEVER be uploaded or served (D-27.9)." + ) + + +def main(argv: list[str] | None = None) -> int: + """Cold-fetch the webcast media to a transient local :class:`AudioArtifact`. + + Reads the job spec from the environment, invokes the shipped Q4 capture + surface, asserts the audio stays local, logs the transient path for the + downstream STT job, and returns ``0`` on success. Any failure (missing env, + SSRF-rejected URL, no HTTP media, extract failure) propagates as a non-zero + exit (fail loud). + """ + logging.basicConfig(level=logging.INFO) + + ticker = require_env("CAPTURE_TICKER") + call_id = require_env("CAPTURE_CALL_ID") + webcast_url = require_env("CAPTURE_WEBCAST_URL") + out_dir = optional_env("CAPTURE_OUT_DIR") or tempfile.mkdtemp(prefix="earnings-capture-") + os.makedirs(out_dir, exist_ok=True) + + # Lazy import: the shipped capture surface lazy-imports httpx/PyAV inside its + # own methods, so nothing heavy loads at module import (keeps this entrypoint + # importable with no audio toolchain). + from mostlyright.weather.earnings.capture.q4 import Q4CaptureAdapter + + _LOG.info("capture job start: ticker=%s call_id=%s out_dir=%s", ticker, call_id, out_dir) + + adapter = Q4CaptureAdapter() + event = {"ticker": ticker, "call_id": call_id, "media_url": webcast_url} + + # The AudioArtifact is a context manager whose __exit__ cleans up the transient + # audio. 
We do NOT enter it here — the downstream STT job (same ephemeral task / + # operator orchestration) consumes the path, then cleans it up. We DO assert the + # firewall invariants and log the transient path. + artifact = adapter.capture(event, tmp_dir=out_dir) + + if not artifact.is_transient: + raise RuntimeError( + f"capture returned a NON-transient artifact for {ticker}/{call_id} — " + "captured earnings audio must always be transient (D-27.9)." + ) + _assert_audio_local(artifact.audio_path, out_dir) + + _LOG.info( + "capture job done: ticker=%s call_id=%s transient_audio=%s source=%s", + artifact.ticker, + artifact.call_id, + artifact.audio_path, + artifact.source_media_url, + ) + # The transient audio path is emitted on stdout for the downstream STT job / + # operator orchestration to pick up off the same ephemeral disk. It is NEVER + # uploaded or served here. + print(artifact.audio_path) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/services/earnings/jobs/rolefact.py b/services/earnings/jobs/rolefact.py new file mode 100644 index 0000000..aa60977 --- /dev/null +++ b/services/earnings/jobs/rolefact.py @@ -0,0 +1,221 @@ +"""Earnings role-attribution + fact-building Cloud Run Job (Phase 28, 28-13). + +The POST-audio, CPU-only stage on the ingest side of the firewall. Reads the +persisted transcript for a call from the +:class:`~mostlyright.weather.earnings.ledger.TranscriptLedger`, role-attributes +the turns via :class:`~mostlyright.weather.earnings.role_parser.RoleParser`, +counts per-term mentions with +:func:`~mostlyright.weather.earnings.stt.classify_mentions`, assembles +``schema.earnings_fact.v1`` rows via +:func:`~mostlyright.weather.earnings.fact_builder.build_fact_rows` (which applies +the fail-closed Kalshi filter), and writes them to the +:class:`~mostlyright.weather.earnings.ledger.FactLedger`. It then OPTIONALLY +uploads the derived fact parquet to Cloudflare R2 via the shipped write sink. 
+ +**Env contract:** + +* ``ROLEFACT_TICKER`` (required) — the market ticker (fact/transcript partition). +* ``ROLEFACT_CALL_ID`` (required) — the provider call id. +* ``ROLEFACT_TERMS`` (required) — a JSON array of market-term specs, each with + at least ``term_canonical`` (the counted term). Optional per-term keys: + ``term_match_rule``, ``counting_mode``, ``threshold_n``, ``window_scope``, + ``term_accepted_forms``. +* ``ROLEFACT_ROSTER`` (optional) — a JSON array of ``[speaker_name, label]`` + roster pairs anchoring exec identity for the fail-closed Kalshi rule. +* ``MOSTLYRIGHT_CACHE_DIR`` (optional) — the ledger cache root. +* R2 upload (opt-in; ``ROLEFACT_R2_BUCKET`` enables it): + ``ROLEFACT_R2_BUCKET`` + the write-token creds ``R2_ACCOUNT_ID`` / + ``R2_WRITE_ACCESS_KEY_ID`` / ``R2_WRITE_SECRET_ACCESS_KEY`` (read from the env + by NAME by the shipped sink). + +**Audio firewall (D-27.9).** This stage is entirely post-audio — it reads TEXT +from the transcript ledger and writes DERIVED FACTS. There is no audio anywhere in +this job; ffmpeg / whisper / chromium are absent from its image. + +**Lazy imports.** boto3 is lazy-imported inside the shipped R2 sink (only reached +when an upload bucket is configured), so this entrypoint imports cleanly. 
+""" + +from __future__ import annotations + +import json +import logging +from typing import TYPE_CHECKING + +from services.earnings.jobs._env import optional_env, require_env + +if TYPE_CHECKING: + from collections.abc import Mapping, Sequence + +_LOG = logging.getLogger("services.earnings.jobs.rolefact") + + +def _transcript_text(rows: Sequence[Mapping[str, object]]) -> str: + """Join the transcript-ledger rows' ``text`` in segment order into one string.""" + ordered = sorted(rows, key=lambda r: _segment_index(r)) + return "\n".join(str(r.get("text", "")) for r in ordered if r.get("text")) + + +def _segment_index(row: Mapping[str, object]) -> int: + idx = row.get("segment_index") + return idx if isinstance(idx, int) else 0 + + +def _build_stt_counts( + transcript: str, + market_terms: Sequence[Mapping[str, object]], +) -> list[dict[str, object]]: + """Run ``classify_mentions`` per market term → per-occurrence stt_count records. + + ``classify_mentions`` (NOT ``count_mentions``) is the production counter (D-30): + it emits one record per occurrence carrying ``compound_type``, which the fact + builder needs to split a row per ``(term, compound_type)``. Each record is + stamped with the term + a ``mention_count`` of 1 (one record per occurrence). 
+ """ + from mostlyright.weather.earnings.stt import classify_mentions + + counts: list[dict[str, object]] = [] + for spec in market_terms: + term = str(spec.get("term_canonical", "")) + if not term: + continue + match_rule = str(spec.get("term_match_rule", "plural_possessive_ok_no_tense")) + for occ in classify_mentions(transcript, term, match_rule=match_rule): + counts.append( + { + "term": term, + "matched_surface_form": occ.get("surface", term), + "mention_count": 1, + "compound_type": occ.get("compound_type", "standalone"), + # No turn_index linkage in the batch post-call path — the builder + # then treats the occurrence as un-anchorable (diarization_advisory) + # unless a roster-anchored role parser fills turns (below). + } + ) + return counts + + +def main(argv: list[str] | None = None) -> int: + """Read transcript → build + persist fact rows (+ optional R2 upload). + + Returns ``0`` on success. A missing required env var, a missing transcript, a + fact-build/ledger failure, or an R2 upload failure propagates as a non-zero + exit (fail loud). + """ + logging.basicConfig(level=logging.INFO) + + ticker = require_env("ROLEFACT_TICKER") + call_id = require_env("ROLEFACT_CALL_ID") + market_terms = _parse_terms(require_env("ROLEFACT_TERMS")) + roster = _parse_roster(optional_env("ROLEFACT_ROSTER")) + + from mostlyright.weather.earnings.ledger import FactLedger, TranscriptLedger + + _LOG.info( + "rolefact job start: ticker=%s call_id=%s terms=%d", ticker, call_id, len(market_terms) + ) + + transcript_rows = TranscriptLedger().read(ticker, call_id) + if not transcript_rows: + raise RuntimeError( + f"no persisted transcript for {ticker}/{call_id} — the STT job must run " + "before rolefact (fail loud rather than build zero facts)." + ) + transcript = _transcript_text(transcript_rows) + + # Role-attribute the turns (fail-closed Kalshi rule anchors exec identity to the + # roster). 
The batch stt_counts carry no turn_index, so the builder scopes each + # occurrence to diarization_advisory (Kalshi-excluded, Polymarket-retained) — + # the conservative default; a full turn↔occurrence join is the operator-gated + # live role-attribution seam (27-04). + from mostlyright.weather.earnings.role_parser import RoleParser + + turns = RoleParser(roster).attribute_turns(transcript) + _LOG.info("rolefact attributed %d turns for %s/%s", len(turns), ticker, call_id) + + from mostlyright.weather.earnings.fact_builder import build_fact_rows + + stt_counts = _build_stt_counts(transcript, market_terms) + fact_rows = build_fact_rows( + stt_counts, + turns, + market_terms, + ticker=ticker, + call_id=call_id, + ) + _LOG.info("rolefact built %d fact rows for %s/%s", len(fact_rows), ticker, call_id) + + fact_ledger = FactLedger() + total = fact_ledger.append(fact_rows, ticker=ticker, call_id=call_id) + fact_path = fact_ledger.path(ticker, call_id) + _LOG.info( + "rolefact wrote fact ledger: ticker=%s call_id=%s rows_now=%d path=%s", + ticker, + call_id, + total, + fact_path, + ) + + _maybe_upload_r2(str(fact_path), ticker=ticker, call_id=call_id) + + return 0 + + +def _parse_terms(raw: str) -> list[dict[str, object]]: + """Parse ``ROLEFACT_TERMS`` (a JSON array of term specs); fail loud if malformed.""" + try: + parsed = json.loads(raw) + except json.JSONDecodeError as exc: + raise ValueError(f"ROLEFACT_TERMS is not valid JSON: {exc}") from exc + if not isinstance(parsed, list) or not parsed: + raise ValueError( + "ROLEFACT_TERMS must be a non-empty JSON array of term specs (each with " + "at least a 'term_canonical'); an empty term set would build zero facts." 
+ ) + return [dict(t) for t in parsed] + + +def _parse_roster(raw: str | None) -> list[tuple[str, str]]: + """Parse ``ROLEFACT_ROSTER`` (a JSON array of ``[name, label]`` pairs) or ``[]``.""" + if not raw: + return [] + try: + parsed = json.loads(raw) + except json.JSONDecodeError as exc: + raise ValueError(f"ROLEFACT_ROSTER is not valid JSON: {exc}") from exc + roster: list[tuple[str, str]] = [] + for entry in parsed: + if isinstance(entry, (list, tuple)) and len(entry) == 2: + roster.append((str(entry[0]), str(entry[1]))) + else: + raise ValueError(f"ROLEFACT_ROSTER entry {entry!r} is not a [name, label] pair.") + return roster + + +def _maybe_upload_r2(fact_path: str, *, ticker: str, call_id: str) -> None: + """Opt-in upload of the derived fact parquet to R2 via the shipped write sink. + + Enabled only when ``ROLEFACT_R2_BUCKET`` is set. The write-token creds + (``R2_ACCOUNT_ID`` / ``R2_WRITE_ACCESS_KEY_ID`` / ``R2_WRITE_SECRET_ACCESS_KEY``) + are read from the env by NAME by the shipped sink (boto3 lazy-imported there). + ONLY the derived FACT parquet (text/facts, never audio) is uploaded (D-27.9). + """ + bucket = optional_env("ROLEFACT_R2_BUCKET") + if not bucket: + return + key = f"earnings/facts/{ticker}/{call_id}.parquet" + + from mostlyright.weather.satellite._r2_sink import upload + + returned = upload(fact_path, bucket, key, r2_target=bucket) + _LOG.info( + "rolefact uploaded derived facts to R2: bucket=%s key=%s (%s/%s)", + bucket, + returned, + ticker, + call_id, + ) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/services/earnings/jobs/stt.py b/services/earnings/jobs/stt.py new file mode 100644 index 0000000..31be6ee --- /dev/null +++ b/services/earnings/jobs/stt.py @@ -0,0 +1,213 @@ +"""Earnings STT Cloud Run GPU Job (Phase 28, 28-13). + +The faster-whisper transcription stage of the audio-side ingest pipeline. 
Reads a +transient audio path + call identity from the environment, transcribes it via the +SHIPPED :class:`mostlyright.weather.earnings.stt.SttTranscriber` (CTranslate2 / +faster-whisper, lazy-imported), writes the AUDIO-FREE transcript segments to the +:class:`~mostlyright.weather.earnings.ledger.TranscriptLedger`, and — in the +opt-in live mode — publishes segments to the ``earnings-streaming`` Pub/Sub topic +via :class:`~services.earnings.pubsub_bridge.SegmentPublisher`. + +**Env contract:** + +* ``STT_AUDIO_PATH`` (required) — the transient audio file from the capture job. +* ``STT_TICKER`` (required) — the market ticker. +* ``STT_CALL_ID`` (required) — the provider call id (ledger partition key). +* ``STT_TIER`` (optional) — model size (default ``large-v3``; on-device + floor is ``small``). +* ``STT_DEVICE`` (optional) — faster-whisper device (default ``cuda`` on the + L4 GPU image; ``cpu`` for a CPU fallback). +* ``STT_COMPUTE_TYPE`` (optional) — CTranslate2 compute type (default ``float16`` + on GPU). +* ``STT_INITIAL_PROMPT`` (optional) — a per-call vocabulary-biasing prompt (the + market strike terms). Threaded straight to ``WhisperModel.transcribe``. +* ``MOSTLYRIGHT_CACHE_DIR`` (optional) — the ledger cache root (else the default + ``$HOME/.mostlyright/cache``). +* Live-publish (opt-in; the first two are required to enable): + ``EARNINGS_STREAMING_ENABLED``, ``EARNINGS_STREAMING_PROJECT``, + and optionally ``EARNINGS_STREAMING_TOPIC`` (default ``earnings-streaming``). + +**Region.** This image runs on an L4 GPU in us-central1 (no Cloud Run GPU in +eu-west3) — see the Dockerfile. + +**Audio firewall (D-27.9).** ONLY the transcript TEXT + derived segments cross +into the ledger; the audio file is a transient input that is never persisted as a +ledger column (the ledger structurally refuses an audio-shaped key). The published +envelopes are text/facts-only (the bridge fails closed on any audio field).
+ +**Lazy imports.** faster-whisper / CTranslate2 are lazy-imported inside the shipped +:class:`SttTranscriber` (never at module load), and ``google-cloud-pubsub`` is +lazy-constructed only inside :func:`~services.earnings.pubsub_bridge.build_publisher_client` +— so this entrypoint imports cleanly with no GPU / no whisper / no GCP SDK. +""" + +from __future__ import annotations + +import logging +import os + +from services.earnings.jobs._env import optional_env, require_env + +_LOG = logging.getLogger("services.earnings.jobs.stt") + +#: Default STT tier — the hosted / our-infra source-of-truth model (D-27.5). +_DEFAULT_TIER = "large-v3" +#: Default device/compute for the L4 GPU image. +_DEFAULT_DEVICE = "cuda" +_DEFAULT_COMPUTE_TYPE = "float16" + + +def _segment_rows( + result_segments: list[dict[str, object]], + *, + ticker: str, + call_id: str, +) -> list[dict[str, object]]: + """Project the transcriber's per-segment records onto transcript-ledger rows. + + The ledger projects each row onto its canonical ``COLUMNS`` and DROPS any + non-schema key, so only text + the temporal markers survive. The STT segment + ``start`` (seconds into the call) maps to the ``offset_seconds`` engine-relative + marker (NOT ``spoken_at``, which is a tz-aware wallclock — a float there would + silently persist as 1970-01-01). Audio is never a field here (D-27.9). + """ + rows: list[dict[str, object]] = [] + for idx, seg in enumerate(result_segments): + start = seg.get("start") + rows.append( + { + "ticker": ticker, + "call_id": call_id, + "segment_index": idx, + "segment": "batch", + "text": seg.get("text", ""), + "offset_seconds": float(start) if isinstance(start, (int, float)) else None, + "is_final": True, + "source": "earnings_call", + "delivery": "hosted", + } + ) + return rows + + +def main(argv: list[str] | None = None) -> int: + """Transcribe the transient audio → transcript ledger (+ optional live publish). + + Returns ``0`` on success. 
A missing required env var, a transcription failure, + or a ledger write failure propagates as a non-zero exit (fail loud). No audio + ever reaches the ledger or the wire. + """ + logging.basicConfig(level=logging.INFO) + + audio_path = require_env("STT_AUDIO_PATH") + ticker = require_env("STT_TICKER") + call_id = require_env("STT_CALL_ID") + tier = optional_env("STT_TIER", _DEFAULT_TIER) or _DEFAULT_TIER + device = optional_env("STT_DEVICE", _DEFAULT_DEVICE) or _DEFAULT_DEVICE + compute_type = optional_env("STT_COMPUTE_TYPE", _DEFAULT_COMPUTE_TYPE) or _DEFAULT_COMPUTE_TYPE + initial_prompt = optional_env("STT_INITIAL_PROMPT") + + if not os.path.exists(audio_path): + raise FileNotFoundError( + f"STT_AUDIO_PATH {audio_path!r} does not exist — the capture job's " + "transient audio must be present on the shared ephemeral disk." + ) + + # Lazy import: SttTranscriber lazy-imports faster-whisper inside transcribe, + # so nothing heavy loads at module import. + from mostlyright.weather.earnings.stt import SttTranscriber + + _LOG.info( + "stt job start: ticker=%s call_id=%s tier=%s device=%s", ticker, call_id, tier, device + ) + + transcriber = SttTranscriber(tier, device=device, compute_type=compute_type) + result = transcriber.transcribe(audio_path, initial_prompt=initial_prompt) + _LOG.info( + "stt job transcribed: ticker=%s call_id=%s segments=%d language=%s duration=%s", + ticker, + call_id, + len(result.segments), + result.language, + result.duration, + ) + + rows = _segment_rows(result.segments, ticker=ticker, call_id=call_id) + + # Lazy import: the ledger pulls pyarrow/filelock; kept out of module load so a + # fake-ledger test can import this module without them. (These are base weather + # deps, but keeping the import inside main mirrors the audio-toolchain seam.) 
+ from mostlyright.weather.earnings.ledger import TranscriptLedger + + ledger = TranscriptLedger() + total = ledger.append(rows, ticker=ticker, call_id=call_id) + _LOG.info( + "stt job wrote transcript ledger: ticker=%s call_id=%s rows_now=%d", ticker, call_id, total + ) + + _maybe_publish_live(result.segments, ticker=ticker, call_id=call_id) + + return 0 + + +def _maybe_publish_live( + segments: list[dict[str, object]], + *, + ticker: str, + call_id: str, +) -> None: + """Opt-in live publish of segments to the ``earnings-streaming`` topic. + + Enabled ONLY when ``EARNINGS_STREAMING_ENABLED`` is truthy AND a project is + configured. The real ``google.cloud.pubsub_v1`` client is lazy-constructed by + :func:`~services.earnings.pubsub_bridge.build_publisher_client` (never at + module load). The batch STT segments are published as final transcript + segments; the true partial→final streaming path is the operator-gated 27-10 + live engine (not driven from this batch job). + """ + enabled = optional_env("EARNINGS_STREAMING_ENABLED") + if not enabled or enabled.lower() in ("0", "false", "no"): + return + project = optional_env("EARNINGS_STREAMING_PROJECT") + if not project: + _LOG.warning( + "EARNINGS_STREAMING_ENABLED set but EARNINGS_STREAMING_PROJECT unset — " + "skipping live publish (fail soft: the batch ledger is authoritative)." 
+ ) + return + topic = optional_env("EARNINGS_STREAMING_TOPIC", "earnings-streaming") or "earnings-streaming" + + from mostlyright.weather.earnings.streaming_transcriber import Segment + + from services.earnings.pubsub_bridge import ( + SegmentPublisher, + build_publisher_client, + ) + + publisher = SegmentPublisher(build_publisher_client(project, topic)) + published = 0 + for seg in segments: + start = seg.get("start") + publisher.publish( + call_id, + Segment( + text=str(seg.get("text", "")), + is_final=True, + spoken_at=float(start) if isinstance(start, (int, float)) else 0.0, + stream_seq=published, + knowledge_time=float(start) if isinstance(start, (int, float)) else 0.0, + ), + ) + published += 1 + publisher.publish_end_of_call(call_id) + _LOG.info( + "stt job published %d live segments to topic=%s for %s/%s", + published, + topic, + ticker, + call_id, + ) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/services/earnings/tests/test_jobs_entrypoints.py b/services/earnings/tests/test_jobs_entrypoints.py new file mode 100644 index 0000000..0ac5c06 --- /dev/null +++ b/services/earnings/tests/test_jobs_entrypoints.py @@ -0,0 +1,401 @@ +"""Cloud Run Jobs entrypoint tests (Phase 28, 28-13). + +Proves the three thin ``python -m services.earnings.jobs.`` entrypoints: + +* import CLEANLY with NO audio toolchain — the module top-level must not import + ``av`` / ``faster_whisper`` / ``ctranslate2`` / a headless browser (the heavy + deps are lazy-imported inside ``main`` / the shipped engine surfaces). Proven by + poisoning ``sys.modules`` with a sentinel that raises if imported. +* ``main()`` runs against INJECTED / FAKE engine surfaces (fake capture / + transcribe / fact-build + a tmp ledger root) — no network, no GPU, no ffmpeg — + and returns ``0``, writing the expected artifact. +* the audio firewall holds — no audio path is uploaded / served / persisted as a + ledger column, and a non-local / non-transient capture fails loud. 
+* each ``main()`` fails LOUD (clear error naming the var) when a required env var + is missing. + +Pure-Python fakes — NO ``faster-whisper``, NO ``av``, NO ``boto3``, NO GCP. +""" + +from __future__ import annotations + +import builtins +import importlib +import sys + +import pytest + +from services.earnings.jobs import capture as capture_job +from services.earnings.jobs import rolefact as rolefact_job +from services.earnings.jobs import stt as stt_job +from services.earnings.jobs._env import optional_env, require_env + +# --------------------------------------------------------------------------- +# Lazy-import discipline: the audio toolchain must NOT load at module import. +# --------------------------------------------------------------------------- +_AUDIO_MODULES = ("av", "faster_whisper", "ctranslate2") + + +def test_modules_import_without_audio_toolchain(monkeypatch: pytest.MonkeyPatch) -> None: + """Re-importing the job modules must not pull in any audio/GPU dep.""" + + class _Poison: + def __getattr__(self, _name: str) -> object: # pragma: no cover - only if imported + raise AssertionError("audio toolchain imported at module load — must be lazy") + + real_import = builtins.__import__ + + def _guarded_import(name: str, *args: object, **kwargs: object) -> object: + root = name.split(".")[0] + if root in _AUDIO_MODULES: + raise AssertionError( + f"{name!r} imported at module load — heavy deps must be lazy (D-27.9 discipline)" + ) + return real_import(name, *args, **kwargs) # type: ignore[arg-type] + + for mod in _AUDIO_MODULES: + monkeypatch.setitem(sys.modules, mod, _Poison()) + monkeypatch.setattr(builtins, "__import__", _guarded_import) + + for name in ( + "services.earnings.jobs.capture", + "services.earnings.jobs.stt", + "services.earnings.jobs.rolefact", + ): + sys.modules.pop(name, None) + importlib.import_module(name) + + +# --------------------------------------------------------------------------- +# _env helper +# 
--------------------------------------------------------------------------- +def test_require_env_fails_loud_when_missing(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("SOME_MISSING_VAR", raising=False) + with pytest.raises(ValueError, match="SOME_MISSING_VAR"): + require_env("SOME_MISSING_VAR") + + +def test_optional_env_defaults(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("SOME_OPT_VAR", raising=False) + assert optional_env("SOME_OPT_VAR", "fallback") == "fallback" + monkeypatch.setenv("SOME_OPT_VAR", "") + assert optional_env("SOME_OPT_VAR", "fallback") == "fallback" + monkeypatch.setenv("SOME_OPT_VAR", "set") + assert optional_env("SOME_OPT_VAR", "fallback") == "set" + + +# --------------------------------------------------------------------------- +# capture.main +# --------------------------------------------------------------------------- +class _FakeArtifact: + def __init__(self, audio_path: str, *, is_transient: bool = True) -> None: + self.audio_path = audio_path + self.ticker = "CHWY" + self.call_id = "evt-1" + self.source_media_url = "https://static.events.q4inc.com/x/y.mp4" + self.is_transient = is_transient + + +def test_capture_main_runs_and_keeps_audio_local(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + out_dir = tmp_path / "cap" + out_dir.mkdir() + audio_path = str(out_dir / "audio.wav") + + captured: dict[str, object] = {} + + class _FakeAdapter: + def capture(self, event, *, tmp_dir=None, **_): + captured["event"] = event + captured["tmp_dir"] = tmp_dir + return _FakeArtifact(audio_path) + + import mostlyright.weather.earnings.capture.q4 as q4mod + + monkeypatch.setattr(q4mod, "Q4CaptureAdapter", _FakeAdapter) + monkeypatch.setenv("CAPTURE_TICKER", "CHWY") + monkeypatch.setenv("CAPTURE_CALL_ID", "evt-1") + monkeypatch.setenv("CAPTURE_WEBCAST_URL", "https://static.events.q4inc.com/x/y.mp4") + monkeypatch.setenv("CAPTURE_OUT_DIR", str(out_dir)) + + assert capture_job.main() == 0 + # The webcast URL rode 
through to the shipped capture surface as media_url. + assert captured["event"]["media_url"] == "https://static.events.q4inc.com/x/y.mp4" + assert captured["tmp_dir"] == str(out_dir) + + +def test_capture_main_fails_loud_on_missing_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("CAPTURE_TICKER", raising=False) + with pytest.raises(ValueError, match="CAPTURE_TICKER"): + capture_job.main() + + +def test_capture_main_rejects_non_local_audio(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + out_dir = tmp_path / "cap" + out_dir.mkdir() + + class _FakeAdapter: + def capture(self, event, *, tmp_dir=None, **_): + # An escaped audio path OUTSIDE the ephemeral dir must fail loud. + return _FakeArtifact("/etc/passwd") + + import mostlyright.weather.earnings.capture.q4 as q4mod + + monkeypatch.setattr(q4mod, "Q4CaptureAdapter", _FakeAdapter) + monkeypatch.setenv("CAPTURE_TICKER", "CHWY") + monkeypatch.setenv("CAPTURE_CALL_ID", "evt-1") + monkeypatch.setenv("CAPTURE_WEBCAST_URL", "https://static.events.q4inc.com/x/y.mp4") + monkeypatch.setenv("CAPTURE_OUT_DIR", str(out_dir)) + + with pytest.raises(RuntimeError, match="must stay a transient local artifact"): + capture_job.main() + + +def test_capture_main_rejects_non_transient_audio( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + out_dir = tmp_path / "cap" + out_dir.mkdir() + audio_path = str(out_dir / "audio.wav") + + class _FakeAdapter: + def capture(self, event, *, tmp_dir=None, **_): + return _FakeArtifact(audio_path, is_transient=False) + + import mostlyright.weather.earnings.capture.q4 as q4mod + + monkeypatch.setattr(q4mod, "Q4CaptureAdapter", _FakeAdapter) + monkeypatch.setenv("CAPTURE_TICKER", "CHWY") + monkeypatch.setenv("CAPTURE_CALL_ID", "evt-1") + monkeypatch.setenv("CAPTURE_WEBCAST_URL", "https://static.events.q4inc.com/x/y.mp4") + monkeypatch.setenv("CAPTURE_OUT_DIR", str(out_dir)) + + with pytest.raises(RuntimeError, match="must always be transient"): + capture_job.main() + + +# 
--------------------------------------------------------------------------- +# stt.main +# --------------------------------------------------------------------------- +class _FakeTranscriptResult: + def __init__(self) -> None: + self.text = "we grew RPO and tariffs this quarter" + self.segments = [ + {"text": "we grew RPO", "start": 0.0, "end": 2.0}, + {"text": "and tariffs this quarter", "start": 2.0, "end": 4.0}, + ] + self.language = "en" + self.duration = 4.0 + + +def test_stt_main_writes_transcript_ledger_no_audio( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + cache = tmp_path / "cache" + audio = tmp_path / "audio.wav" + audio.write_bytes(b"RIFF-fake-not-real-audio") + + seen: dict[str, object] = {} + + class _FakeTranscriber: + def __init__(self, model_size, *, device, compute_type): + seen["model_size"] = model_size + seen["device"] = device + + def transcribe(self, audio_path, *, initial_prompt=None): + seen["audio_path"] = audio_path + seen["initial_prompt"] = initial_prompt + return _FakeTranscriptResult() + + import mostlyright.weather.earnings.stt as sttmod + + monkeypatch.setattr(sttmod, "SttTranscriber", _FakeTranscriber) + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + monkeypatch.setenv("STT_AUDIO_PATH", str(audio)) + monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + monkeypatch.setenv("STT_TIER", "small") + monkeypatch.setenv("STT_DEVICE", "cpu") + monkeypatch.setenv("STT_INITIAL_PROMPT", "RPO tariffs") + monkeypatch.delenv("EARNINGS_STREAMING_ENABLED", raising=False) + + assert stt_job.main() == 0 + assert seen["model_size"] == "small" + assert seen["initial_prompt"] == "RPO tariffs" + + # The transcript is persisted (text only) and carries NO audio column. 
+ from mostlyright.weather.earnings.ledger import TranscriptLedger + + ledger = TranscriptLedger() + rows = ledger.read("CHWY", "evt-1") + assert len(rows) == 2 + assert rows[0]["text"] == "we grew RPO" + assert not any("audio" in col.lower() for col in ledger.column_names()) + # No row carries an audio-shaped key. + assert all(not any("audio" in str(k).lower() for k in row) for row in rows) + + +def test_stt_main_fails_loud_on_missing_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("STT_AUDIO_PATH", raising=False) + with pytest.raises(ValueError, match="STT_AUDIO_PATH"): + stt_job.main() + + +def test_stt_main_fails_loud_when_audio_missing(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + monkeypatch.setenv("STT_AUDIO_PATH", str(tmp_path / "nope.wav")) + monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + with pytest.raises(FileNotFoundError, match="STT_AUDIO_PATH"): + stt_job.main() + + +def test_stt_main_live_publish_opt_in(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + cache = tmp_path / "cache" + audio = tmp_path / "audio.wav" + audio.write_bytes(b"fake") + + class _FakeTranscriber: + def __init__(self, *a, **k): + pass + + def transcribe(self, audio_path, *, initial_prompt=None): + return _FakeTranscriptResult() + + published: list[object] = [] + + class _FakePublisher: + def __init__(self, _callable): + pass + + def publish(self, call_id, item): + published.append(item) + + def publish_end_of_call(self, call_id): + published.append("EOC") + + import mostlyright.weather.earnings.stt as sttmod + + import services.earnings.pubsub_bridge as bridge + + monkeypatch.setattr(sttmod, "SttTranscriber", _FakeTranscriber) + monkeypatch.setattr(bridge, "SegmentPublisher", _FakePublisher) + monkeypatch.setattr(bridge, "build_publisher_client", lambda project, topic: object()) + + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + monkeypatch.setenv("STT_AUDIO_PATH", str(audio)) + 
monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + monkeypatch.setenv("EARNINGS_STREAMING_ENABLED", "true") + monkeypatch.setenv("EARNINGS_STREAMING_PROJECT", "mr-earnings-ingest") + + assert stt_job.main() == 0 + # Two segments + one end-of-call marker. + assert len(published) == 3 + assert published[-1] == "EOC" + + +# --------------------------------------------------------------------------- +# rolefact.main +# --------------------------------------------------------------------------- +def _seed_transcript(cache_dir, ticker: str, call_id: str, texts: list[str]) -> None: + from mostlyright.weather.earnings.ledger import TranscriptLedger + + rows = [ + { + "ticker": ticker, + "call_id": call_id, + "segment_index": i, + "segment": "batch", + "text": t, + "is_final": True, + "source": "earnings_call", + "delivery": "hosted", + } + for i, t in enumerate(texts) + ] + TranscriptLedger().append(rows, ticker=ticker, call_id=call_id) + + +def test_rolefact_main_builds_facts_no_audio(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + _seed_transcript(cache, "CHWY", "evt-1", ["we mentioned tariff twice", "tariff again here"]) + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-1") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + monkeypatch.delenv("ROLEFACT_R2_BUCKET", raising=False) + + assert rolefact_job.main() == 0 + + from mostlyright.weather.earnings.ledger import FactLedger + + fl = FactLedger() + facts = fl.read("CHWY", "evt-1") + # 'tariff' appears once per segment -> two occurrences across the two segments. + assert len(facts) == 2 + assert all(f["term_canonical"] == "tariff" for f in facts) + # No audio field on the fact ledger. 
+ assert not any("audio" in col.lower() for col in fl.column_names()) + assert all(not any("audio" in str(k).lower() for k in f) for f in facts) + + +def test_rolefact_main_fails_loud_on_missing_transcript( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(tmp_path / "cache")) + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-missing") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + with pytest.raises(RuntimeError, match="no persisted transcript"): + rolefact_job.main() + + +def test_rolefact_main_fails_loud_on_missing_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("ROLEFACT_TICKER", raising=False) + with pytest.raises(ValueError, match="ROLEFACT_TICKER"): + rolefact_job.main() + + +def test_rolefact_main_bad_terms_json(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(tmp_path / "cache")) + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-1") + monkeypatch.setenv("ROLEFACT_TERMS", "not-json") + with pytest.raises(ValueError, match="ROLEFACT_TERMS is not valid JSON"): + rolefact_job.main() + + +def test_rolefact_main_r2_upload_opt_in_uploads_facts_only( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + _seed_transcript(cache, "CHWY", "evt-1", ["tariff here"]) + + uploaded: dict[str, object] = {} + + # Resolve the sink module via importlib (the dotted-path resolver) rather than + # an ``import a.b.c as x`` statement — the latter trips a namespace-package + # resolution quirk on ``mostlyright.weather.satellite``. rolefact imports + # ``upload`` from this module at call time, so patching the attribute here is + # what the job's lazy ``from ... import upload`` picks up. 
+ sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _fake_upload(local_path, bucket, key, *, r2_target=None): + uploaded["local_path"] = str(local_path) + uploaded["bucket"] = bucket + uploaded["key"] = key + return key + + monkeypatch.setattr(sink, "upload", _fake_upload) + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-1") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + monkeypatch.setenv("ROLEFACT_R2_BUCKET", "mostlyright-derived") + + assert rolefact_job.main() == 0 + # Only the derived FACT parquet is uploaded (never audio). + assert uploaded["key"] == "earnings/facts/CHWY/evt-1.parquet" + assert uploaded["local_path"].endswith("evt-1.parquet") + assert "audio" not in uploaded["local_path"].lower() From 787bb0be3517742644c3189b3bf124ca6a512e38 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 19:15:18 +0200 Subject: [PATCH 03/17] feat(28-21): settlement-station backfill roster + CLI --roster/shard/--incremental MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Committed 66-station Kalshi∪Polymarket roster (D-28.8, drift-checked vs the live markets catalogs). backfill --roster resolves + shards by BATCH_TASK_INDEX (one array-task per station); --incremental yesterday scopes to the current year with resume; --progress-bucket accepted (GCS marker wiring TODO'd, 28-21 C4). The shipped infra/batch.tf container args now run. 
--- .../mostlyright/weather/satellite/__main__.py | 250 ++++++++++++++-- .../mostlyright/weather/satellite/_roster.py | 273 ++++++++++++++++++ .../tests/satellite/test_cli_roster.py | 261 +++++++++++++++++ .../weather/tests/satellite/test_roster.py | 136 +++++++++ 4 files changed, 902 insertions(+), 18 deletions(-) create mode 100644 packages/weather/src/mostlyright/weather/satellite/_roster.py create mode 100644 packages/weather/tests/satellite/test_cli_roster.py create mode 100644 packages/weather/tests/satellite/test_roster.py diff --git a/packages/weather/src/mostlyright/weather/satellite/__main__.py b/packages/weather/src/mostlyright/weather/satellite/__main__.py index c0b3afc..9a27fa3 100644 --- a/packages/weather/src/mostlyright/weather/satellite/__main__.py +++ b/packages/weather/src/mostlyright/weather/satellite/__main__.py @@ -7,6 +7,19 @@ and the D9 ``--mirror aws|gcp`` transport selector (default ``aws``, validated by argparse ``choices`` so an unknown mirror is rejected BEFORE any run). Dispatches to :func:`_backfill.bulk_backfill`. + + Two invocation modes (28-21): + * **Explicit** — pass ``--satellites/--products/--stations/--year-start/ + --year-end/--out`` (all required). Unchanged from Phase 25. + * **Roster** — pass ``--roster NAME`` (e.g. ``kalshi,polymarket``) and the + CLI resolves the committed settlement-station roster, selects THIS + array-task's shard (``--shard-index``/``--shard-count``, else the Cloud + Batch ``BATCH_TASK_INDEX``/``BATCH_TASK_COUNT`` env, else the whole + roster), and supplies documented defaults for satellites/products/years/ + out. ``--incremental yesterday`` scopes it to the current UTC year with + resume forced on. A ``--r2-bucket`` in roster mode ENABLES the R2 upload + sink (the fleet's whole purpose). This is the path the shipped + ``infra/batch.tf`` invokes. - ``probe`` — the on-demand / live throughput probe (D10 SAT-25-11). 
Measures the anonymous-throttle / diminishing-returns knee against the LIVE NOAA buckets and writes the SOURCE-LIMITS findings artifact + satellite section @@ -23,10 +36,33 @@ from __future__ import annotations import argparse +import os import sys +from datetime import UTC, datetime from pathlib import Path from ._backfill import bulk_backfill +from ._roster import resolve_roster, shard_roster + +#: Default satellites for roster/incremental mode. batch.tf passes NO +#: ``--satellites``, so roster mode must supply the canonical native-ring +#: default. The ACMC-only MVP (project memory) runs on the GOES East/West pair — +#: the two operational GOES platforms — so the fleet covers the Americas span the +#: settlement stations concentrate in. (Explicit-args mode still requires +#: ``--satellites``; this default ONLY applies when ``--roster`` is given.) +_DEFAULT_ROSTER_SATELLITES: tuple[str, ...] = ("goes16", "goes18") + +#: Default products for roster/incremental mode: the ACMC-only MVP cloud mask +#: (``ABI-L2-ACMC``, the cheap CONUS product — matches the ``_sources`` GOES +#: ``default_product``). batch.tf passes NO ``--products``. +_DEFAULT_ROSTER_PRODUCTS: tuple[str, ...] = ("ABI-L2-ACMC",) + +#: Default historical backfill start year for roster mode (batch.tf passes NO +#: ``--year-start``). GOES-16 first light is 2017; the fleet backfills the full +#: modern GOES record. The per-slice ``available_since`` clamp in ``_backfill`` +#: skips any (satellite, month) before a platform's first-light with no I/O, so a +#: conservative floor here is safe. year_end defaults to the current UTC year. 
+_DEFAULT_ROSTER_YEAR_START: int = 2017 def _split_csv(value: str) -> list[str]: @@ -47,27 +83,86 @@ def _build_parser() -> argparse.ArgumentParser: help="Fleet bulk backfill — per-(satellite,year,month) slices, resume, " "Thread/Process split, --mirror aws|gcp.", ) + # NOTE (28-21): --satellites/--products/--stations/--year-start/--year-end + # are REQUIRED in explicit mode but OPTIONAL when --roster is given (roster + # mode supplies documented defaults — batch.tf passes none of them). The + # required-ness is enforced in _run_backfill, not by argparse ``required=``, + # so the two modes can share one subparser. Explicit mode stays byte-identical. bf.add_argument( "--satellites", type=_split_csv, - required=True, - help="Comma-separated satellites, e.g. goes16,goes19 (GOES-East) or goes18 (GOES-West).", + default=None, + help="Comma-separated satellites, e.g. goes16,goes19 (GOES-East) or goes18 " + "(GOES-West). REQUIRED in explicit mode; in --roster mode defaults to the " + "native-ring default (goes16,goes18 — the operational GOES pair).", ) bf.add_argument( "--products", type=_split_csv, - required=True, - help="Comma-separated ABI L2 products, e.g. ABI-L2-ACMC.", + default=None, + help="Comma-separated ABI L2 products, e.g. ABI-L2-ACMC. REQUIRED in " + "explicit mode; in --roster mode defaults to ABI-L2-ACMC (the ACMC-only MVP).", ) bf.add_argument( "--stations", type=_split_csv, - required=True, - help="Comma-separated ICAO/NWS station codes, e.g. KNYC.", + default=None, + help="Comma-separated ICAO/NWS station codes, e.g. KNYC. REQUIRED in " + "explicit mode; MUTUALLY EXCLUSIVE with --roster (the roster supplies the " + "shard's stations).", + ) + bf.add_argument("--year-start", type=int, default=None, dest="year_start") + bf.add_argument("--year-end", type=int, default=None, dest="year_end") + bf.add_argument( + "--out", + type=Path, + default=None, + help="Cache root output dir. 
REQUIRED in explicit mode; in --roster mode " + "defaults to the resolved home/env cache root (MOSTLYRIGHT_CACHE_DIR).", + ) + # ---- 28-21 roster / shard / progress / incremental -------------------- + bf.add_argument( + "--roster", + default=None, + metavar="NAME", + help="Resolve the committed settlement-station roster NAME (e.g. " + "'kalshi,polymarket') and back-fill only THIS array-task's shard. The " + "shard is selected by --shard-index/--shard-count, else the Cloud Batch " + "env (BATCH_TASK_INDEX/BATCH_TASK_COUNT), else the whole roster " + "(index=0,count=1). Mutually exclusive with --stations.", + ) + bf.add_argument( + "--shard-index", + type=int, + default=None, + dest="shard_index", + help="0-based shard index for --roster (default: env BATCH_TASK_INDEX, else 0).", + ) + bf.add_argument( + "--shard-count", + type=int, + default=None, + dest="shard_count", + help="Total shard count for --roster (default: env BATCH_TASK_COUNT, else 1).", + ) + bf.add_argument( + "--progress-bucket", + default=None, + dest="progress_bucket", + metavar="BUCKET", + help="Durable GCS completion-marker bucket for crash-safe resume (C4). " + "Accepted in roster/incremental mode; see the 28-21 C4 note in " + "_run_backfill for its current wiring status.", + ) + bf.add_argument( + "--incremental", + choices=["yesterday"], + default=None, + help="Scope the run to a recent window instead of the full historical " + "backfill. 'yesterday' sets year_start=year_end=current UTC year and " + "forces resume=True so only new/missing partitions are fetched. 
" + "(Day-granular incremental is a deferred 28-22 SDK follow-up.)", ) - bf.add_argument("--year-start", type=int, required=True, dest="year_start") - bf.add_argument("--year-end", type=int, required=True, dest="year_end") - bf.add_argument("--out", type=Path, required=True, help="Cache root output dir.") bf.add_argument( "--max-workers", type=int, @@ -151,23 +246,142 @@ def _build_parser() -> argparse.ArgumentParser: return parser +def _default_out() -> Path: + """Resolve the default ``--out`` cache root for roster/incremental mode. + + batch.tf passes no ``--out``, so roster mode defaults to the SAME home/env + cache root the cache tier resolves (honoring ``MOSTLYRIGHT_CACHE_DIR``), so + the fleet's partitions land in the canonical cache layout. + """ + from mostlyright._internal._cache_dir import resolve_cache_root_without_v1 + + return resolve_cache_root_without_v1() + + +def _resolve_shard_index_count(args: argparse.Namespace) -> tuple[int, int]: + """Resolve (shard_index, shard_count) for --roster mode. + + Precedence: explicit ``--shard-index``/``--shard-count`` > the Cloud Batch + env (``BATCH_TASK_INDEX``/``BATCH_TASK_COUNT``) > the default ``(0, 1)`` (the + whole roster — e.g. the incremental Cloud Run Job, which is NOT an array job). + Index and count are resolved INDEPENDENTLY (either can come from its flag or + its env). Non-integer env values raise a clear error. 
+ """ + + def _from(flag: int | None, env_name: str, default: int) -> int: + if flag is not None: + return flag + raw = os.environ.get(env_name) + if raw is None or raw == "": + return default + try: + return int(raw) + except ValueError as exc: + raise ValueError(f"env {env_name}={raw!r} is not an integer") from exc + + index = _from(args.shard_index, "BATCH_TASK_INDEX", 0) + count = _from(args.shard_count, "BATCH_TASK_COUNT", 1) + return index, count + + def _run_backfill(args: argparse.Namespace) -> int: + # --- 28-21: reconcile roster mode vs the explicit-args mode -------------- + if args.roster is not None and args.stations is not None: + raise ValueError( + "--roster and --stations are mutually exclusive: the roster supplies " + "this shard's stations, so do not also pass --stations." + ) + + resume = args.resume + year_start = args.year_start + year_end = args.year_end + + # --incremental yesterday: year-granular resume window (28-22 deferred the + # true day-granular incremental). Force resume so only new/missing partitions + # for the CURRENT UTC year are fetched. + if args.incremental == "yesterday": + current_year = datetime.now(UTC).year + year_start = current_year + year_end = current_year + resume = True + + if args.roster is not None: + # Roster mode: resolve the committed roster, select this task's shard, + # and supply documented defaults for the params batch.tf does NOT pass. + roster = resolve_roster(args.roster) # raises loud ValueError on unknown + index, count = _resolve_shard_index_count(args) + stations = list(shard_roster(roster, index, count)) + satellites = args.satellites or list(_DEFAULT_ROSTER_SATELLITES) + products = args.products or list(_DEFAULT_ROSTER_PRODUCTS) + if year_start is None: + year_start = _DEFAULT_ROSTER_YEAR_START + if year_end is None: + year_end = datetime.now(UTC).year + out = args.out if args.out is not None else _default_out() + else: + # Explicit mode: unchanged contract — all of these are REQUIRED. 
+ missing = [ + name + for name, val in ( + ("--satellites", args.satellites), + ("--products", args.products), + ("--stations", args.stations), + ("--year-start", year_start), + ("--year-end", year_end), + ("--out", args.out), + ) + if val is None + ] + if missing: + raise ValueError( + f"missing required argument(s) {missing} (required in explicit mode; " + f"pass --roster to run the committed settlement-station roster instead)" + ) + satellites = args.satellites + products = args.products + stations = args.stations + out = args.out + kwargs: dict = { - "satellites": args.satellites, - "products": args.products, - "stations": args.stations, - "year_start": args.year_start, - "year_end": args.year_end, - "out": args.out, - "resume": args.resume, + "satellites": satellites, + "products": products, + "stations": stations, + "year_start": year_start, + "year_end": year_end, + "out": out, + "resume": resume, "executor": args.executor, "mirror": args.mirror, } if args.max_workers is not None: kwargs["max_workers"] = args.max_workers - # 28-20: thread the OPT-IN R2 sink target. Off (None) unless --r2-target. - if getattr(args, "r2_target", False): + + # 28-20/28-21: the OPT-IN R2 sink. Explicit mode keeps the pre-28-20 gate + # (--r2-target flag required, backward compatible). Roster mode is the fleet + # upload path: batch.tf passes ONLY --r2-bucket and EXPECTS the derived + # parquet to be uploaded, so a --r2-bucket in roster mode ENABLES the sink + # even without the explicit --r2-target flag. + r2_enabled = bool(getattr(args, "r2_target", False)) or ( + args.roster is not None and args.r2_bucket is not None + ) + if r2_enabled: kwargs["r2_target"] = args.r2_bucket + + # C4 (28-21): --progress-bucket is the durable GCS completion-marker bucket + # for crash-safe resume. 
bulk_backfill takes a pluggable ``progress_store`` + # (not a bucket NAME), and the GCS-backed ProgressStore is NOT yet wired to a + # bucket string here, so we thread it only as far as validation and record + # the gap loudly rather than silently dropping it. + # TODO(28-21 C4): GCS marker bucket not yet consumed by bulk_backfill — + # construct a GCS-backed _progress.ProgressStore(args.progress_bucket) and + # pass it as progress_store= once the durable store lands. + if args.progress_bucket is not None: + print( + f"note: --progress-bucket {args.progress_bucket!r} accepted but not yet " + f"consumed by bulk_backfill (28-21 C4 follow-up); using the local JSON " + f"progress file under --out for this run." + ) + result = bulk_backfill(**kwargs) print( f"backfill done: {result.slices_completed} slices completed, " diff --git a/packages/weather/src/mostlyright/weather/satellite/_roster.py b/packages/weather/src/mostlyright/weather/satellite/_roster.py new file mode 100644 index 0000000..73a4d81 --- /dev/null +++ b/packages/weather/src/mostlyright/weather/satellite/_roster.py @@ -0,0 +1,273 @@ +"""Committed Kalshi/Polymarket settlement-station roster for the fleet backfill. + +Phase 28 (28-21). The hosted weather backfill runs as a Cloud Batch ARRAY JOB: +``task_count = 66`` array tasks, one shard per settlement station (D-28.8). Each +array task resolves its shard from a STABLE, DETERMINISTIC roster so that shard +``N`` always maps to the same station across Spot retries — a stable +shard-index→station mapping is load-bearing for crash-safe resume (a retried +shard must re-derive the SAME station's partitions, never a different one). 
+ +**Source of truth (why this is a committed snapshot, NOT a runtime import).** +The canonical roster is the union of two live ``markets``-package catalogs: + + - ``markets.catalog.kalshi_stations.KALSHI_SETTLEMENT_STATIONS`` — the Kalshi + NHIGH/NLOW settlement stations (values are ``StationCitation`` objects whose + ``.station`` is the 4-letter ICAO). + - ``markets.polymarket.load_polymarket_city_stations()`` — the Polymarket + city→role→ICAO map (inner values are the ICAO strings). + +The union of those two — sorted, deduped — is EXACTLY the 66 ICAOs frozen below +(D-28.8; matches ``infra/batch.tf`` ``task_count = 66``). We snapshot it here, +in ``packages/weather``, rather than importing ``markets`` at runtime because: + + 1. The satellite/weather deploy image MUST NOT pull the ``markets`` package + (dependency + audit isolation — the weather backfill has no business + importing the markets catalogs on the fleet). + 2. A frozen roster is deterministic and reviewable; drift is caught in CI. + +``tests/satellite/test_roster.py`` imports the LIVE ``markets`` catalogs and +asserts this snapshot still equals their sorted union, so any catalog drift +(a station added/removed upstream) fails CI loudly and forces a conscious +re-snapshot here — the roster can never silently diverge from the markets truth. +""" + +from __future__ import annotations + +__all__ = [ + "ROSTERS", + "SETTLEMENT_STATION_ROSTER", + "resolve_roster", + "shard_roster", +] + +#: The canonical Kalshi/Polymarket settlement-station roster (D-28.8): the 66 +#: station codes (4-letter ICAOs, plus ``HKO``) in the union of the live Kalshi + Polymarket +#: settlement catalogs, SORTED and deduped. Verified against the live ``markets`` +#: union by ``test_roster.py`` (drift fails CI). Sorted + frozen so the +#: shard-index→station mapping is stable across array-task retries. +SETTLEMENT_STATION_ROSTER: tuple[str, ...] 
= ( + "CYYZ", + "EDDM", + "EFHK", + "EGLC", + "EHAM", + "EPWA", + "FACT", + "HKO", + "KATL", + "KAUS", + "KBKF", + "KBNA", + "KBOS", + "KCVG", + "KDAL", + "KDCA", + "KDEN", + "KDFW", + "KDTW", + "KHOU", + "KIAH", + "KLAS", + "KLAX", + "KLGA", + "KMDW", + "KMIA", + "KMSP", + "KNYC", + "KORD", + "KPHL", + "KPHX", + "KSEA", + "KSFO", + "KSLC", + "LEMD", + "LFPB", + "LIMC", + "LLBG", + "LTAC", + "LTFM", + "MMMX", + "MPMG", + "NZWN", + "OEJN", + "OPKC", + "RCSS", + "RJTT", + "RKPK", + "RKSI", + "RPLL", + "SAEZ", + "SBGR", + "UUWW", + "VILK", + "WMKK", + "WSSS", + "ZBAA", + "ZGGG", + "ZGSZ", + "ZHCC", + "ZHHH", + "ZSJN", + "ZSPD", + "ZSQD", + "ZUCK", + "ZUUU", +) + + +# Committed sub-snapshot: the Kalshi membership of the union. This is NOT a +# second source of truth — ``test_roster.py`` asserts (a) the full union equals +# the live Kalshi/Polymarket catalog and (b) this set equals every live Kalshi +# settlement ICAO, so drift in either fails CI. The two markets overlap (six +# stations appear in both), which is why the union — not the sum — is 66. The +# split rosters below are derived from this set so they stay in lockstep. +_KALSHI_STATIONS: frozenset[str] = frozenset( + { + "KATL", + "KAUS", + "KBNA", + "KBOS", + "KCVG", + "KDCA", + "KDEN", + "KDFW", + "KDTW", + "KIAH", + "KLAS", + "KLAX", + "KMDW", + "KMIA", + "KMSP", + "KNYC", + "KPHL", + "KPHX", + "KSEA", + "KSFO", + "KSLC", + } +) + + +# Committed sub-snapshot: the Polymarket membership of the union (the inner +# ICAOs of ``load_polymarket_city_stations()``). Like ``_KALSHI_STATIONS`` this is +# NOT a second source of truth — ``test_roster.py`` asserts it equals the live +# Polymarket catalog. Kalshi ∪ Polymarket == the 66 (the two overlap), which is +# why ``len(_KALSHI_STATIONS)`` (21) + ``len(_POLYMARKET_STATIONS)`` (51) != 66. 
+_POLYMARKET_STATIONS: frozenset[str] = frozenset( + { + "CYYZ", + "EDDM", + "EFHK", + "EGLC", + "EHAM", + "EPWA", + "FACT", + "HKO", + "KATL", + "KAUS", + "KBKF", + "KDAL", + "KHOU", + "KLAX", + "KLGA", + "KMIA", + "KORD", + "KSEA", + "KSFO", + "LEMD", + "LFPB", + "LIMC", + "LLBG", + "LTAC", + "LTFM", + "MMMX", + "MPMG", + "NZWN", + "OEJN", + "OPKC", + "RCSS", + "RJTT", + "RKPK", + "RKSI", + "RPLL", + "SAEZ", + "SBGR", + "UUWW", + "VILK", + "WMKK", + "WSSS", + "ZBAA", + "ZGGG", + "ZGSZ", + "ZHCC", + "ZHHH", + "ZSJN", + "ZSPD", + "ZSQD", + "ZUCK", + "ZUUU", + } +) + + +#: The CLI ``--roster NAME`` registry. ``batch.tf`` passes the literal +#: ``"kalshi,polymarket"`` (the full union). The split names are provided as a +#: convenience; every value is a sorted slice of the committed snapshot. +ROSTERS: dict[str, tuple[str, ...]] = { + "kalshi,polymarket": SETTLEMENT_STATION_ROSTER, + "kalshi": tuple(s for s in SETTLEMENT_STATION_ROSTER if s in _KALSHI_STATIONS), + "polymarket": tuple(s for s in SETTLEMENT_STATION_ROSTER if s in _POLYMARKET_STATIONS), +} + + +def resolve_roster(name: str) -> tuple[str, ...]: + """Resolve a ``--roster`` name to its station tuple. + + Fails LOUD and EARLY (before any I/O) on an unknown roster name so a typo in + the Terraform container args surfaces as a clear error at startup rather than + a silent empty run. + + Args: + name: The roster name, e.g. ``"kalshi,polymarket"`` (the batch.tf value). + + Returns: + The sorted station tuple for that roster. + + Raises: + ValueError: ``name`` is not a registered roster. + """ + try: + return ROSTERS[name] + except KeyError: + raise ValueError(f"unknown roster {name!r}; expected one of {sorted(ROSTERS)}") from None + + +def shard_roster(roster: tuple[str, ...], index: int, count: int) -> tuple[str, ...]: + """Return this array-task's deterministic shard of ``roster``. + + Round-robin slice (``roster[index::count]``). 
Round-robin keeps every shard + NON-EMPTY whenever ``count <= len(roster)`` (each of the first ``len(roster)`` + shards gets ≥1 station), and — because it is a pure function of + ``(index, count)`` — shard ``index`` maps to the SAME stations across retries. + For ``count == len(roster)`` (the batch.tf ``task_count = 66`` case) each shard + is exactly one station. + + Args: + roster: The full station tuple (e.g. from :func:`resolve_roster`). + index: This task's 0-based shard index (``BATCH_TASK_INDEX``). + count: The total number of shards (``BATCH_TASK_COUNT``). + + Returns: + The stations owned by shard ``index`` (possibly empty when + ``count > len(roster)`` and ``index >= len(roster)``). + + Raises: + ValueError: ``count < 1`` or ``index`` is out of ``[0, count)``. + """ + if count < 1: + raise ValueError(f"shard count must be >= 1; got {count}") + if not (0 <= index < count): + raise ValueError( + f"shard index {index} out of range for count {count} (need 0 <= index < count)" + ) + return roster[index::count] diff --git a/packages/weather/tests/satellite/test_cli_roster.py b/packages/weather/tests/satellite/test_cli_roster.py new file mode 100644 index 0000000..1af7864 --- /dev/null +++ b/packages/weather/tests/satellite/test_cli_roster.py @@ -0,0 +1,261 @@ +"""CLI roster/shard/incremental tests (28-21). + +Argparse-level tests: call ``main([...])`` with the module-level ``bulk_backfill`` +monkeypatched so no network / no cache I/O happens. We assert the CLI resolves +the roster shard, defaults, incremental window, and R2 enablement into the exact +kwargs ``bulk_backfill`` receives — proving the shipped ``infra/batch.tf`` +container args actually run. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass + +import pytest +from mostlyright.weather.satellite._roster import ( + SETTLEMENT_STATION_ROSTER, + resolve_roster, + shard_roster, +) + +# ``__main__`` imports ``_backfill`` which imports the transport (_goes_s3 → +# boto3/s3fs) at module scope, so importing the CLI requires the [satellite] +# optional extra. In the base no-extra CI fast-suite skip this whole module +# cleanly — the dedicated satellite-coverage lane installs the extra and runs +# these. Mirrors test_satellite_backfill.py. +try: + from mostlyright.weather.satellite import __main__ as cli + + _HAVE_SATELLITE_DEPS = True +except ImportError: # pragma: no cover - exercised only without the extra + cli = None # type: ignore[assignment] + _HAVE_SATELLITE_DEPS = False + +pytestmark = pytest.mark.skipif( + not _HAVE_SATELLITE_DEPS, + reason="satellite CLI tests require the [satellite] optional extra (boto3/s3fs)", +) + + +@dataclass +class _FakeResult: + slices_completed: int = 0 + slices_skipped_resume: int = 0 + total_rows_written: int = 0 + duration_s: float = 0.0 + + +@pytest.fixture +def captured(monkeypatch): + """Monkeypatch the module-level bulk_backfill; capture its kwargs.""" + calls: list[dict] = [] + + def _fake(**kwargs): + calls.append(kwargs) + return _FakeResult() + + monkeypatch.setattr(cli, "bulk_backfill", _fake) + return calls + + +def test_batch_tf_backfill_args_run(captured, monkeypatch): + """The exact backfill container args from batch.tf parse + dispatch.""" + # batch.tf appends these to the `backfill` entrypoint; shard via flags here. 
+ rc = cli.main( + [ + "backfill", + "--mirror", + "gcp", + "--roster", + "kalshi,polymarket", + "--r2-bucket", + "b", + "--shard-index", + "0", + "--shard-count", + "66", + ] + ) + assert rc == 0 + assert len(captured) == 1 + kw = captured[0] + expected_shard = list(shard_roster(resolve_roster("kalshi,polymarket"), 0, 66)) + assert kw["stations"] == expected_shard + assert len(kw["stations"]) == 1 # count == len(roster) -> one station/shard + assert kw["mirror"] == "gcp" + # roster mode defaults + assert kw["satellites"] == list(cli._DEFAULT_ROSTER_SATELLITES) + assert kw["products"] == list(cli._DEFAULT_ROSTER_PRODUCTS) + assert kw["year_start"] == cli._DEFAULT_ROSTER_YEAR_START + # --r2-bucket in roster mode ENABLES the sink even without --r2-target. + assert kw["r2_target"] == "b" + + +def test_shard_from_batch_task_env(captured, monkeypatch): + """Shard index/count come from BATCH_TASK_INDEX/COUNT when flags are absent.""" + monkeypatch.setenv("BATCH_TASK_INDEX", "5") + monkeypatch.setenv("BATCH_TASK_COUNT", "66") + rc = cli.main( + ["backfill", "--mirror", "gcp", "--roster", "kalshi,polymarket", "--r2-bucket", "b"] + ) + assert rc == 0 + kw = captured[0] + expected_shard = list(shard_roster(resolve_roster("kalshi,polymarket"), 5, 66)) + assert kw["stations"] == expected_shard + + +def test_shard_index_flag_overrides_env(captured, monkeypatch): + """Explicit --shard-index/--shard-count win over the env.""" + monkeypatch.setenv("BATCH_TASK_INDEX", "5") + monkeypatch.setenv("BATCH_TASK_COUNT", "66") + rc = cli.main( + [ + "backfill", + "--roster", + "kalshi,polymarket", + "--shard-index", + "2", + "--shard-count", + "8", + ] + ) + assert rc == 0 + kw = captured[0] + assert kw["stations"] == list(shard_roster(resolve_roster("kalshi,polymarket"), 2, 8)) + + +def test_roster_no_shard_defaults_to_whole_roster(captured, monkeypatch): + """No shard flags + no env -> whole roster (index=0, count=1), e.g. 
Cloud Run Job.""" + monkeypatch.delenv("BATCH_TASK_INDEX", raising=False) + monkeypatch.delenv("BATCH_TASK_COUNT", raising=False) + rc = cli.main(["backfill", "--roster", "kalshi,polymarket", "--r2-bucket", "b"]) + assert rc == 0 + kw = captured[0] + assert kw["stations"] == list(SETTLEMENT_STATION_ROSTER) + + +def test_incremental_yesterday_single_year_resume(captured, monkeypatch): + """--incremental yesterday sets a single-year window and forces resume.""" + from datetime import UTC, datetime + + year = datetime.now(UTC).year + monkeypatch.delenv("BATCH_TASK_INDEX", raising=False) + monkeypatch.delenv("BATCH_TASK_COUNT", raising=False) + rc = cli.main( + [ + "backfill", + "--mirror", + "gcp", + "--roster", + "kalshi,polymarket", + "--incremental", + "yesterday", + "--r2-bucket", + "b", + ] + ) + assert rc == 0 + kw = captured[0] + assert kw["year_start"] == year + assert kw["year_end"] == year + assert kw["resume"] is True + + +def test_roster_and_stations_mutually_exclusive(captured): + """--roster + --stations raises a clear error before any dispatch.""" + with pytest.raises(ValueError, match="mutually exclusive"): + cli.main( + [ + "backfill", + "--roster", + "kalshi,polymarket", + "--stations", + "KNYC", + ] + ) + assert captured == [] + + +def test_progress_bucket_accepted(captured): + """--progress-bucket is accepted without error (C4 threading TODO).""" + rc = cli.main( + [ + "backfill", + "--mirror", + "gcp", + "--roster", + "kalshi,polymarket", + "--progress-bucket", + "marker-bkt", + "--r2-bucket", + "b", + "--shard-index", + "0", + "--shard-count", + "66", + ] + ) + assert rc == 0 + assert len(captured) == 1 + + +def test_explicit_mode_unchanged(captured, tmp_path): + """Explicit-args path stays backward compatible (no roster).""" + rc = cli.main( + [ + "backfill", + "--satellites", + "goes16", + "--products", + "ABI-L2-ACMC", + "--stations", + "KNYC", + "--year-start", + "2020", + "--year-end", + "2020", + "--out", + str(tmp_path), + ] + ) + assert 
rc == 0 + kw = captured[0] + assert kw["satellites"] == ["goes16"] + assert kw["stations"] == ["KNYC"] + assert kw["year_start"] == 2020 + # No --r2-target -> sink stays OFF (byte-identical to pre-28-20). + assert "r2_target" not in kw + + +def test_explicit_mode_missing_required_raises(captured): + """Explicit mode still requires the core args when no --roster.""" + with pytest.raises(ValueError, match="missing required argument"): + cli.main(["backfill", "--satellites", "goes16"]) + assert captured == [] + + +def test_incremental_explicit_mode(captured, tmp_path): + """--incremental works in explicit mode too (year window override + resume).""" + from datetime import UTC, datetime + + year = datetime.now(UTC).year + rc = cli.main( + [ + "backfill", + "--satellites", + "goes16", + "--products", + "ABI-L2-ACMC", + "--stations", + "KNYC", + "--out", + str(tmp_path), + "--incremental", + "yesterday", + ] + ) + assert rc == 0 + kw = captured[0] + assert kw["year_start"] == year + assert kw["year_end"] == year + assert kw["resume"] is True diff --git a/packages/weather/tests/satellite/test_roster.py b/packages/weather/tests/satellite/test_roster.py new file mode 100644 index 0000000..fe864f2 --- /dev/null +++ b/packages/weather/tests/satellite/test_roster.py @@ -0,0 +1,136 @@ +"""Roster snapshot tests (28-21). + +The committed ``SETTLEMENT_STATION_ROSTER`` MUST stay byte-equal to the live +``markets`` Kalshi/Polymarket union. These tests import the live catalogs and +assert the snapshot matches, so ANY upstream catalog drift (a station added or +removed) fails CI and forces a conscious re-snapshot in ``_roster.py`` — the +weather fleet's roster can never silently diverge from the markets truth. 
+""" + +from __future__ import annotations + +import pytest +from mostlyright.weather.satellite._roster import ( + ROSTERS, + SETTLEMENT_STATION_ROSTER, + resolve_roster, + shard_roster, +) + + +def _live_kalshi_stations() -> set[str]: + from mostlyright.markets.catalog.kalshi_stations import KALSHI_SETTLEMENT_STATIONS + + return {c.station for c in KALSHI_SETTLEMENT_STATIONS.values()} + + +def _live_polymarket_stations() -> set[str]: + from mostlyright.markets.polymarket import load_polymarket_city_stations + + return {icao for roles in load_polymarket_city_stations().values() for icao in roles.values()} + + +def test_roster_equals_live_union() -> None: + """The committed roster == the sorted live Kalshi/Polymarket union (drift gate).""" + live_union = sorted(_live_kalshi_stations() | _live_polymarket_stations()) + assert list(SETTLEMENT_STATION_ROSTER) == live_union + + +def test_roster_count_is_66() -> None: + """D-28.8 / batch.tf task_count = 66.""" + assert len(SETTLEMENT_STATION_ROSTER) == 66 + + +def test_every_kalshi_settlement_station_present() -> None: + """Every live Kalshi NHIGH/NLOW settlement station is in the roster.""" + for station in _live_kalshi_stations(): + assert station in SETTLEMENT_STATION_ROSTER + + +def test_every_polymarket_station_present() -> None: + """Every live Polymarket city ICAO is in the roster.""" + for station in _live_polymarket_stations(): + assert station in SETTLEMENT_STATION_ROSTER + + +def test_roster_sorted_and_deduped() -> None: + """The roster is sorted (stable shard mapping) and has no duplicates.""" + assert list(SETTLEMENT_STATION_ROSTER) == sorted(SETTLEMENT_STATION_ROSTER) + assert len(SETTLEMENT_STATION_ROSTER) == len(set(SETTLEMENT_STATION_ROSTER)) + + +def test_resolve_roster_kalshi_polymarket() -> None: + """The batch.tf literal 'kalshi,polymarket' resolves to the 66.""" + resolved = resolve_roster("kalshi,polymarket") + assert resolved == SETTLEMENT_STATION_ROSTER + assert len(resolved) == 66 + + +def 
test_resolve_roster_splits_match_live_catalogs() -> None: + """The convenience split rosters match the live per-market catalogs.""" + assert set(resolve_roster("kalshi")) == _live_kalshi_stations() + assert set(resolve_roster("polymarket")) == _live_polymarket_stations() + # Split rosters stay sorted slices of the union. + for name in ("kalshi", "polymarket"): + r = resolve_roster(name) + assert list(r) == sorted(r) + assert set(r) <= set(SETTLEMENT_STATION_ROSTER) + + +def test_resolve_roster_unknown_raises() -> None: + """An unknown roster name fails loud (before any I/O).""" + with pytest.raises(ValueError, match="unknown roster"): + resolve_roster("nope") + + +@pytest.mark.parametrize("count", [66, 8]) +def test_shard_roster_partitions_with_no_overlap_full_coverage(count: int) -> None: + """Sharding partitions the roster: no overlap, full coverage across all shards.""" + roster = SETTLEMENT_STATION_ROSTER + seen: list[str] = [] + for index in range(count): + shard = shard_roster(roster, index, count) + seen.extend(shard) + # Every station covered exactly once across the shards. 
+ assert sorted(seen) == sorted(roster) + assert len(seen) == len(set(seen)) # no overlap + + +def test_shard_roster_count_66_gives_one_station_each() -> None: + """With count == len(roster) every shard is exactly one station (batch.tf case).""" + roster = SETTLEMENT_STATION_ROSTER + for index in range(len(roster)): + shard = shard_roster(roster, index, len(roster)) + assert len(shard) == 1 + + +def test_shard_roster_deterministic() -> None: + """Shard index maps to the SAME stations across calls (retry-stable).""" + roster = SETTLEMENT_STATION_ROSTER + assert shard_roster(roster, 0, 8) == shard_roster(roster, 0, 8) + assert shard_roster(roster, 3, 8) == shard_roster(roster, 3, 8) + + +def test_shard_roster_nonempty_when_count_le_len() -> None: + """Round-robin keeps every shard non-empty when count <= len(roster).""" + roster = SETTLEMENT_STATION_ROSTER + for count in (8, 66): + for index in range(count): + assert len(shard_roster(roster, index, count)) >= 1 + + +@pytest.mark.parametrize( + "index,count", + [(-1, 8), (8, 8), (0, 0), (1, 0), (5, 3)], +) +def test_shard_roster_invalid_index_count_raises(index: int, count: int) -> None: + """Out-of-range index or non-positive count fails loud.""" + with pytest.raises(ValueError): + shard_roster(SETTLEMENT_STATION_ROSTER, index, count) + + +def test_rosters_dict_names() -> None: + """The CLI roster registry exposes the batch.tf name + the cheap splits.""" + assert "kalshi,polymarket" in ROSTERS + assert "kalshi" in ROSTERS + assert "polymarket" in ROSTERS From dd13aed5409f409ff83e15c6d3bcf18d36b3e5e6 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 19:15:19 +0200 Subject: [PATCH 04/17] feat(28): manual-only per-service deploy workflows + deploy-time IAM Per-service workflow_dispatch deploys (serving, capture, stt, rolefact, weather ingest); run-weather-backfill gates the full 66-shard ~28 TB fleet behind an explicit cost sign-off (1-station pilot default). 
deploy_iam.tf adds the Codex-flagged deploy-time grants: public run.invoker on the serving services (GATE #2), deploy-SA run.developer + act-as on runtime SAs + artifactregistry.writer. --- .github/workflows/deploy-earnings-capture.yml | 71 ++++++++ .../workflows/deploy-earnings-rolefact.yml | 68 ++++++++ .github/workflows/deploy-earnings-serving.yml | 104 +++++++++++ .github/workflows/deploy-earnings-stt.yml | 84 +++++++++ .github/workflows/deploy-weather-ingest.yml | 101 +++++++++++ .github/workflows/deploy.yml | 40 +++-- .github/workflows/run-weather-backfill.yml | 162 ++++++++++++++++++ infra/deploy_iam.tf | 148 ++++++++++++++++ infra/outputs.tf | 29 +++- 9 files changed, 785 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/deploy-earnings-capture.yml create mode 100644 .github/workflows/deploy-earnings-rolefact.yml create mode 100644 .github/workflows/deploy-earnings-serving.yml create mode 100644 .github/workflows/deploy-earnings-stt.yml create mode 100644 .github/workflows/deploy-weather-ingest.yml create mode 100644 .github/workflows/run-weather-backfill.yml create mode 100644 infra/deploy_iam.tf diff --git a/.github/workflows/deploy-earnings-capture.yml b/.github/workflows/deploy-earnings-capture.yml new file mode 100644 index 0000000..dd05205 --- /dev/null +++ b/.github/workflows/deploy-earnings-capture.yml @@ -0,0 +1,71 @@ +name: Deploy earnings-capture (28-10) + +# Phase 28 (28-10) — WIF build+deploy for the earnings CAPTURE Cloud Run Job +# (Chromium/ffmpeg webcast capture) in mr-earnings-ingest / eu-west3. This is the +# AUDIO side of the firewall (D-27.9): audio dies on the Job's ephemeral disk and +# NEVER gets an R2 key. Image build/push + Job image-swap only; the job's args, +# the static-egress VPC pin (IVS), secrets + SA are Terraform-owned (infra/). +# +# OPERATOR-GATED: the live capture pipeline (IVS edge, static egress IP) is +# validated by an operator with a scheduled live call (28-10 Task 3). 
This +# workflow ships the deploy path; it does not run a live capture. +# +# Setup Variables (from `tofu -chdir=infra output`): WIF_PROVIDER, DEPLOY_SA_INGEST, +# AR_HOST, INGEST_PROJECT_ID. + +on: + workflow_dispatch: + inputs: + image_tag: + description: "Image tag to build + deploy." + required: true + default: "latest" + type: string + +permissions: + id-token: write + contents: read + +env: + AR_HOST: ${{ vars.AR_HOST }} + AR_PROJECT: mostlyright-backend + AR_REPO: mostlyright + IMAGE_NAME: earnings-capture + JOB: earnings-capture + REGION: europe-west3 + +jobs: + deploy: + name: Build + push capture image, roll the capture Job + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Authenticate to GCP (WIF, keyless) + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ vars.WIF_PROVIDER }} + service_account: ${{ vars.DEPLOY_SA_INGEST }} + + - name: Set up gcloud + uses: google-github-actions/setup-gcloud@v2 + + - name: Configure Docker for Artifact Registry + run: gcloud auth configure-docker "${AR_HOST}" --quiet + + - name: Build capture image + run: | + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${{ inputs.image_tag }}" + echo "IMAGE=${IMAGE}" >> "$GITHUB_ENV" + docker build -f deploy/earnings/capture.Dockerfile -t "${IMAGE}" . 
+ + - name: Push image + run: docker push "${IMAGE}" + + - name: Deploy earnings-capture Job (image swap only; config is Terraform-owned) + run: | + gcloud run jobs deploy "${JOB}" \ + --project "${{ vars.INGEST_PROJECT_ID }}" \ + --region "${REGION}" \ + --image "${IMAGE}" \ + --quiet diff --git a/.github/workflows/deploy-earnings-rolefact.yml b/.github/workflows/deploy-earnings-rolefact.yml new file mode 100644 index 0000000..fbc3301 --- /dev/null +++ b/.github/workflows/deploy-earnings-rolefact.yml @@ -0,0 +1,68 @@ +name: Deploy earnings-rolefact (28-13) + +# Phase 28 (28-13) — WIF build+deploy for the role/fact-builder Cloud Run Job +# (CPU) in mr-earnings-ingest / eu-west3. POST-audio side of the firewall: it +# reads transcript text, builds derived facts, and writes transcript + fact +# parquet to R2 with the WRITE token — it holds NO audio and needs no audio +# toolchain. Image build/push + Job image-swap only; args + R2-write secrets + SA +# are Terraform-owned (infra/). +# +# Setup Variables (from `tofu -chdir=infra output`): WIF_PROVIDER, DEPLOY_SA_INGEST, +# AR_HOST, INGEST_PROJECT_ID. + +on: + workflow_dispatch: + inputs: + image_tag: + description: "Image tag to build + deploy." 
+ required: true + default: "latest" + type: string + +permissions: + id-token: write + contents: read + +env: + AR_HOST: ${{ vars.AR_HOST }} + AR_PROJECT: mostlyright-backend + AR_REPO: mostlyright + IMAGE_NAME: earnings-rolefact + JOB: earnings-rolefact + REGION: europe-west3 + +jobs: + deploy: + name: Build + push rolefact image, roll the rolefact Job + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Authenticate to GCP (WIF, keyless) + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ vars.WIF_PROVIDER }} + service_account: ${{ vars.DEPLOY_SA_INGEST }} + + - name: Set up gcloud + uses: google-github-actions/setup-gcloud@v2 + + - name: Configure Docker for Artifact Registry + run: gcloud auth configure-docker "${AR_HOST}" --quiet + + - name: Build rolefact image + run: | + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${{ inputs.image_tag }}" + echo "IMAGE=${IMAGE}" >> "$GITHUB_ENV" + docker build -f deploy/earnings/rolefact.Dockerfile -t "${IMAGE}" . + + - name: Push image + run: docker push "${IMAGE}" + + - name: Deploy earnings-rolefact Job (image swap only; config is Terraform-owned) + run: | + gcloud run jobs deploy "${JOB}" \ + --project "${{ vars.INGEST_PROJECT_ID }}" \ + --region "${REGION}" \ + --image "${IMAGE}" \ + --quiet diff --git a/.github/workflows/deploy-earnings-serving.yml b/.github/workflows/deploy-earnings-serving.yml new file mode 100644 index 0000000..95e4018 --- /dev/null +++ b/.github/workflows/deploy-earnings-serving.yml @@ -0,0 +1,104 @@ +name: Deploy earnings-serving (28-12) + +# Phase 28 (28-12) — WIF-authenticated build+deploy for the earnings serving +# Cloud Run service (/transcripts /facts /capabilities /stream) in +# mr-serving/eu-west3. KEYLESS auth via Workload Identity Federation — no SA key +# files. 
The service resource + its R2-read-only secret wiring + EARNINGS_API_KEY +# + EARNINGS_STREAMING_SUBSCRIPTION + the H2 min=max=1 scaling all live in infra/ +# (cloud_run.tf earnings_serving); this workflow only builds+pushes the AUDIO-FREE +# image (deploy/earnings/serving.Dockerfile) and rolls a new revision. +# +# H2 (load-bearing): the SSE fan-out over ONE shared earnings-streaming +# subscription is correct ONLY at exactly one always-warm instance. The smoke +# step asserts min-instances=1 is preserved (a broken scaling config would +# silently split-brain the stream). Config is Terraform-owned; --image only swaps +# the container on the current config. +# +# Setup (repo/environment Variables, from `tofu -chdir=infra output`): +# WIF_PROVIDER = projects/PROJECT_NUMBER/locations/global/workloadIdentityPools/POOL/providers/PROVIDER +# DEPLOY_SA_SERVING = deploy@mr-serving... +# AR_HOST = europe-west3-docker.pkg.dev +# SERVING_PROJECT_ID = mr-serving + +on: + workflow_dispatch: + inputs: + image_tag: + description: "Image tag to build + deploy (e.g. a git SHA or 'latest')." 
+ required: true + default: "latest" + type: string + +permissions: + id-token: write + contents: read + +env: + AR_HOST: ${{ vars.AR_HOST }} + AR_PROJECT: mostlyright-backend + AR_REPO: mostlyright + IMAGE_NAME: earnings-serving + SERVICE: earnings-serving + REGION: europe-west3 + +jobs: + deploy: + name: Build + push audio-free image, roll earnings-serving revision + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Authenticate to GCP (WIF, keyless) + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ vars.WIF_PROVIDER }} + service_account: ${{ vars.DEPLOY_SA_SERVING }} + + - name: Set up gcloud + uses: google-github-actions/setup-gcloud@v2 + + - name: Configure Docker for Artifact Registry + run: gcloud auth configure-docker "${AR_HOST}" --quiet + + - name: Build audio-free earnings-serving image + run: | + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${{ inputs.image_tag }}" + echo "IMAGE=${IMAGE}" >> "$GITHUB_ENV" + # Build from the repo root so the Dockerfile can COPY packages/ + services/. + docker build \ + -f deploy/earnings/serving.Dockerfile \ + -t "${IMAGE}" \ + . + + - name: Assert the built image carries NO audio toolchain (firewall a) + run: | + # Defense in depth: the serving image must physically omit + # faster-whisper / av / ffmpeg / chromium (D-27.9). Fail the deploy if + # any slipped in via a transitive dep. 
+ if docker run --rm --entrypoint sh "${IMAGE}" -c \ + "pip list 2>/dev/null | grep -Eiq 'faster-whisper|^av |ffmpeg' && exit 1 || exit 0"; then + echo "audio-free image OK" + else + echo "::error::earnings-serving image contains an audio dependency — firewall a (D-27.9) breach" + exit 1 + fi + + - name: Push image + run: docker push "${IMAGE}" + + - name: Deploy revision (image swap only; config is Terraform-owned) + run: | + gcloud run deploy "${SERVICE}" \ + --project "${{ vars.SERVING_PROJECT_ID }}" \ + --region "${REGION}" \ + --image "${IMAGE}" \ + --quiet + + - name: Verify H2 min-instances=1 preserved (single always-warm SSE instance) + run: | + MIN=$(gcloud run services describe "${SERVICE}" \ + --project "${{ vars.SERVING_PROJECT_ID }}" \ + --region "${REGION}" \ + --format="value(spec.template.metadata.annotations['autoscaling.knative.dev/minScale'])") + echo "min-instances = ${MIN:-0}" + test "${MIN:-0}" = "1" || { echo "::error::expected H2 min-instances=1 (single always-warm SSE instance)"; exit 1; } diff --git a/.github/workflows/deploy-earnings-stt.yml b/.github/workflows/deploy-earnings-stt.yml new file mode 100644 index 0000000..68bd6a9 --- /dev/null +++ b/.github/workflows/deploy-earnings-stt.yml @@ -0,0 +1,84 @@ +name: Deploy earnings-stt (28-11) + +# Phase 28 (28-11) — WIF build+deploy for the STT Cloud Run service (NVIDIA L4 +# GPU, scale-to-zero) in mr-earnings-ingest / us-central1 (L4 GPU is NOT in +# eu-west3 — 28-OPERATOR-INPUTS). faster-whisper / CTranslate2, NO torch (D-27.5). +# Image build/push + service image-swap only; the GPU config, bounded concurrency +# (≤ L4 quota, H8), secrets + SA are Terraform-owned (infra/cloud_run.tf stt). +# +# OPERATOR-GATED: the live GPU smoke (a real transcription on L4) is 28-11 Task 4 +# (operator, autonomous:false) — this workflow ships the deploy path only. 
+# +# NOTE: the STT image is CUDA-based and large; the GitHub-hosted runner builds it +# but does not need a GPU (the build installs the CUDA runtime + faster-whisper; +# it never runs inference here). +# +# Setup Variables (from `tofu -chdir=infra output`): WIF_PROVIDER, DEPLOY_SA_INGEST, +# AR_HOST, INGEST_PROJECT_ID. + +on: + workflow_dispatch: + inputs: + image_tag: + description: "Image tag to build + deploy." + required: true + default: "latest" + type: string + +permissions: + id-token: write + contents: read + +env: + AR_HOST: ${{ vars.AR_HOST }} + AR_PROJECT: mostlyright-backend + AR_REPO: mostlyright + IMAGE_NAME: earnings-stt + SERVICE: earnings-stt + REGION: us-central1 + +jobs: + deploy: + name: Build + push STT (CUDA) image, roll the STT service + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Authenticate to GCP (WIF, keyless) + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ vars.WIF_PROVIDER }} + service_account: ${{ vars.DEPLOY_SA_INGEST }} + + - name: Set up gcloud + uses: google-github-actions/setup-gcloud@v2 + + - name: Configure Docker for Artifact Registry + run: gcloud auth configure-docker "${AR_HOST}" --quiet + + - name: Build STT (CUDA + faster-whisper, no torch) image + run: | + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${{ inputs.image_tag }}" + echo "IMAGE=${IMAGE}" >> "$GITHUB_ENV" + docker build -f deploy/earnings/stt.Dockerfile -t "${IMAGE}" . 
+ + - name: Assert the STT image has NO torch (D-27.5 — CTranslate2 only) + run: | + if docker run --rm --entrypoint sh "${IMAGE}" -c \ + "pip list 2>/dev/null | grep -Eiq '^torch ' && exit 1 || exit 0"; then + echo "no-torch OK" + else + echo "::error::earnings-stt image pulled torch — D-27.5 forbids it (faster-whisper/CTranslate2 only)" + exit 1 + fi + + - name: Push image + run: docker push "${IMAGE}" + + - name: Deploy earnings-stt service (image swap only; GPU config is Terraform-owned) + run: | + gcloud run deploy "${SERVICE}" \ + --project "${{ vars.INGEST_PROJECT_ID }}" \ + --region "${REGION}" \ + --image "${IMAGE}" \ + --quiet diff --git a/.github/workflows/deploy-weather-ingest.yml b/.github/workflows/deploy-weather-ingest.yml new file mode 100644 index 0000000..392b439 --- /dev/null +++ b/.github/workflows/deploy-weather-ingest.yml @@ -0,0 +1,101 @@ +name: Deploy weather-ingest image + incremental job (28-22) + +# Phase 28 (28-20/28-22) — WIF-authenticated build+push of the SHARED weather +# ingest image and deploy of the daily INCREMENTAL Cloud Run Job in +# mostlyright-satellite (H1) / us-central1. KEYLESS via Workload Identity +# Federation — no SA key files. +# +# ONE image (deploy/weather/ingest.Dockerfile) backs BOTH the backfill fleet +# (weather-backfill, Cloud Batch — submitted by run-weather-backfill.yml) and the +# incremental job (weather-incremental, Cloud Run Job — deployed here). Infra +# references two AR image names (var.image_weather_backfill / +# image_weather_incremental), so this workflow builds ONCE and pushes the SAME +# image under BOTH names/tags — the backfill and incremental deploys then resolve +# the identical bytes. +# +# This is the CHEAP PATH (serving + daily incremental) — deploy it freely. The +# expensive 28 TB backfill fleet is gated behind run-weather-backfill.yml's cost +# sign-off, not this workflow. 
+# +# Setup (repo/environment Variables, from `tofu -chdir=infra output`): +# WIF_PROVIDER = projects/PROJECT_NUMBER/locations/global/workloadIdentityPools/POOL/providers/PROVIDER +# DEPLOY_SA_SATELLITE = deploy@mostlyright-satellite... (H1: EXISTING project) +# AR_HOST = europe-west3-docker.pkg.dev +# SATELLITE_PROJECT_ID = mostlyright-satellite + +on: + workflow_dispatch: + inputs: + image_tag: + description: "Image tag to build + push under both weather image names." + required: true + default: "latest" + type: string + deploy_incremental: + description: "Also roll the weather-incremental Cloud Run Job onto the new image." + required: true + default: true + type: boolean + +permissions: + id-token: write + contents: read + +env: + AR_HOST: ${{ vars.AR_HOST }} + AR_PROJECT: mostlyright-backend + AR_REPO: mostlyright + BACKFILL_IMAGE_NAME: weather-backfill + INCREMENTAL_IMAGE_NAME: weather-incremental + INCREMENTAL_JOB: weather-incremental + REGION: us-central1 + +jobs: + build-and-deploy: + name: Build once, push both names, roll the incremental job + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Authenticate to GCP (WIF, keyless) + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ vars.WIF_PROVIDER }} + service_account: ${{ vars.DEPLOY_SA_SATELLITE }} + + - name: Set up gcloud + uses: google-github-actions/setup-gcloud@v2 + + - name: Configure Docker for Artifact Registry + run: gcloud auth configure-docker "${AR_HOST}" --quiet + + - name: Build the shared ingest image + run: | + BASE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}" + BACKFILL_IMAGE="${BASE}/${BACKFILL_IMAGE_NAME}:${{ inputs.image_tag }}" + INCREMENTAL_IMAGE="${BASE}/${INCREMENTAL_IMAGE_NAME}:${{ inputs.image_tag }}" + echo "BACKFILL_IMAGE=${BACKFILL_IMAGE}" >> "$GITHUB_ENV" + echo "INCREMENTAL_IMAGE=${INCREMENTAL_IMAGE}" >> "$GITHUB_ENV" + docker build \ + -f deploy/weather/ingest.Dockerfile \ + -t "${BACKFILL_IMAGE}" \ + -t "${INCREMENTAL_IMAGE}" \ + . 
+ + - name: Push both image names (identical bytes) + run: | + docker push "${BACKFILL_IMAGE}" + docker push "${INCREMENTAL_IMAGE}" + + # Roll the daily incremental Cloud Run Job onto the new image (image swap + # only; the --roster/--incremental args + R2-write secrets + SA are + # Terraform-owned in infra/batch.tf). The backfill Batch job is NOT touched + # here — it is submitted separately, after cost sign-off. + - name: Deploy weather-incremental job (image swap only) + if: ${{ inputs.deploy_incremental }} + run: | + gcloud run jobs deploy "${INCREMENTAL_JOB}" \ + --project "${{ vars.SATELLITE_PROJECT_ID }}" \ + --region "${REGION}" \ + --image "${INCREMENTAL_IMAGE}" \ + --quiet diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 0105c03..76c4ac9 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -9,10 +9,16 @@ name: Deploy (hosted GCE platform) # and no SA key is stored in repo secrets. The WIF provider + per-project deploy SAs # are provisioned by the Terraform root in infra/ (28-00). # -# The image-build/push + `gcloud run deploy` / Cloud Batch / MIG steps are -# STUBBED here — later waves (W1 serving, W2 ingest/fleet) fill them in against -# the existing Artifact Registry -# (europe-west3-docker.pkg.dev/mostlyright-backend/mostlyright). 
+# Per-service build/push + deploy now live in dedicated manual-only workflows +# (this file stays a WIF-auth identity smoke + index): +# - deploy-earnings-serving.yml earnings-serving Cloud Run svc (mr-serving) +# - deploy-weather-serving.yml weather-serving Cloud Run svc (mr-serving) +# - deploy-earnings-capture.yml earnings-capture Cloud Run Job (ingest) +# - deploy-earnings-stt.yml earnings-stt Cloud Run GPU svc (ingest, us-central1) +# - deploy-earnings-rolefact.yml earnings-rolefact Cloud Run Job (ingest) +# - deploy-weather-ingest.yml weather ingest img + weather-incremental Job (satellite) +# - run-weather-backfill.yml weather-backfill Cloud Batch fleet (satellite; cost-gated) +# All push to europe-west3-docker.pkg.dev/mostlyright-backend/mostlyright. # # Setup (one-time, after `tofu apply` in infra/): # Set the following repo/environment variables (Settings -> Variables), read @@ -82,17 +88,17 @@ jobs: - name: Verify auth (identity smoke test) run: gcloud auth list --filter=status:ACTIVE --format="value(account)" - # --------------------------------------------------------------------- - # STUB — per-service deploy workflows fill these in (28-10/11/12/13/21/22/30): - # serving : earnings-serving + weather-serving → Cloud Run (eu-west3), - # timeout 3600, SSE max-instances=1 + affinity deploy check (H2). - # ingest : capture Job + rolefact Job (eu-west3) + STT Cloud Run GPU L4 - # (us-central1, bounded concurrency ≤ L4 quota, H8). - # satellite : weather backfill (Cloud Batch, us-central1) + incremental Job - # (H1: EXISTING mostlyright-satellite project). - # All push to europe-west3-docker.pkg.dev/mostlyright-backend/mostlyright. - # --------------------------------------------------------------------- - - name: Build & deploy (placeholder) + # Per-service build+deploy moved to the dedicated workflows listed in the + # header. 
This job stays a keyless-auth identity smoke: it proves the + # selected deploy SA can federate, then points the operator at the right + # per-service workflow to run. + - name: Identity smoke + workflow pointer run: | - echo "Deploy target: ${{ inputs.target }}" - echo "Image build + push + gcloud run deploy stubbed — filled in by W1/W2." + echo "Authenticated deploy SA for target '${{ inputs.target }}':" + gcloud auth list --filter=status:ACTIVE --format="value(account)" + case "${{ inputs.target }}" in + serving) echo "Run: deploy-earnings-serving.yml / deploy-weather-serving.yml" ;; + ingest) echo "Run: deploy-earnings-capture.yml / deploy-earnings-stt.yml / deploy-earnings-rolefact.yml" ;; + satellite) echo "Run: deploy-weather-ingest.yml (incremental) then run-weather-backfill.yml (fleet, cost-gated)" ;; + staging) echo "Staging is gated off (enable_staging=false) until the billing quota increase." ;; + esac diff --git a/.github/workflows/run-weather-backfill.yml b/.github/workflows/run-weather-backfill.yml new file mode 100644 index 0000000..32d1c07 --- /dev/null +++ b/.github/workflows/run-weather-backfill.yml @@ -0,0 +1,162 @@ +name: Run weather backfill fleet (28-21) + +# Phase 28 (28-21) — submit the weather backfill as a Cloud Batch job in +# mostlyright-satellite (H1) / us-central1. KEYLESS via Workload Identity +# Federation. +# +# ROLLOUT GATE (operator sequence): the DEFAULT run is a 1-STATION PILOT +# (task_count=1) — cheap, proves the read→reduce→R2-upload loop + the big-bytes +# firewall end to end. The FULL 66-shard fleet (the ~28 TB Kalshi∪Polymarket +# roster, D-28.8) reduces ~28 TB of raw imagery in-region and is the phase's +# largest spend — it is BLOCKED unless the operator sets mode=full AND +# confirm_cost_signoff=true (the H5 pilot cost sign-off). This encodes the +# "serving + incremental first → 1-station pilot → stop at the 28 TB cost number +# for sign-off" rollout as a workflow gate, not a convention. 
+# +# The Batch config below mirrors infra/batch.tf (Spot n2-standard-4, parallelism +# 16, 6h/shard ceiling, dedicated weather-backfill SA, R2-write + EUMETSAT +# secrets, durable progress bucket for crash-safe resume). Keep it in sync with +# infra/batch.tf if that spec changes. +# +# Setup (repo/environment Variables, from `tofu -chdir=infra output`): +# WIF_PROVIDER = (tofu output wif_provider_name) +# DEPLOY_SA_SATELLITE = deploy@mostlyright-satellite... +# AR_HOST = europe-west3-docker.pkg.dev +# SATELLITE_PROJECT_ID = mostlyright-satellite +# SATELLITE_PROJECT_NUMBER = (tofu output satellite_project_number) +# RUNTIME_SA_WEATHER_BACKFILL= (email of the weather-backfill runtime SA) +# R2_BUCKET = mostlyright-derived +# PROGRESS_BUCKET = mostlyright-backfill-progress-* (tofu output backfill_progress_bucket) + +on: + workflow_dispatch: + inputs: + image_tag: + description: "weather-backfill image tag to run (must already be pushed by deploy-weather-ingest)." + required: true + default: "latest" + type: string + mode: + description: "pilot = 1 station (cheap, default). full = the 66-shard ~28 TB fleet (needs cost sign-off)." + required: true + default: "pilot" + type: choice + options: + - pilot + - full + pilot_station: + description: "ICAO for the pilot (mode=pilot only), e.g. KNYC." + required: true + default: "KNYC" + type: string + confirm_cost_signoff: + description: "H5 pilot cost sign-off — REQUIRED true to run mode=full (the ~28 TB spend)." + required: true + default: false + type: boolean + +permissions: + id-token: write + contents: read + +env: + AR_HOST: ${{ vars.AR_HOST }} + AR_PROJECT: mostlyright-backend + AR_REPO: mostlyright + IMAGE_NAME: weather-backfill + REGION: us-central1 + +jobs: + submit: + name: Submit the backfill Batch job (pilot by default; full gated on cost sign-off) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Enforce the cost sign-off gate for a full run + run: | + if [ "${{ inputs.mode }}" = "full" ] && [ "${{ inputs.confirm_cost_signoff }}" != "true" ]; then + echo "::error::mode=full runs the 66-shard ~28 TB fleet (the phase's largest spend)."
+ echo "::error::Set confirm_cost_signoff=true (H5 pilot cost sign-off) to proceed, or use mode=pilot." + exit 1 + fi + echo "gate OK: mode=${{ inputs.mode }}" + + - name: Authenticate to GCP (WIF, keyless) + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ vars.WIF_PROVIDER }} + service_account: ${{ vars.DEPLOY_SA_SATELLITE }} + + - name: Set up gcloud + uses: google-github-actions/setup-gcloud@v2 + + - name: Build the Batch job config (task_count + roster args by mode) + run: | + set -euo pipefail + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${{ inputs.image_tag }}" + NUM="${{ vars.SATELLITE_PROJECT_NUMBER }}" + if [ "${{ inputs.mode }}" = "full" ]; then + TASK_COUNT=66 + ROSTER_ARGS='"--roster", "kalshi,polymarket"' + else + TASK_COUNT=1 + # Pilot: one explicit station (no roster sharding). + ROSTER_ARGS='"--stations", "${{ inputs.pilot_station }}"' + fi + cat > batch-job.json <` +# is denied. +# 4. Deploy SA → roles/artifactregistry.writer on the REUSED backend repo, so +# the per-service workflows can `docker push`. This AMENDS the reader-only +# posture (artifact_registry.tf, T-28-00-04): images are built+pushed FROM CI +# as the deploy SA (build-and-push-from-Actions), which needs writer on the +# target repo. Scoped to the one repo, additive, WRITER (not admin) — the +# backend project is otherwise untouched (D-28.1). +# +# Least privilege: run.developer (not run.admin); serviceAccountUser scoped to the +# specific runtime SA (not project-wide); writer (not admin) on one repo. + +# ===================================================================== +# 1. 
Public invoker on the two serving services (GATE #2, fail-closed auth) +# ===================================================================== +resource "google_cloud_run_v2_service_iam_member" "earnings_serving_public" { + project = google_cloud_run_v2_service.earnings_serving.project + location = google_cloud_run_v2_service.earnings_serving.location + name = google_cloud_run_v2_service.earnings_serving.name + role = "roles/run.invoker" + member = "allUsers" +} + +resource "google_cloud_run_v2_service_iam_member" "weather_serving_public" { + project = google_cloud_run_v2_service.weather_serving.project + location = google_cloud_run_v2_service.weather_serving.location + name = google_cloud_run_v2_service.weather_serving.name + role = "roles/run.invoker" + member = "allUsers" +} + +# NOTE: the STT service (google_cloud_run_v2_service.stt) is deliberately NOT +# granted public invoker — it is an internal ingest GPU workload, not an +# internet-facing surface (audio firewall). + +# ===================================================================== +# 2. Deploy SA → run.developer in each target project +# ===================================================================== +resource "google_project_iam_member" "deploy_serving_run_developer" { + project = google_project.serving.project_id + role = "roles/run.developer" + member = "serviceAccount:${google_service_account.deploy["serving"].email}" +} + +resource "google_project_iam_member" "deploy_ingest_run_developer" { + project = google_project.ingest.project_id + role = "roles/run.developer" + member = "serviceAccount:${google_service_account.deploy["ingest"].email}" +} + +# Satellite deploy SA rolls the weather-incremental Cloud Run Job (H1). 
+resource "google_project_iam_member" "deploy_satellite_run_developer" { + project = var.satellite_project_id + role = "roles/run.developer" + member = "serviceAccount:${google_service_account.deploy_satellite.email}" +} + +# Satellite deploy SA also SUBMITS the backfill Cloud Batch fleet +# (run-weather-backfill.yml). batch.jobsEditor is the submit/get/delete role. +resource "google_project_iam_member" "deploy_satellite_batch_editor" { + project = var.satellite_project_id + role = "roles/batch.jobsEditor" + member = "serviceAccount:${google_service_account.deploy_satellite.email}" +} + +# ===================================================================== +# 3. Deploy SA → act-as (serviceAccountUser) on each runtime SA it assigns +# ===================================================================== +# Serving deploy SA runs both serving services as the `serving` runtime SA. +resource "google_service_account_iam_member" "deploy_serving_actas_serving" { + service_account_id = google_service_account.serving.name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.deploy["serving"].email}" +} + +# Ingest deploy SA runs capture / stt / rolefact as their dedicated runtime SAs. 
+resource "google_service_account_iam_member" "deploy_ingest_actas_capture" { + service_account_id = google_service_account.earnings_capture.name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.deploy["ingest"].email}" +} + +resource "google_service_account_iam_member" "deploy_ingest_actas_stt" { + service_account_id = google_service_account.earnings_stt.name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.deploy["ingest"].email}" +} + +resource "google_service_account_iam_member" "deploy_ingest_actas_rolefact" { + service_account_id = google_service_account.earnings_rolefact.name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.deploy["ingest"].email}" +} + +# Satellite deploy SA runs the incremental Job + the backfill Batch fleet as +# their dedicated runtime SAs (both need act-as: the Job's service_account and +# the Batch allocation_policy.service_account). +resource "google_service_account_iam_member" "deploy_satellite_actas_incremental" { + service_account_id = google_service_account.weather_incremental.name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.deploy_satellite.email}" +} + +resource "google_service_account_iam_member" "deploy_satellite_actas_backfill" { + service_account_id = google_service_account.weather_backfill.name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.deploy_satellite.email}" +} + +# ===================================================================== +# 4. Deploy SA → artifactregistry.writer on the REUSED backend repo (CI push) +# ===================================================================== +# The created-project deploy SAs (serving + ingest [+ staging when enabled]). 
+resource "google_artifact_registry_repository_iam_member" "writer" { + for_each = google_service_account.deploy + + project = local.ar_project + location = local.ar_location + repository = local.ar_repository + role = "roles/artifactregistry.writer" + member = "serviceAccount:${each.value.email}" +} + +# The satellite deploy SA (EXISTING project, H1) also pushes the weather ingest +# image, so it needs writer too (it already has reader from wif.tf). +resource "google_artifact_registry_repository_iam_member" "writer_satellite" { + project = local.ar_project + location = local.ar_location + repository = local.ar_repository + role = "roles/artifactregistry.writer" + member = "serviceAccount:${google_service_account.deploy_satellite.email}" +} diff --git a/infra/outputs.tf b/infra/outputs.tf index e281f65..b696766 100644 --- a/infra/outputs.tf +++ b/infra/outputs.tf @@ -69,11 +69,11 @@ output "serving_urls" { output "pubsub_topics" { description = "Pub/Sub transport resource IDs — the earnings-streaming SSE bridge (C2) + capture-jobs (+ dead-letter, H7)." 
value = { - earnings_streaming = google_pubsub_topic.earnings_streaming.id - earnings_streaming_sub = google_pubsub_subscription.earnings_streaming.id - capture_jobs = google_pubsub_topic.capture_jobs.id - capture_jobs_sub = google_pubsub_subscription.capture_jobs.id - capture_jobs_deadletter = google_pubsub_topic.capture_jobs_deadletter.id + earnings_streaming = google_pubsub_topic.earnings_streaming.id + earnings_streaming_sub = google_pubsub_subscription.earnings_streaming.id + capture_jobs = google_pubsub_topic.capture_jobs.id + capture_jobs_sub = google_pubsub_subscription.capture_jobs.id + capture_jobs_deadletter = google_pubsub_topic.capture_jobs_deadletter.id } } @@ -84,3 +84,22 @@ output "budget_notification_channels" { pubsub = google_monitoring_notification_channel.budget_pubsub.id } } + +# --- Deploy-runtime layer (28 deploy workflows) --- +# The extra repo Variables the per-service deploy workflows read, beyond the +# project_ids / deploy_service_accounts / wif_provider_name above. + +output "satellite_project_number" { + description = "mostlyright-satellite project number (H1) — used by run-weather-backfill.yml to build the Batch secret resource paths. Set as SATELLITE_PROJECT_NUMBER." + value = var.satellite_project_number +} + +output "backfill_progress_bucket" { + description = "Durable GCS completion-marker bucket for the backfill fleet (C4 crash-safe resume). Set as PROGRESS_BUCKET for run-weather-backfill.yml." + value = google_storage_bucket.backfill_progress.name +} + +output "r2_bucket" { + description = "The single platform R2 bucket derived parquet is written to / served from. Set as R2_BUCKET for the ingest deploy workflows." 
+ value = var.r2_bucket +} From 741225b22ca8e79051e04acf851d5a4ce9afc6ce Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 19:28:56 +0200 Subject: [PATCH 05/17] =?UTF-8?q?fix(28):=20review=20round=201=20=E2=80=94?= =?UTF-8?q?=20STT=20HTTP=20service,=20pilot=20args,=20injection-safe=20wor?= =?UTF-8?q?kflows?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex + Python Architect findings (P1/P2): - P1: STT deploys as a Cloud Run SERVICE but ran a one-shot CLI (never bound $PORT). Add services.earnings.jobs.stt_server (uvicorn /healthz + /transcribe wrapping the shipped transcriber); repoint stt.Dockerfile to serve it (one-shot CLI kept for the GCE MIG fallback). - P2: run-weather-backfill pilot passed only --stations (explicit CLI mode needs satellites/products/year-window/out) — pass the full explicit arg set. - P2: script-injection — image_tag/pilot_station now flow via env: and the Batch JSON is built with jq --arg (no shell/JSON interpolation) across all new workflows. - P2: rolefact skips the R2 upload when a zero-mention call writes no fact parquet. - P2: satellite CLI warns loudly for roster stations outside the GOES footprint. - P3: /healthz exemption is now trailing-slash tolerant across all 5 middlewares. 
--- .github/workflows/deploy-earnings-capture.yml | 4 +- .../workflows/deploy-earnings-rolefact.yml | 4 +- .github/workflows/deploy-earnings-serving.yml | 4 +- .github/workflows/deploy-earnings-stt.yml | 4 +- .github/workflows/deploy-weather-ingest.yml | 6 +- .github/workflows/run-weather-backfill.yml | 110 ++++++++------- deploy/earnings/stt.Dockerfile | 20 ++- .../mostlyright/weather/satellite/__main__.py | 30 +++++ services/earnings/jobs/rolefact.py | 13 +- services/earnings/jobs/stt.py | 125 +++++++++++++----- services/earnings/jobs/stt_server.py | 93 +++++++++++++ services/earnings/middleware/auth.py | 2 +- services/earnings/middleware/ratelimit.py | 2 +- .../earnings/tests/test_jobs_entrypoints.py | 4 +- services/earnings/tests/test_stt_server.py | 60 +++++++++ services/weather/middleware/auth.py | 2 +- services/weather/middleware/ceiling.py | 2 +- services/weather/middleware/ratelimit.py | 2 +- 18 files changed, 383 insertions(+), 104 deletions(-) create mode 100644 services/earnings/jobs/stt_server.py create mode 100644 services/earnings/tests/test_stt_server.py diff --git a/.github/workflows/deploy-earnings-capture.yml b/.github/workflows/deploy-earnings-capture.yml index dd05205..d41ef4f 100644 --- a/.github/workflows/deploy-earnings-capture.yml +++ b/.github/workflows/deploy-earnings-capture.yml @@ -54,8 +54,10 @@ jobs: run: gcloud auth configure-docker "${AR_HOST}" --quiet - name: Build capture image + env: + IMAGE_TAG: ${{ inputs.image_tag }} run: | - IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${{ inputs.image_tag }}" + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" echo "IMAGE=${IMAGE}" >> "$GITHUB_ENV" docker build -f deploy/earnings/capture.Dockerfile -t "${IMAGE}" . 
diff --git a/.github/workflows/deploy-earnings-rolefact.yml b/.github/workflows/deploy-earnings-rolefact.yml index fbc3301..3bae666 100644 --- a/.github/workflows/deploy-earnings-rolefact.yml +++ b/.github/workflows/deploy-earnings-rolefact.yml @@ -51,8 +51,10 @@ jobs: run: gcloud auth configure-docker "${AR_HOST}" --quiet - name: Build rolefact image + env: + IMAGE_TAG: ${{ inputs.image_tag }} run: | - IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${{ inputs.image_tag }}" + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" echo "IMAGE=${IMAGE}" >> "$GITHUB_ENV" docker build -f deploy/earnings/rolefact.Dockerfile -t "${IMAGE}" . diff --git a/.github/workflows/deploy-earnings-serving.yml b/.github/workflows/deploy-earnings-serving.yml index 95e4018..4d7a92c 100644 --- a/.github/workflows/deploy-earnings-serving.yml +++ b/.github/workflows/deploy-earnings-serving.yml @@ -61,8 +61,10 @@ jobs: run: gcloud auth configure-docker "${AR_HOST}" --quiet - name: Build audio-free earnings-serving image + env: + IMAGE_TAG: ${{ inputs.image_tag }} run: | - IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${{ inputs.image_tag }}" + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" echo "IMAGE=${IMAGE}" >> "$GITHUB_ENV" # Build from the repo root so the Dockerfile can COPY packages/ + services/. 
docker build \ diff --git a/.github/workflows/deploy-earnings-stt.yml b/.github/workflows/deploy-earnings-stt.yml index 68bd6a9..3c33f54 100644 --- a/.github/workflows/deploy-earnings-stt.yml +++ b/.github/workflows/deploy-earnings-stt.yml @@ -57,8 +57,10 @@ jobs: run: gcloud auth configure-docker "${AR_HOST}" --quiet - name: Build STT (CUDA + faster-whisper, no torch) image + env: + IMAGE_TAG: ${{ inputs.image_tag }} run: | - IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${{ inputs.image_tag }}" + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" echo "IMAGE=${IMAGE}" >> "$GITHUB_ENV" docker build -f deploy/earnings/stt.Dockerfile -t "${IMAGE}" . diff --git a/.github/workflows/deploy-weather-ingest.yml b/.github/workflows/deploy-weather-ingest.yml index 392b439..67fcfad 100644 --- a/.github/workflows/deploy-weather-ingest.yml +++ b/.github/workflows/deploy-weather-ingest.yml @@ -70,10 +70,12 @@ jobs: run: gcloud auth configure-docker "${AR_HOST}" --quiet - name: Build the shared ingest image + env: + IMAGE_TAG: ${{ inputs.image_tag }} run: | BASE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}" - BACKFILL_IMAGE="${BASE}/${BACKFILL_IMAGE_NAME}:${{ inputs.image_tag }}" - INCREMENTAL_IMAGE="${BASE}/${INCREMENTAL_IMAGE_NAME}:${{ inputs.image_tag }}" + BACKFILL_IMAGE="${BASE}/${BACKFILL_IMAGE_NAME}:${IMAGE_TAG}" + INCREMENTAL_IMAGE="${BASE}/${INCREMENTAL_IMAGE_NAME}:${IMAGE_TAG}" echo "BACKFILL_IMAGE=${BACKFILL_IMAGE}" >> "$GITHUB_ENV" echo "INCREMENTAL_IMAGE=${INCREMENTAL_IMAGE}" >> "$GITHUB_ENV" docker build \ diff --git a/.github/workflows/run-weather-backfill.yml b/.github/workflows/run-weather-backfill.yml index 32d1c07..aa7b948 100644 --- a/.github/workflows/run-weather-backfill.yml +++ b/.github/workflows/run-weather-backfill.yml @@ -91,64 +91,74 @@ jobs: - name: Set up gcloud uses: google-github-actions/setup-gcloud@v2 - - name: Build the Batch job config (task_count + roster args by mode) + # Untrusted workflow_dispatch string inputs are 
passed via env (NOT + # interpolated into the shell/JSON text) and the JSON is assembled with jq + # --arg, so a hostile image_tag / pilot_station cannot inject shell or break + # the Batch JSON (GitHub script-injection-safe pattern). + - name: Build the Batch job config (task_count + args by mode) + env: + IMAGE_TAG: ${{ inputs.image_tag }} + MODE: ${{ inputs.mode }} + PILOT_STATION: ${{ inputs.pilot_station }} + NUM: ${{ vars.SATELLITE_PROJECT_NUMBER }} + R2_BUCKET: ${{ vars.R2_BUCKET }} + PROGRESS_BUCKET: ${{ vars.PROGRESS_BUCKET }} + RUNTIME_SA: ${{ vars.RUNTIME_SA_WEATHER_BACKFILL }} run: | set -euo pipefail - IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${{ inputs.image_tag }}" - NUM="${{ vars.SATELLITE_PROJECT_NUMBER }}" - if [ "${{ inputs.mode }}" = "full" ]; then + IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" + if [ "$MODE" = "full" ]; then TASK_COUNT=66 - ROSTER_ARGS='"--roster", "kalshi,polymarket"' + # Roster mode: the CLI resolves + shards the 66-station roster by + # BATCH_TASK_INDEX and supplies satellite/product/year defaults. + COMMANDS=$(jq -nc --arg pb "$PROGRESS_BUCKET" --arg rb "$R2_BUCKET" \ + '["--mirror","gcp","--roster","kalshi,polymarket","--progress-bucket",$pb,"--r2-bucket",$rb]') else TASK_COUNT=1 - # Pilot: one explicit station (no roster sharding). - ROSTER_ARGS='"--stations", "${{ inputs.pilot_station }}"' + YEAR=$(date -u +%Y) + # Pilot: explicit single-station backfill. The CLI's explicit mode + # (no --roster) requires satellites/products/year-window/out, so pass + # them all. GOES-East ACMC covers the default KNYC pilot; an + # international pilot station needs --satellites overridden. 
+ COMMANDS=$(jq -nc --arg st "$PILOT_STATION" --arg y "$YEAR" \ + --arg pb "$PROGRESS_BUCKET" --arg rb "$R2_BUCKET" \ + '["--mirror","gcp","--satellites","goes16","--products","ABI-L2-ACMC","--stations",$st,"--year-start",$y,"--year-end",$y,"--out","/tmp/derived","--r2-target","--r2-bucket",$rb,"--progress-bucket",$pb]') fi - cat > batch-job.json < batch-job.json echo "task_count=${TASK_COUNT}" cat batch-job.json diff --git a/deploy/earnings/stt.Dockerfile b/deploy/earnings/stt.Dockerfile index fed2a82..f507b19 100644 --- a/deploy/earnings/stt.Dockerfile +++ b/deploy/earnings/stt.Dockerfile @@ -46,15 +46,23 @@ COPY packages/weather/ packages/weather/ # core + weather[earnings] — the [earnings] extra pins faster-whisper>=1.0,<2.0 # (CTranslate2 Whisper; NO torch) + av. python3.12's pip resolves the wheels. +# fastapi + uvicorn back the HTTP surface: infra declares STT as a Cloud Run +# SERVICE (google_cloud_run_v2_service.stt), so the container MUST serve $PORT. RUN python -m pip install --break-system-packages \ ./packages/core \ - "./packages/weather[earnings]" + "./packages/weather[earnings]" \ + "fastapi>=0.115,<1" \ + "uvicorn[standard]>=0.30" # --- App layer --------------------------------------------------------------- COPY services/earnings/ services/earnings/ -# Cloud Run Jobs pass the STT spec via env (STT_AUDIO_PATH / STT_TICKER / -# STT_CALL_ID / STT_TIER / STT_DEVICE / STT_COMPUTE_TYPE / STT_INITIAL_PROMPT; -# EARNINGS_STREAMING_* opt in to the live publish). The entrypoint fails loud on -# a missing var. -ENTRYPOINT ["python", "-m", "services.earnings.jobs.stt"] +# STT runs as a Cloud Run SERVICE: serve the HTTP transcription surface +# (services.earnings.jobs.stt_server:app — GET /healthz + POST /transcribe) so the +# revision becomes ready and can accept GPU transcription requests. The one-shot +# `python -m services.earnings.jobs.stt` CLI is retained in the image for the GCE +# L4 MIG fallback. 
faster-whisper is lazy-loaded on the first /transcribe, so +# /healthz answers without the GPU model-load cost. Cloud Run injects $PORT. +ENV PORT=8080 +EXPOSE 8080 +CMD ["sh", "-c", "uvicorn services.earnings.jobs.stt_server:app --host 0.0.0.0 --port ${PORT} --workers 1"] diff --git a/packages/weather/src/mostlyright/weather/satellite/__main__.py b/packages/weather/src/mostlyright/weather/satellite/__main__.py index 9a27fa3..aafbf00 100644 --- a/packages/weather/src/mostlyright/weather/satellite/__main__.py +++ b/packages/weather/src/mostlyright/weather/satellite/__main__.py @@ -284,6 +284,28 @@ def _from(flag: int | None, env_name: str, default: int) -> int: return index, count +#: ICAO first-letter prefixes inside the GOES-East/West footprint (Americas + +#: E-Pacific). A roster station outside this set is NOT visible to the GOES-only +#: default satellites and needs Himawari (Asia/Pacific) / Meteosat (Europe/Africa) +#: / VIIRS. Coarse by design — it drives a WARNING, never a silent skip. +_GOES_FOOTPRINT_ICAO_PREFIXES: tuple[str, ...] = ("K", "C", "M", "P", "S", "T") + + +def _warn_non_goes_stations(stations: list[str]) -> None: + """Warn (stderr) for shard stations the GOES-only default satellites can't see.""" + off_footprint = [ + s for s in stations if not s[:1].upper().startswith(_GOES_FOOTPRINT_ICAO_PREFIXES) + ] + if off_footprint: + print( + "WARNING: roster stations outside the GOES footprint will produce NO " + f"coverage under the default GOES-only satellites: {', '.join(off_footprint)}. 
" + "Pass --satellites (Himawari/Meteosat/VIIRS) to cover them, or expect empty " + "partitions for these shards (28-26 native-ring backfill).", + file=sys.stderr, + ) + + def _run_backfill(args: argparse.Namespace) -> int: # --- 28-21: reconcile roster mode vs the explicit-args mode -------------- if args.roster is not None and args.stations is not None: @@ -313,6 +335,14 @@ def _run_backfill(args: argparse.Namespace) -> int: stations = list(shard_roster(roster, index, count)) satellites = args.satellites or list(_DEFAULT_ROSTER_SATELLITES) products = args.products or list(_DEFAULT_ROSTER_PRODUCTS) + # Coverage guard (no SILENT under-coverage): the 66-station roster spans + # the globe, but the GOES-only default satellites see only the Americas / + # E-Pacific. Warn LOUDLY (stderr, visible in Cloud Batch logs) for any + # shard station outside the GOES footprint so a `mode=full` operator is not + # blindsided by empty partitions + wasted Spot spend. Override --satellites + # (Himawari/Meteosat/VIIRS) to actually cover those stations. + if args.satellites is None: + _warn_non_goes_stations(stations) if year_start is None: year_start = _DEFAULT_ROSTER_YEAR_START if year_end is None: diff --git a/services/earnings/jobs/rolefact.py b/services/earnings/jobs/rolefact.py index aa60977..c238e8d 100644 --- a/services/earnings/jobs/rolefact.py +++ b/services/earnings/jobs/rolefact.py @@ -156,7 +156,18 @@ def main(argv: list[str] | None = None) -> int: fact_path, ) - _maybe_upload_r2(str(fact_path), ticker=ticker, call_id=call_id) + # A legitimate zero-mention call builds no fact rows, so FactLedger.append + # writes NO parquet — uploading a non-existent path would crash an otherwise + # valid call. Only upload when a partition actually exists (guard on both the + # row count and the file, since the ledger may no-op an empty append). 
+ if fact_rows and fact_path.exists(): + _maybe_upload_r2(str(fact_path), ticker=ticker, call_id=call_id) + else: + _LOG.info( + "rolefact: no fact rows for %s/%s — skipping R2 upload (nothing to upload)", + ticker, + call_id, + ) return 0 diff --git a/services/earnings/jobs/stt.py b/services/earnings/jobs/stt.py index 31be6ee..b1b68d8 100644 --- a/services/earnings/jobs/stt.py +++ b/services/earnings/jobs/stt.py @@ -90,41 +90,43 @@ def _segment_rows( return rows -def main(argv: list[str] | None = None) -> int: - """Transcribe the transient audio → transcript ledger (+ optional live publish). - - Returns ``0`` on success. A missing required env var, a transcription failure, - or a ledger write failure propagates as a non-zero exit (fail loud). No audio - ever reaches the ledger or the wire. +def transcribe_call( + audio_path: str, + *, + ticker: str, + call_id: str, + tier: str = _DEFAULT_TIER, + device: str = _DEFAULT_DEVICE, + compute_type: str = _DEFAULT_COMPUTE_TYPE, + initial_prompt: str | None = None, + publish_live: bool = False, + streaming_project: str | None = None, + streaming_topic: str = "earnings-streaming", +) -> dict[str, object]: + """Transcribe one transient audio file → transcript ledger (+ optional publish). + + The shared core used by BOTH the one-shot Cloud Run Job entrypoint + (:func:`main`) and the Cloud Run SERVICE HTTP handler + (:mod:`services.earnings.jobs.stt_server`). Returns a small audio-free summary + dict (segment/row counts, language, duration) — NEVER audio. Raises on a + transcription / ledger failure (fail loud). No audio reaches the ledger or wire. 
""" - logging.basicConfig(level=logging.INFO) - - audio_path = require_env("STT_AUDIO_PATH") - ticker = require_env("STT_TICKER") - call_id = require_env("STT_CALL_ID") - tier = optional_env("STT_TIER", _DEFAULT_TIER) or _DEFAULT_TIER - device = optional_env("STT_DEVICE", _DEFAULT_DEVICE) or _DEFAULT_DEVICE - compute_type = optional_env("STT_COMPUTE_TYPE", _DEFAULT_COMPUTE_TYPE) or _DEFAULT_COMPUTE_TYPE - initial_prompt = optional_env("STT_INITIAL_PROMPT") - if not os.path.exists(audio_path): raise FileNotFoundError( - f"STT_AUDIO_PATH {audio_path!r} does not exist — the capture job's " - "transient audio must be present on the shared ephemeral disk." + f"audio path {audio_path!r} does not exist — the capture job's transient " + "audio must be present on the shared ephemeral disk." ) # Lazy import: SttTranscriber lazy-imports faster-whisper inside transcribe, # so nothing heavy loads at module import. from mostlyright.weather.earnings.stt import SttTranscriber - _LOG.info( - "stt job start: ticker=%s call_id=%s tier=%s device=%s", ticker, call_id, tier, device - ) + _LOG.info("stt start: ticker=%s call_id=%s tier=%s device=%s", ticker, call_id, tier, device) transcriber = SttTranscriber(tier, device=device, compute_type=compute_type) result = transcriber.transcribe(audio_path, initial_prompt=initial_prompt) _LOG.info( - "stt job transcribed: ticker=%s call_id=%s segments=%d language=%s duration=%s", + "stt transcribed: ticker=%s call_id=%s segments=%d language=%s duration=%s", ticker, call_id, len(result.segments), @@ -135,18 +137,68 @@ def main(argv: list[str] | None = None) -> int: rows = _segment_rows(result.segments, ticker=ticker, call_id=call_id) # Lazy import: the ledger pulls pyarrow/filelock; kept out of module load so a - # fake-ledger test can import this module without them. (These are base weather - # deps, but keeping the import inside main mirrors the audio-toolchain seam.) + # fake-ledger test can import this module without them. 
from mostlyright.weather.earnings.ledger import TranscriptLedger ledger = TranscriptLedger() total = ledger.append(rows, ticker=ticker, call_id=call_id) _LOG.info( - "stt job wrote transcript ledger: ticker=%s call_id=%s rows_now=%d", ticker, call_id, total + "stt wrote transcript ledger: ticker=%s call_id=%s rows_now=%d", ticker, call_id, total ) - _maybe_publish_live(result.segments, ticker=ticker, call_id=call_id) + if publish_live: + _maybe_publish_live( + result.segments, + ticker=ticker, + call_id=call_id, + project=streaming_project, + topic=streaming_topic, + ) + + return { + "ticker": ticker, + "call_id": call_id, + "segments": len(result.segments), + "rows_written": total, + "language": result.language, + "duration": result.duration, + } + +def main(argv: list[str] | None = None) -> int: + """Transcribe the transient audio → transcript ledger (+ optional live publish). + + The one-shot Cloud Run Job / GCE MIG entrypoint (env-driven). Returns ``0`` on + success; a missing required env var, a transcription failure, or a ledger write + failure propagates as a non-zero exit (fail loud). No audio ever reaches the + ledger or the wire. 
+ """ + logging.basicConfig(level=logging.INFO) + + audio_path = require_env("STT_AUDIO_PATH") + ticker = require_env("STT_TICKER") + call_id = require_env("STT_CALL_ID") + tier = optional_env("STT_TIER", _DEFAULT_TIER) or _DEFAULT_TIER + device = optional_env("STT_DEVICE", _DEFAULT_DEVICE) or _DEFAULT_DEVICE + compute_type = optional_env("STT_COMPUTE_TYPE", _DEFAULT_COMPUTE_TYPE) or _DEFAULT_COMPUTE_TYPE + initial_prompt = optional_env("STT_INITIAL_PROMPT") + + enabled = optional_env("EARNINGS_STREAMING_ENABLED") + publish_live = bool(enabled) and enabled.lower() not in ("0", "false", "no") + + transcribe_call( + audio_path, + ticker=ticker, + call_id=call_id, + tier=tier, + device=device, + compute_type=compute_type, + initial_prompt=initial_prompt, + publish_live=publish_live, + streaming_project=optional_env("EARNINGS_STREAMING_PROJECT"), + streaming_topic=optional_env("EARNINGS_STREAMING_TOPIC", "earnings-streaming") + or "earnings-streaming", + ) return 0 @@ -155,27 +207,30 @@ def _maybe_publish_live( *, ticker: str, call_id: str, + project: str | None, + topic: str = "earnings-streaming", ) -> None: """Opt-in live publish of segments to the ``earnings-streaming`` topic. - Enabled ONLY when ``EARNINGS_STREAMING_ENABLED`` is truthy AND a project is - configured. The real ``google.cloud.pubsub_v1`` client is lazy-constructed by + The caller decides whether live publish is enabled; this fail-softs when no + ``project`` is configured. The real ``google.cloud.pubsub_v1`` client is + lazy-constructed by :func:`~services.earnings.pubsub_bridge.build_publisher_client` (never at module load). The batch STT segments are published as final transcript segments; the true partial→final streaming path is the operator-gated 27-10 live engine (not driven from this batch job). + + NOTE (temporal seam): ``Segment.spoken_at``/``knowledge_time`` are typed float; + here they carry the segment's ENGINE-RELATIVE offset-seconds (not a wallclock + epoch). 
This is the opt-in, explicitly-non-authoritative live-preview path — + the authoritative ledger write uses ``offset_seconds`` (see ``_segment_rows``). """ - enabled = optional_env("EARNINGS_STREAMING_ENABLED") - if not enabled or enabled.lower() in ("0", "false", "no"): - return - project = optional_env("EARNINGS_STREAMING_PROJECT") if not project: _LOG.warning( - "EARNINGS_STREAMING_ENABLED set but EARNINGS_STREAMING_PROJECT unset — " - "skipping live publish (fail soft: the batch ledger is authoritative)." + "live publish requested but no streaming project configured — skipping " + "(fail soft: the batch ledger is authoritative)." ) return - topic = optional_env("EARNINGS_STREAMING_TOPIC", "earnings-streaming") or "earnings-streaming" from mostlyright.weather.earnings.streaming_transcriber import Segment diff --git a/services/earnings/jobs/stt_server.py b/services/earnings/jobs/stt_server.py new file mode 100644 index 0000000..9b88c47 --- /dev/null +++ b/services/earnings/jobs/stt_server.py @@ -0,0 +1,93 @@ +"""Earnings STT Cloud Run SERVICE — HTTP transcription surface (Phase 28, 28-11). + +The infra declares STT as a Cloud Run **service** (``google_cloud_run_v2_service.stt``, +L4 GPU, scale-to-zero), so its container MUST serve HTTP on ``$PORT`` — a one-shot +CLI would never pass Cloud Run readiness. This is that HTTP surface: a thin FastAPI +app around the SAME shipped transcriber the one-shot job uses +(:func:`services.earnings.jobs.stt.transcribe_call`). The one-shot +``python -m services.earnings.jobs.stt`` entrypoint is kept for the GCE L4 MIG +fallback (28-OPERATOR-INPUTS). + +Routes: + * ``GET /healthz`` — unauthenticated liveness (no model load): lets the Cloud Run + revision become ready WITHOUT paying the GPU model-load cost. It touches no + faster-whisper state (the model is lazy-loaded on the first /transcribe). 
+ * ``POST /transcribe`` — body ``{audio_path, ticker, call_id, tier?, device?, + compute_type?, initial_prompt?, publish_live?, streaming_project?, + streaming_topic?}`` → transcribes the transient audio, writes the AUDIO-FREE + transcript ledger, returns an audio-free summary (segment/row counts, language, + duration). NEVER returns or persists audio (D-27.9). + +**Audio firewall (D-27.9).** No route exposes an audio path/media-type/field. The +audio file is a transient input on the shared ephemeral disk; only transcript TEXT +crosses into the ledger. + +**Lazy imports.** faster-whisper / CTranslate2 stay lazy inside the shipped +transcriber, so importing this module (and answering /healthz) needs no GPU. +""" + +from __future__ import annotations + +import logging +from typing import Annotated + +from fastapi import Body, FastAPI, HTTPException + +from services.earnings.jobs.stt import transcribe_call + +_LOG = logging.getLogger("services.earnings.jobs.stt_server") + +app = FastAPI( + title="mostlyright earnings STT service", + summary="GPU transcription of transient earnings audio → audio-free transcript ledger.", + version="0.1.0", +) + + +@app.get("/healthz", summary="Liveness probe (no model load, unauthenticated)") +def healthz() -> dict[str, str]: + """Static liveness token — lets the Cloud Run revision become ready cheaply.""" + return {"status": "ok"} + + +@app.post("/transcribe", summary="Transcribe transient audio → transcript ledger (text only)") +def transcribe(payload: Annotated[dict, Body(...)]) -> dict[str, object]: + """Transcribe one call's transient audio; return an AUDIO-FREE summary. + + Fails 400 on a missing required field and 500 on a transcription/ledger error + (fail loud) — never a silent partial write. No audio is returned. 
+ """ + audio_path = payload.get("audio_path") + ticker = payload.get("ticker") + call_id = payload.get("call_id") + missing = [ + k + for k, v in (("audio_path", audio_path), ("ticker", ticker), ("call_id", call_id)) + if not v + ] + if missing: + raise HTTPException( + status_code=400, detail=f"missing required field(s): {', '.join(missing)}" + ) + + try: + return transcribe_call( + str(audio_path), + ticker=str(ticker), + call_id=str(call_id), + tier=str(payload.get("tier") or "large-v3"), + device=str(payload.get("device") or "cuda"), + compute_type=str(payload.get("compute_type") or "float16"), + initial_prompt=payload.get("initial_prompt"), + publish_live=bool(payload.get("publish_live", False)), + streaming_project=payload.get("streaming_project"), + streaming_topic=str(payload.get("streaming_topic") or "earnings-streaming"), + ) + except FileNotFoundError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + except Exception as exc: # pragma: no cover - transcription/ledger failure path + _LOG.exception("transcription failed for %s/%s", ticker, call_id) + raise HTTPException(status_code=500, detail=f"transcription failed: {exc}") from exc + + +__all__ = ["app", "healthz", "transcribe"] diff --git a/services/earnings/middleware/auth.py b/services/earnings/middleware/auth.py index 251789d..ddfe508 100644 --- a/services/earnings/middleware/auth.py +++ b/services/earnings/middleware/auth.py @@ -89,7 +89,7 @@ async def dispatch( # /healthz is the unauthenticated Cloud Run liveness probe — a health check # cannot present the API key, so it bypasses the gate BEFORE the key check # (mirrors the /stream signed-token exemption below). - if request.url.path == "/healthz": + if request.url.path.rstrip("/") == "/healthz": return await call_next(request) if self._expected_key is None: # Keyless local/dev mode — gate open. 
diff --git a/services/earnings/middleware/ratelimit.py b/services/earnings/middleware/ratelimit.py index 1127846..7d52cfc 100644 --- a/services/earnings/middleware/ratelimit.py +++ b/services/earnings/middleware/ratelimit.py @@ -162,7 +162,7 @@ async def dispatch( ) -> Response: # /healthz is the Cloud Run liveness probe — never throttle it: a probe # answered with 429 would make Cloud Run kill a healthy instance. - if request.url.path == "/healthz": + if request.url.path.rstrip("/") == "/healthz": return await call_next(request) if not self._consume(self._client_key(request)): return JSONResponse( diff --git a/services/earnings/tests/test_jobs_entrypoints.py b/services/earnings/tests/test_jobs_entrypoints.py index 0ac5c06..2018e34 100644 --- a/services/earnings/tests/test_jobs_entrypoints.py +++ b/services/earnings/tests/test_jobs_entrypoints.py @@ -244,7 +244,9 @@ def test_stt_main_fails_loud_when_audio_missing(monkeypatch: pytest.MonkeyPatch, monkeypatch.setenv("STT_AUDIO_PATH", str(tmp_path / "nope.wav")) monkeypatch.setenv("STT_TICKER", "CHWY") monkeypatch.setenv("STT_CALL_ID", "evt-1") - with pytest.raises(FileNotFoundError, match="STT_AUDIO_PATH"): + # transcribe_call raises a generic "audio path ... does not exist" (shared by + # the one-shot main() and the HTTP server); the fail-loud property is what matters. + with pytest.raises(FileNotFoundError, match="does not exist"): stt_job.main() diff --git a/services/earnings/tests/test_stt_server.py b/services/earnings/tests/test_stt_server.py new file mode 100644 index 0000000..75883d9 --- /dev/null +++ b/services/earnings/tests/test_stt_server.py @@ -0,0 +1,60 @@ +"""Phase 28 (28-11): the STT Cloud Run SERVICE HTTP surface. + +The STT image is deployed as a Cloud Run service (must serve $PORT), so it exposes +GET /healthz (ready without a GPU model load) + POST /transcribe (wraps the shipped +transcriber). Audio is never returned. 
faster-whisper stays lazy — importing the +server + answering /healthz must not load it. +""" + +from __future__ import annotations + +import sys + +from fastapi.testclient import TestClient + + +def test_import_and_healthz_do_not_load_whisper(monkeypatch) -> None: + # Poison the heavy audio deps: importing the server + hitting /healthz must not + # touch them (they are lazy-loaded only inside transcribe_call on a real call). + for mod in ("faster_whisper", "ctranslate2", "av"): + monkeypatch.setitem(sys.modules, mod, None) + from services.earnings.jobs.stt_server import app + + client = TestClient(app) + resp = client.get("/healthz") + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + + +def test_transcribe_missing_field_returns_400() -> None: + from services.earnings.jobs.stt_server import app + + client = TestClient(app) + resp = client.post("/transcribe", json={"ticker": "GIS"}) # no audio_path/call_id + assert resp.status_code == 400 + assert "missing required field" in resp.json()["detail"] + + +def test_transcribe_delegates_to_transcribe_call(monkeypatch) -> None: + import services.earnings.jobs.stt_server as server + + captured: dict = {} + + def _fake_transcribe_call(audio_path, **kwargs): + captured["audio_path"] = audio_path + captured.update(kwargs) + return {"ticker": kwargs["ticker"], "call_id": kwargs["call_id"], "segments": 3} + + monkeypatch.setattr(server, "transcribe_call", _fake_transcribe_call) + client = TestClient(server.app) + resp = client.post( + "/transcribe", + json={"audio_path": "/tmp/a.wav", "ticker": "GIS", "call_id": "c1", "tier": "small"}, + ) + assert resp.status_code == 200 + body = resp.json() + assert body == {"ticker": "GIS", "call_id": "c1", "segments": 3} + # No audio field leaks in the response (D-27.9). 
+ assert not any("audio" in k.lower() for k in body) + assert captured["audio_path"] == "/tmp/a.wav" + assert captured["tier"] == "small" diff --git a/services/weather/middleware/auth.py b/services/weather/middleware/auth.py index bb43489..0327572 100644 --- a/services/weather/middleware/auth.py +++ b/services/weather/middleware/auth.py @@ -67,7 +67,7 @@ async def dispatch( ) -> Response: # /healthz is the unauthenticated Cloud Run liveness probe — it cannot # present the API key, so it bypasses the gate before the key check. - if request.url.path == "/healthz": + if request.url.path.rstrip("/") == "/healthz": return await call_next(request) if self._expected_key is None: # Keyless local/dev mode — gate open. diff --git a/services/weather/middleware/ceiling.py b/services/weather/middleware/ceiling.py index 0e4b99b..6e9ad1e 100644 --- a/services/weather/middleware/ceiling.py +++ b/services/weather/middleware/ceiling.py @@ -90,7 +90,7 @@ async def dispatch( # /healthz is the Cloud Run liveness probe — it must not consume a global # ceiling token, or a probe answered 429 would make Cloud Run kill a # healthy instance. - if request.url.path == "/healthz": + if request.url.path.rstrip("/") == "/healthz": return await call_next(request) if not self._consume(): return JSONResponse( diff --git a/services/weather/middleware/ratelimit.py b/services/weather/middleware/ratelimit.py index 05a1fe6..fd23ebc 100644 --- a/services/weather/middleware/ratelimit.py +++ b/services/weather/middleware/ratelimit.py @@ -150,7 +150,7 @@ async def dispatch( self, request: Request, call_next: Callable[[Request], Awaitable[Response]] ) -> Response: # /healthz is the Cloud Run liveness probe — never throttle it. 
- if request.url.path == "/healthz": + if request.url.path.rstrip("/") == "/healthz": return await call_next(request) if not self._consume(self._client_key(request)): return JSONResponse( From fde6e55f4f52f6ace25ac9dfc806781439d0535c Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 19:42:57 +0200 Subject: [PATCH 06/17] =?UTF-8?q?fix(28):=20review=20round=202=20=E2=80=94?= =?UTF-8?q?=20reconcile=20ingest=20entrypoints=20to=20the=20infra=20env=20?= =?UTF-8?q?contract?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round-2 (gpt-5.5) P1/P2 — the deployed capture/STT/rolefact containers must match infra/cloud_run.tf's env contract: - capture: consume CAPTURE_JOBS_SUBSCRIPTION (pull the per-call spec) + upload the transient audio to AUDIO_HANDOFF_BUCKET (private GCS handoff, never R2); direct CAPTURE_* env kept as an operator override. - stt: transcribe_call/main/POST-/transcribe accept a gs:// handoff object in AUDIO_HANDOFF_BUCKET (capture + STT don't share a disk) — download → transcribe → cleanup; local paths still work; /healthz stays model-free. - rolefact: read R2_BUCKET (infra name); Dockerfile drops [earnings] (no faster-whisper/av in the post-audio CPU image — fact_builder imports clean without it). - stt.Dockerfile: bootstrap pip for deadsnakes 3.12 (ensurepip) with a build-time guard; add google-cloud-storage. capture.Dockerfile: add pubsub + storage clients. 
--- deploy/earnings/capture.Dockerfile | 16 +- deploy/earnings/rolefact.Dockerfile | 36 ++- deploy/earnings/stt.Dockerfile | 23 +- services/earnings/jobs/capture.py | 252 +++++++++++---- services/earnings/jobs/rolefact.py | 24 +- services/earnings/jobs/stt.py | 164 ++++++++-- services/earnings/jobs/stt_server.py | 7 + .../earnings/tests/test_jobs_entrypoints.py | 301 +++++++++++++++++- 8 files changed, 708 insertions(+), 115 deletions(-) diff --git a/deploy/earnings/capture.Dockerfile b/deploy/earnings/capture.Dockerfile index 93f7a1f..418d228 100644 --- a/deploy/earnings/capture.Dockerfile +++ b/deploy/earnings/capture.Dockerfile @@ -40,15 +40,25 @@ COPY packages/weather/ packages/weather/ # Install core + weather[earnings] (the [earnings] extra pulls faster-whisper + # av/PyAV — the audio extract + STT engine deps, lazy-imported at runtime). +# google-cloud-pubsub pulls ONE capture-job spec off CAPTURE_JOBS_SUBSCRIPTION; +# google-cloud-storage uploads the transient audio to the private +# AUDIO_HANDOFF_BUCKET for the cross-service handoff to STT (capture + STT are +# separate Cloud Run resources with NO shared disk). Both are lazy-imported +# inside capture.py (never at module load). The handoff bucket is a private, +# in-firewall GCS bucket — audio never gets an R2 key, never served (D-27.9). RUN pip install \ ./packages/core \ - "./packages/weather[earnings]" + "./packages/weather[earnings]" \ + "google-cloud-pubsub>=2.18,<3" \ + "google-cloud-storage>=2.10,<4" # --- App layer --------------------------------------------------------------- # The non-published service is imported as `services.earnings.*` (matching the # repo-root conftest sys.path convention), so it is copied under /app/services. COPY services/earnings/ services/earnings/ -# Cloud Run Jobs pass the capture spec via env (CAPTURE_TICKER / CAPTURE_CALL_ID / -# CAPTURE_WEBCAST_URL / CAPTURE_OUT_DIR). The entrypoint fails loud on a missing var. 
+# Cloud Run Job env (infra path): CAPTURE_JOBS_SUBSCRIPTION (per-call spec pulled +# off Pub/Sub) + AUDIO_HANDOFF_BUCKET (private GCS bucket the transient audio is +# uploaded to for the STT handoff). Operator-override manual path: CAPTURE_TICKER / +# CAPTURE_CALL_ID / CAPTURE_WEBCAST_URL. The entrypoint fails loud on a missing var. ENTRYPOINT ["python", "-m", "services.earnings.jobs.capture"] diff --git a/deploy/earnings/rolefact.Dockerfile b/deploy/earnings/rolefact.Dockerfile index 94e4e9d..34ccf1f 100644 --- a/deploy/earnings/rolefact.Dockerfile +++ b/deploy/earnings/rolefact.Dockerfile @@ -5,17 +5,24 @@ # `schema.earnings_fact.v1` rows (fail-closed Kalshi filter), writes the fact # ledger, and OPTIONALLY uploads the derived fact parquet to R2 via the shipped # write sink. It packages `services/earnings/` + the two SDK packages with the -# `[earnings]` extra + boto3 (R2 write) and runs +# `[parquet]` extra + boto3 (R2 write) and runs # `python -m services.earnings.jobs.rolefact`. # -# NO ffmpeg / NO faster-whisper GPU / NO chromium: this stage is entirely -# post-audio (it never touches audio bytes — D-27.9). SLIM CPython base. The -# `[earnings]` extra still pulls faster-whisper/av transitively, but they are -# lazy-imported and never exercised on this CPU path. boto3 is added explicitly -# for the R2 write sink (`mostlyright.weather.satellite._r2_sink`, which imports -# boto3 lazily and reads the write-token creds from the env by NAME). +# IMAGE-LEVEL AUDIO FIREWALL: this image does NOT install the `[earnings]` extra +# (which pulls the AUDIO toolchain — faster-whisper + av — into a POST-audio CPU +# image, breaking the firewall + bloating the image). rolefact only needs the +# ledger / fact-builder / role-parser code, which lazy-import NOTHING from the +# audio toolchain (verified: `mostlyright.weather.earnings.fact_builder` / +# `role_parser` / `ledger` / `classify_mentions` all import cleanly with +# faster-whisper + av absent). 
The `[parquet]` extra adds pandas; pyarrow + +# filelock are already base `mostlyrightmd-weather` runtime deps the ledger uses. +# boto3 is added explicitly for the R2 write sink +# (`mostlyright.weather.satellite._r2_sink`, boto3 lazy-imported there, reads the +# write-token creds from the env by NAME). # -# NON-published: COPYs `services/`, never a PyPI wheel. +# NO ffmpeg / NO faster-whisper / NO av / NO chromium: entirely post-audio (never +# touches audio bytes — D-27.9). SLIM CPython base. NON-published: COPYs +# `services/`, never a PyPI wheel. FROM python:3.12-slim AS base @@ -30,17 +37,20 @@ WORKDIR /app COPY packages/core/ packages/core/ COPY packages/weather/ packages/weather/ -# core + weather[earnings] (fact_builder / ledger / role_parser / classify_mentions -# live in the weather earnings module) + boto3 for the R2 write sink. +# core + weather[parquet] (fact_builder / ledger / role_parser / classify_mentions +# live in the weather earnings module; they need pyarrow + filelock — base weather +# runtime deps — plus pandas from [parquet]; the AUDIO toolchain from [earnings] is +# deliberately NOT installed) + boto3 for the R2 write sink. RUN pip install \ ./packages/core \ - "./packages/weather[earnings]" \ + "./packages/weather[parquet]" \ "boto3>=1.34,<2.0" # --- App layer --------------------------------------------------------------- COPY services/earnings/ services/earnings/ # Cloud Run Jobs pass the rolefact spec via env (ROLEFACT_TICKER / ROLEFACT_CALL_ID -# / ROLEFACT_TERMS / ROLEFACT_ROSTER; ROLEFACT_R2_BUCKET + R2_* write creds opt in -# to the R2 upload). The entrypoint fails loud on a missing var. +# / ROLEFACT_TERMS / ROLEFACT_ROSTER; R2_BUCKET — the infra env name — + R2_* write +# creds opt in to the R2 upload; ROLEFACT_R2_BUCKET is a manual-run override). The +# entrypoint fails loud on a missing var. 
ENTRYPOINT ["python", "-m", "services.earnings.jobs.rolefact"] diff --git a/deploy/earnings/stt.Dockerfile b/deploy/earnings/stt.Dockerfile index f507b19..901056f 100644 --- a/deploy/earnings/stt.Dockerfile +++ b/deploy/earnings/stt.Dockerfile @@ -24,8 +24,14 @@ ENV PYTHONDONTWRITEBYTECODE=1 \ WORKDIR /app -# python3.12 + pip on the CUDA base (Ubuntu 22.04 ships 3.10; add the deadsnakes -# 3.12 the SDK floors target). ffmpeg for any codec PyAV's bundled libs defer to. +# python3.12 on the CUDA base (Ubuntu 22.04 ships 3.10; add the deadsnakes 3.12 +# the SDK floors target). ffmpeg for any codec PyAV's bundled libs defer to. +# +# pip bootstrap: `python3-pip` installs pip for the DISTRO python (3.10), but we +# symlink `python`->3.12 — so `python -m pip` would run under 3.12 with NO pip +# module and the build would fail. Instead bootstrap pip for 3.12 itself via +# `python3.12 -m ensurepip` (needs python3.12-venv, which ships ensurepip's +# wheels), then verify the pip target IS 3.12 before any install. RUN apt-get update \ && apt-get install -y --no-install-recommends \ software-properties-common \ @@ -34,11 +40,15 @@ RUN apt-get update \ && apt-get install -y --no-install-recommends \ python3.12 \ python3.12-venv \ - python3-pip \ ffmpeg \ && rm -rf /var/lib/apt/lists/* \ && ln -sf /usr/bin/python3.12 /usr/local/bin/python \ - && ln -sf /usr/bin/python3.12 /usr/local/bin/python3 + && ln -sf /usr/bin/python3.12 /usr/local/bin/python3 \ + && python3.12 -m ensurepip --upgrade \ + && python3.12 -m pip install --upgrade --break-system-packages pip \ + # Fail the build loudly if `python -m pip` is not running under 3.12. + && python -m pip --version \ + && python -m pip --version | grep -q "python 3.12" # --- Dependency layer -------------------------------------------------------- COPY packages/core/ packages/core/ @@ -48,9 +58,14 @@ COPY packages/weather/ packages/weather/ # (CTranslate2 Whisper; NO torch) + av. python3.12's pip resolves the wheels. 
# fastapi + uvicorn back the HTTP surface: infra declares STT as a Cloud Run # SERVICE (google_cloud_run_v2_service.stt), so the container MUST serve $PORT. +# google-cloud-storage downloads the transient audio HANDOFF object from the +# private AUDIO_HANDOFF_BUCKET (capture + STT are separate Cloud Run resources +# with NO shared disk) — lazy-imported inside stt._resolve_audio_reference. NO +# torch anywhere (D-27.5): the STT engine is CTranslate2/faster-whisper only. RUN python -m pip install --break-system-packages \ ./packages/core \ "./packages/weather[earnings]" \ + "google-cloud-storage>=2.10,<4" \ "fastapi>=0.115,<1" \ "uvicorn[standard]>=0.30" diff --git a/services/earnings/jobs/capture.py b/services/earnings/jobs/capture.py index 307e1f2..4353889 100644 --- a/services/earnings/jobs/capture.py +++ b/services/earnings/jobs/capture.py @@ -1,42 +1,67 @@ """Earnings webcast-capture Cloud Run Job (Phase 28, 28-13). -The AUDIO side of the firewall (``mr-earnings-ingest``). Reads a capture job spec -from the environment (Cloud Run Jobs pass config via env), invokes the SHIPPED -capture surface (:class:`mostlyright.weather.earnings.capture.q4.Q4CaptureAdapter`) -to cold-fetch the webcast media into an EPHEMERAL dir, and hands the resulting -transient :class:`~mostlyright.weather.earnings.capture.base.AudioArtifact` to the -downstream STT job (via the shared ephemeral disk within the same Cloud Run task, -or by leaving the artifact for an operator-gated orchestration seam — see below). - -**Audio firewall (D-27.9, legal).** The captured audio is a TRANSIENT artifact on -ephemeral disk. This job NEVER uploads it, NEVER writes it to a ledger, and NEVER -serves it — it asserts the artifact's ``is_transient`` flag and that its path -stays under the local capture dir. The bytes die with the ephemeral task. - -**Env contract:** - -* ``CAPTURE_TICKER`` (required) — the market ticker (e.g. ``CHWY``). -* ``CAPTURE_CALL_ID`` (required) — the provider event/call id. 
-* ``CAPTURE_WEBCAST_URL`` (required) — the sniffed static media URL - (``static.events.q4inc.com/.../{uuid}.mp4``). The shipped SSRF guard rejects a - non-Q4/non-https URL BEFORE any fetch. -* ``CAPTURE_OUT_DIR`` (optional) — the ephemeral dir the transient audio is - written under (default: a fresh ``tempfile`` dir, still ephemeral). +The AUDIO side of the firewall (``mr-earnings-ingest``). In the DEPLOYED topology +(``infra/cloud_run.tf`` ``google_cloud_run_v2_job.capture``) this job: + + 1. pulls ONE capture-job spec message off the ``CAPTURE_JOBS_SUBSCRIPTION`` + Pub/Sub subscription (the per-call ticker / call_id / webcast_url), then + 2. invokes the SHIPPED capture surface + (:class:`mostlyright.weather.earnings.capture.q4.Q4CaptureAdapter`) to + cold-fetch the webcast media into an EPHEMERAL dir, then + 3. UPLOADS the transient audio to the private, in-firewall GCS handoff bucket + ``AUDIO_HANDOFF_BUCKET`` (``earnings-audio-handoff-``) so the + SEPARATE STT Cloud Run service — which does NOT share this job's ephemeral + filesystem — can fetch it, then + 4. acks the message. + +**Why the GCS handoff (not a shared local path).** capture and STT are separate +Cloud Run resources with NO shared disk. The audio therefore crosses the two +firewalled stages via a private GCS object in ``AUDIO_HANDOFF_BUCKET`` — an +in-region, in-firewall bucket. It is NEVER an R2 key and NEVER served: only STT +(inside the firewall) reads it, transcribes, and deletes it. The audio bytes +never reach the ledger, the wire, or R2 (D-27.9). + +**Audio firewall (D-27.9, legal).** The captured audio is a TRANSIENT artifact. +This job asserts ``is_transient`` and that the local path stays under the capture +dir before the handoff. The handoff target is the private GCS bucket the infra +provides — audio never gets an R2 key and is never a ledger column. 
+ +**Env contract (DEPLOYED / infra path):** + +* ``CAPTURE_JOBS_SUBSCRIPTION`` (required) — the Pub/Sub subscription id carrying + the per-call capture-job spec (``ticker`` / ``call_id`` / + ``webcast_url``/``media_url``). One message is pulled + acked per run. +* ``AUDIO_HANDOFF_BUCKET`` (required) — the private GCS bucket the transient + audio is uploaded to for the cross-service handoff to STT. +* ``CAPTURE_OUT_DIR`` (optional) — the ephemeral dir the transient audio + is written under (default: a fresh ``tempfile`` dir, still ephemeral). + +**Operator-override / manual single-call path.** For a manual single-call run +(no subscription), the per-call spec may instead be supplied directly via env — +``CAPTURE_TICKER`` / ``CAPTURE_CALL_ID`` / ``CAPTURE_WEBCAST_URL`` — which takes +precedence over the subscription pull. This is the operator-gated override; the +DEFAULT deployed path reads ``CAPTURE_JOBS_SUBSCRIPTION`` + ``AUDIO_HANDOFF_BUCKET``. +The handoff upload is skipped ONLY when ``AUDIO_HANDOFF_BUCKET`` is unset (a bare +local operator run), and then the transient path is emitted on stdout instead. **Lazy imports.** ffmpeg/PyAV (``av``) and httpx are pulled in only by the shipped -capture surface, which lazy-imports them inside its own methods — this module and -its ``main`` import nothing heavy at module load, so the entrypoint imports cleanly -with no audio toolchain (the test stubs the capture surface). - -**Operator-gated live seam.** The live-during-call path -(:meth:`CaptureAdapter.live` → Amazon-IVS HLS) is OPERATOR-GATED (27-09) and is -NOT driven here — this job is the VOD/replay cold-fetch. A live orchestration -would follow the HLS edge into the streaming STT (27-10); that is out of scope -for the deploy-runtime scaffolding and intentionally not wired. +capture surface (lazy inside its own methods). 
``google-cloud-pubsub`` and +``google-cloud-storage`` are lazy-imported inside :func:`_pull_capture_job` / +:func:`_upload_handoff` — so this module and its ``main`` import nothing heavy at +module load (the tests stub the capture surface + fake pubsub/GCS). + +**Live IVS capture seam (27-09, OPERATOR-GATED).** The live-during-call path +(:meth:`CaptureAdapter.live` → the Amazon-IVS HLS edge for a real in-progress +call) is OPERATOR-GATED and NOT driven here — this job is the VOD/replay +cold-fetch. WHO triggers a live capture job (a scheduler watching the earnings +calendar vs. an operator hand-enqueuing a capture-job message) is the +operator-gated orchestration decision; the streaming STT hand-off (27-10) follows +the HLS edge and is intentionally out of this deploy-runtime scaffolding. """ from __future__ import annotations +import json import logging import os import tempfile @@ -63,20 +88,114 @@ def _assert_audio_local(audio_path: str, out_dir: str) -> None: ) +def _pull_capture_job(subscription: str) -> tuple[dict[str, str], object]: + """Pull ONE capture-job spec message off the subscription; return (spec, ack). + + Returns the decoded per-call spec (``ticker`` / ``call_id`` / + ``webcast_url``/``media_url``) plus a zero-arg ``ack`` callable the caller + invokes AFTER a successful capture+handoff (so a crash mid-capture leaves the + message un-acked and the job is retried). ``google-cloud-pubsub`` is + lazy-imported here (never at module load). + + Raises: + RuntimeError: no message is available on the subscription, or the message + payload is missing a required field. 
+ """ + from google.cloud import pubsub_v1 + + client = pubsub_v1.SubscriberClient() + response = client.pull(subscription=subscription, max_messages=1, return_immediately=True) + received = list(response.received_messages) + if not received: + raise RuntimeError( + f"no capture-job message available on subscription {subscription!r} — " + "nothing to capture (fail loud rather than silently no-op)." + ) + msg = received[0] + spec = json.loads(msg.message.data.decode("utf-8")) + ticker = spec.get("ticker") + call_id = spec.get("call_id") + webcast_url = spec.get("webcast_url") or spec.get("media_url") + missing = [ + k + for k, v in (("ticker", ticker), ("call_id", call_id), ("webcast_url", webcast_url)) + if not v + ] + if missing: + raise RuntimeError( + f"capture-job message on {subscription!r} is missing required field(s) " + f"{missing}; cannot capture a settlement-adjacent call from a partial spec." + ) + + def _ack() -> None: + client.acknowledge(subscription=subscription, ack_ids=[msg.ack_id]) + + return {"ticker": ticker, "call_id": call_id, "webcast_url": webcast_url}, _ack + + +def _upload_handoff(audio_path: str, bucket: str, *, ticker: str, call_id: str) -> str: + """Upload the transient audio to the private GCS handoff bucket; return the gs:// uri. + + The cross-service handoff to STT (capture and STT do NOT share a disk). The + object key is namespaced by ``(ticker, call_id)``; it lands in the private, + in-firewall ``AUDIO_HANDOFF_BUCKET`` — NOT R2, NEVER served. STT downloads it, + transcribes, and deletes it. ``google-cloud-storage`` is lazy-imported here. 
+ """ + from google.cloud import storage + + ext = os.path.splitext(audio_path)[1] or ".audio" + blob_name = f"handoff/{ticker}/{call_id}{ext}" + client = storage.Client() + blob = client.bucket(bucket).blob(blob_name) + blob.upload_from_filename(audio_path) + uri = f"gs://{bucket}/{blob_name}" + _LOG.info("capture handoff: uploaded transient audio to %s (%s/%s)", uri, ticker, call_id) + return uri + + +def _resolve_spec() -> tuple[dict[str, str], object]: + """Resolve the per-call capture spec + an ack callable. + + Operator-override precedence: if ``CAPTURE_TICKER`` is set, read the whole + spec from env (the manual single-call path) with a no-op ack. Otherwise pull + ONE message off ``CAPTURE_JOBS_SUBSCRIPTION`` (the DEFAULT deployed path). + """ + if optional_env("CAPTURE_TICKER"): + spec = { + "ticker": require_env("CAPTURE_TICKER"), + "call_id": require_env("CAPTURE_CALL_ID"), + "webcast_url": require_env("CAPTURE_WEBCAST_URL"), + } + + def _noop_ack() -> None: + return None + + _LOG.info("capture: using operator-override env spec (manual single-call path)") + return spec, _noop_ack + + subscription = require_env("CAPTURE_JOBS_SUBSCRIPTION") + return _pull_capture_job(subscription) + + def main(argv: list[str] | None = None) -> int: - """Cold-fetch the webcast media to a transient local :class:`AudioArtifact`. + """Pull a capture job, cold-fetch the webcast media, hand the audio to STT via GCS. - Reads the job spec from the environment, invokes the shipped Q4 capture - surface, asserts the audio stays local, logs the transient path for the - downstream STT job, and returns ``0`` on success. Any failure (missing env, - SSRF-rejected URL, no HTTP media, extract failure) propagates as a non-zero - exit (fail loud). + The DEFAULT deployed path pulls the per-call spec off + ``CAPTURE_JOBS_SUBSCRIPTION``, runs the shipped Q4 capture, uploads the + transient audio to ``AUDIO_HANDOFF_BUCKET`` (the cross-service handoff), and + acks the message. 
A manual single-call run may instead supply the spec via + ``CAPTURE_TICKER`` / ``CAPTURE_CALL_ID`` / ``CAPTURE_WEBCAST_URL``. Any failure + (missing env, no message, SSRF-rejected URL, no HTTP media, extract failure) + propagates as a non-zero exit (fail loud) and leaves the message un-acked. """ logging.basicConfig(level=logging.INFO) - ticker = require_env("CAPTURE_TICKER") - call_id = require_env("CAPTURE_CALL_ID") - webcast_url = require_env("CAPTURE_WEBCAST_URL") + spec, ack = _resolve_spec() + ticker = spec["ticker"] + call_id = spec["call_id"] + webcast_url = spec["webcast_url"] + + handoff_bucket = optional_env("AUDIO_HANDOFF_BUCKET") out_dir = optional_env("CAPTURE_OUT_DIR") or tempfile.mkdtemp(prefix="earnings-capture-") os.makedirs(out_dir, exist_ok=True) @@ -85,15 +204,19 @@ def main(argv: list[str] | None = None) -> int: # importable with no audio toolchain). from mostlyright.weather.earnings.capture.q4 import Q4CaptureAdapter - _LOG.info("capture job start: ticker=%s call_id=%s out_dir=%s", ticker, call_id, out_dir) + _LOG.info( + "capture job start: ticker=%s call_id=%s out_dir=%s handoff_bucket=%s", + ticker, + call_id, + out_dir, + handoff_bucket, + ) adapter = Q4CaptureAdapter() event = {"ticker": ticker, "call_id": call_id, "media_url": webcast_url} - # The AudioArtifact is a context manager whose __exit__ cleans up the transient - # audio. We do NOT enter it here — the downstream STT job (same ephemeral task / - # operator orchestration) consumes the path, then cleans it up. We DO assert the - # firewall invariants and log the transient path. + # The AudioArtifact's audio is a transient local file; we assert the firewall + # invariants, hand it off to STT via the private GCS bucket, then ack. 
artifact = adapter.capture(event, tmp_dir=out_dir) if not artifact.is_transient: @@ -103,17 +226,34 @@ def main(argv: list[str] | None = None) -> int: ) _assert_audio_local(artifact.audio_path, out_dir) - _LOG.info( - "capture job done: ticker=%s call_id=%s transient_audio=%s source=%s", - artifact.ticker, - artifact.call_id, - artifact.audio_path, - artifact.source_media_url, - ) - # The transient audio path is emitted on stdout for the downstream STT job / - # operator orchestration to pick up off the same ephemeral disk. It is NEVER - # uploaded or served here. - print(artifact.audio_path) + if handoff_bucket: + # DEFAULT deployed path: hand the transient audio to STT via the private + # in-firewall GCS bucket, THEN ack (a crash before the ack retries the job). + handoff_uri = _upload_handoff( + artifact.audio_path, handoff_bucket, ticker=ticker, call_id=call_id + ) + _LOG.info( + "capture job done: ticker=%s call_id=%s handoff=%s source=%s", + artifact.ticker, + artifact.call_id, + handoff_uri, + artifact.source_media_url, + ) + # The gs:// handoff URI is the STT reference — the transient local file + # dies with the ephemeral task; only STT (in-firewall) reads the object. + print(handoff_uri) + else: + # Bare local operator run (no handoff bucket): emit the transient path for + # a co-located STT run off the same disk. Audio is still never served. + _LOG.info( + "capture job done (local, no handoff bucket): ticker=%s call_id=%s transient_audio=%s", + artifact.ticker, + artifact.call_id, + artifact.audio_path, + ) + print(artifact.audio_path) + + ack() return 0 diff --git a/services/earnings/jobs/rolefact.py b/services/earnings/jobs/rolefact.py index c238e8d..a0593ef 100644 --- a/services/earnings/jobs/rolefact.py +++ b/services/earnings/jobs/rolefact.py @@ -23,10 +23,11 @@ * ``ROLEFACT_ROSTER`` (optional) — a JSON array of ``[speaker_name, label]`` roster pairs anchoring exec identity for the fail-closed Kalshi rule. 
* ``MOSTLYRIGHT_CACHE_DIR`` (optional) — the ledger cache root. -* R2 upload (opt-in; ``ROLEFACT_R2_BUCKET`` enables it): - ``ROLEFACT_R2_BUCKET`` + the write-token creds ``R2_ACCOUNT_ID`` / - ``R2_WRITE_ACCESS_KEY_ID`` / ``R2_WRITE_SECRET_ACCESS_KEY`` (read from the env - by NAME by the shipped sink). +* R2 upload (opt-in; the upload bucket enables it): ``R2_BUCKET`` (the infra env + name — ``infra/cloud_run.tf`` ``google_cloud_run_v2_job.rolefact`` sets it), + with ``ROLEFACT_R2_BUCKET`` accepted as a fallback/override for a manual run, + plus the write-token creds ``R2_ACCOUNT_ID`` / ``R2_WRITE_ACCESS_KEY_ID`` / + ``R2_WRITE_SECRET_ACCESS_KEY`` (read from the env by NAME by the shipped sink). **Audio firewall (D-27.9).** This stage is entirely post-audio — it reads TEXT from the transcript ledger and writes DERIVED FACTS. There is no audio anywhere in @@ -206,12 +207,17 @@ def _parse_roster(raw: str | None) -> list[tuple[str, str]]: def _maybe_upload_r2(fact_path: str, *, ticker: str, call_id: str) -> None: """Opt-in upload of the derived fact parquet to R2 via the shipped write sink. - Enabled only when ``ROLEFACT_R2_BUCKET`` is set. The write-token creds - (``R2_ACCOUNT_ID`` / ``R2_WRITE_ACCESS_KEY_ID`` / ``R2_WRITE_SECRET_ACCESS_KEY``) - are read from the env by NAME by the shipped sink (boto3 lazy-imported there). - ONLY the derived FACT parquet (text/facts, never audio) is uploaded (D-27.9). + Enabled when an upload bucket is configured: ``R2_BUCKET`` (the infra env name + the shipped ``infra/cloud_run.tf`` sets on the rolefact Job), with + ``ROLEFACT_R2_BUCKET`` accepted as a fallback/override for a manual run. The + write-token creds (``R2_ACCOUNT_ID`` / ``R2_WRITE_ACCESS_KEY_ID`` / + ``R2_WRITE_SECRET_ACCESS_KEY``) are read from the env by NAME by the shipped + sink (boto3 lazy-imported there). ONLY the derived FACT parquet (text/facts, + never audio) is uploaded (D-27.9). 
""" - bucket = optional_env("ROLEFACT_R2_BUCKET") + # R2_BUCKET is the infra env name; ROLEFACT_R2_BUCKET is the manual-run + # override (checked first so an operator can redirect a one-off upload). + bucket = optional_env("ROLEFACT_R2_BUCKET") or optional_env("R2_BUCKET") if not bucket: return key = f"earnings/facts/{ticker}/{call_id}.parquet" diff --git a/services/earnings/jobs/stt.py b/services/earnings/jobs/stt.py index b1b68d8..8ba6b1c 100644 --- a/services/earnings/jobs/stt.py +++ b/services/earnings/jobs/stt.py @@ -8,9 +8,20 @@ opt-in live mode — publishes segments to the ``earnings-streaming`` Pub/Sub topic via :class:`~services.earnings.pubsub_bridge.SegmentPublisher`. +**Deployed topology.** capture and STT are SEPARATE Cloud Run resources with NO +shared disk, so STT receives an audio REFERENCE to a handoff OBJECT in the private +``AUDIO_HANDOFF_BUCKET`` — a ``gs:///`` URI (or a bare object key, +resolved against ``AUDIO_HANDOFF_BUCKET``), NOT a local path. STT DOWNLOADS the +object to a local temp file, transcribes, and deletes it. A local filesystem path +still works (operator / test / GCE MIG fallback). + **Env contract:** -* ``STT_AUDIO_PATH`` (required) — the transient audio file from the capture job. +* ``STT_AUDIO_PATH`` (required) — the audio reference: a ``gs://`` handoff-bucket + URI (the deployed path), a bare handoff object key (resolved against + ``AUDIO_HANDOFF_BUCKET``), or a local file path (operator / test). +* ``AUDIO_HANDOFF_BUCKET`` (optional) — the private GCS handoff bucket a bare + object key in ``STT_AUDIO_PATH`` is resolved against (the infra sets this). * ``STT_TICKER`` (required) — the market ticker. * ``STT_CALL_ID`` (required) — the provider call id (ledger partition key). * ``STT_TIER`` (optional) — model size (default ``large-v3``; on-device @@ -36,20 +47,87 @@ envelopes are text/facts-only (the bridge fails closed on any audio field). 
**Lazy imports.** faster-whisper / CTranslate2 are lazy-imported inside the shipped -:class:`SttTranscriber` (never at module load), and ``google-cloud-pubsub`` is -lazy-constructed only inside :func:`~services.earnings.pubsub_bridge.build_publisher_client` -— so this entrypoint imports cleanly with no GPU / no whisper / no GCP SDK. +:class:`SttTranscriber` (never at module load); ``google-cloud-storage`` is +lazy-imported inside :func:`_resolve_audio_reference` (only when the reference is a +``gs://`` handoff object); and ``google-cloud-pubsub`` is lazy-constructed only +inside :func:`~services.earnings.pubsub_bridge.build_publisher_client` — so this +entrypoint imports cleanly with no GPU / no whisper / no GCP SDK. """ from __future__ import annotations +import contextlib import logging import os +import tempfile +from collections.abc import Iterator from services.earnings.jobs._env import optional_env, require_env _LOG = logging.getLogger("services.earnings.jobs.stt") + +def _split_gs_uri(reference: str, *, handoff_bucket: str | None) -> tuple[str, str] | None: + """Return (bucket, object_key) if ``reference`` names a GCS handoff object, else None. + + Recognizes a ``gs://bucket/key`` URI, OR a bare object key when + ``handoff_bucket`` is configured and the reference is not an existing local + path (the deployed capture→STT handoff passes a gs:// URI; a bare key is the + tolerant fallback). A plain local path returns ``None`` (transcribe it directly). + """ + if reference.startswith("gs://"): + rest = reference[len("gs://") :] + bucket, _, key = rest.partition("/") + if not bucket or not key: + raise ValueError( + f"malformed gs:// audio reference {reference!r} (need gs://bucket/key)" + ) + return bucket, key + # A bare object key resolved against the handoff bucket — ONLY when it is not + # already a real local file (so a local path is never mis-read as a GCS key).
+ if handoff_bucket and not os.path.exists(reference): + return handoff_bucket, reference.lstrip("/") + return None + + +@contextlib.contextmanager +def _resolve_audio_reference(reference: str, *, handoff_bucket: str | None) -> Iterator[str]: + """Yield a LOCAL path for ``reference``, downloading a GCS handoff object if needed. + + In the deployed topology STT receives a ``gs://`` reference to a private + handoff-bucket object (capture and STT do not share a disk). This downloads it + to an ephemeral temp file, yields the local path, then DELETES the temp file on + exit (the audio is transient — it never persists past transcription; D-27.9). A + local path is yielded unchanged (operator / test / GCE MIG fallback). + """ + gs = _split_gs_uri(reference, handoff_bucket=handoff_bucket) + if gs is None: + if not os.path.exists(reference): + raise FileNotFoundError( + f"audio path {reference!r} does not exist — the capture job's transient " + "audio must be present on the shared ephemeral disk (local path), or the " + "reference must be a gs:// handoff object in AUDIO_HANDOFF_BUCKET." + ) + yield reference + return + + bucket, key = gs + # Lazy import: google-cloud-storage only when a GCS handoff object is fetched. + from google.cloud import storage + + ext = os.path.splitext(key)[1] or ".audio" + fd, local_path = tempfile.mkstemp(prefix="earnings-stt-handoff-", suffix=ext) + os.close(fd) + try: + _LOG.info("stt: downloading handoff audio gs://%s/%s -> %s", bucket, key, local_path) + storage.Client().bucket(bucket).blob(key).download_to_filename(local_path) + yield local_path + finally: + # The audio is transient — delete the temp file once transcription is done. + with contextlib.suppress(FileNotFoundError): + os.remove(local_path) + + #: Default STT tier — the hosted / our-infra source-of-truth model (D-27.5). _DEFAULT_TIER = "large-v3" #: Default device/compute for the L4 GPU image. 
@@ -90,41 +168,29 @@ def _segment_rows( return rows -def transcribe_call( - audio_path: str, +def _transcribe_local( + local_audio: str, *, ticker: str, call_id: str, - tier: str = _DEFAULT_TIER, - device: str = _DEFAULT_DEVICE, - compute_type: str = _DEFAULT_COMPUTE_TYPE, - initial_prompt: str | None = None, - publish_live: bool = False, - streaming_project: str | None = None, - streaming_topic: str = "earnings-streaming", -) -> dict[str, object]: - """Transcribe one transient audio file → transcript ledger (+ optional publish). - - The shared core used by BOTH the one-shot Cloud Run Job entrypoint - (:func:`main`) and the Cloud Run SERVICE HTTP handler - (:mod:`services.earnings.jobs.stt_server`). Returns a small audio-free summary - dict (segment/row counts, language, duration) — NEVER audio. Raises on a - transcription / ledger failure (fail loud). No audio reaches the ledger or wire. + tier: str, + device: str, + compute_type: str, + initial_prompt: str | None, +) -> object: + """Run the shipped transcriber on a LOCAL audio file; return its result. + + Split from :func:`transcribe_call` so the GCS-download / temp-file lifecycle + (:func:`_resolve_audio_reference`) wraps ONLY the transcription, and the temp + file is deleted the instant the model is done reading it. """ - if not os.path.exists(audio_path): - raise FileNotFoundError( - f"audio path {audio_path!r} does not exist — the capture job's transient " - "audio must be present on the shared ephemeral disk." - ) - # Lazy import: SttTranscriber lazy-imports faster-whisper inside transcribe, # so nothing heavy loads at module import. 
from mostlyright.weather.earnings.stt import SttTranscriber _LOG.info("stt start: ticker=%s call_id=%s tier=%s device=%s", ticker, call_id, tier, device) - transcriber = SttTranscriber(tier, device=device, compute_type=compute_type) - result = transcriber.transcribe(audio_path, initial_prompt=initial_prompt) + result = transcriber.transcribe(local_audio, initial_prompt=initial_prompt) _LOG.info( "stt transcribed: ticker=%s call_id=%s segments=%d language=%s duration=%s", ticker, @@ -133,6 +199,45 @@ def transcribe_call( result.language, result.duration, ) + return result + + +def transcribe_call( + audio_path: str, + *, + ticker: str, + call_id: str, + tier: str = _DEFAULT_TIER, + device: str = _DEFAULT_DEVICE, + compute_type: str = _DEFAULT_COMPUTE_TYPE, + initial_prompt: str | None = None, + publish_live: bool = False, + streaming_project: str | None = None, + streaming_topic: str = "earnings-streaming", + handoff_bucket: str | None = None, +) -> dict[str, object]: + """Transcribe one call's transient audio → transcript ledger (+ optional publish). + + The shared core used by BOTH the one-shot Cloud Run Job entrypoint + (:func:`main`) and the Cloud Run SERVICE HTTP handler + (:mod:`services.earnings.jobs.stt_server`). ``audio_path`` may be a ``gs://`` + handoff-bucket URI (the deployed capture→STT handoff), a bare object key + resolved against ``handoff_bucket``, or a local file path (operator / test) — + a GCS reference is downloaded to an ephemeral temp file that is deleted after + transcription. Returns a small audio-free summary dict (segment/row counts, + language, duration) — NEVER audio. Raises on a transcription / ledger failure + (fail loud). No audio reaches the ledger or wire. 
+ """ + with _resolve_audio_reference(audio_path, handoff_bucket=handoff_bucket) as local_audio: + result = _transcribe_local( + local_audio, + ticker=ticker, + call_id=call_id, + tier=tier, + device=device, + compute_type=compute_type, + initial_prompt=initial_prompt, + ) rows = _segment_rows(result.segments, ticker=ticker, call_id=call_id) @@ -198,6 +303,7 @@ def main(argv: list[str] | None = None) -> int: streaming_project=optional_env("EARNINGS_STREAMING_PROJECT"), streaming_topic=optional_env("EARNINGS_STREAMING_TOPIC", "earnings-streaming") or "earnings-streaming", + handoff_bucket=optional_env("AUDIO_HANDOFF_BUCKET"), ) return 0 diff --git a/services/earnings/jobs/stt_server.py b/services/earnings/jobs/stt_server.py index 9b88c47..349ff6f 100644 --- a/services/earnings/jobs/stt_server.py +++ b/services/earnings/jobs/stt_server.py @@ -29,6 +29,7 @@ from __future__ import annotations import logging +import os from typing import Annotated from fastapi import Body, FastAPI, HTTPException @@ -70,6 +71,11 @@ def transcribe(payload: Annotated[dict, Body(...)]) -> dict[str, object]: status_code=400, detail=f"missing required field(s): {', '.join(missing)}" ) + # The deployed STT service has AUDIO_HANDOFF_BUCKET set; a bare object key / + # gs:// reference in the body is resolved + downloaded against it. A request + # may also override it explicitly. 
+ handoff_bucket = payload.get("handoff_bucket") or os.environ.get("AUDIO_HANDOFF_BUCKET") + try: return transcribe_call( str(audio_path), @@ -82,6 +88,7 @@ def transcribe(payload: Annotated[dict, Body(...)]) -> dict[str, object]: publish_live=bool(payload.get("publish_live", False)), streaming_project=payload.get("streaming_project"), streaming_topic=str(payload.get("streaming_topic") or "earnings-streaming"), + handoff_bucket=handoff_bucket, ) except FileNotFoundError as exc: raise HTTPException(status_code=400, detail=str(exc)) from exc diff --git a/services/earnings/tests/test_jobs_entrypoints.py b/services/earnings/tests/test_jobs_entrypoints.py index 2018e34..7020bec 100644 --- a/services/earnings/tests/test_jobs_entrypoints.py +++ b/services/earnings/tests/test_jobs_entrypoints.py @@ -21,7 +21,10 @@ import builtins import importlib +import json +import os import sys +import types import pytest @@ -124,8 +127,146 @@ def capture(self, event, *, tmp_dir=None, **_): def test_capture_main_fails_loud_on_missing_env(monkeypatch: pytest.MonkeyPatch) -> None: + # Operator-override path: CAPTURE_TICKER is set but CAPTURE_CALL_ID is not. + monkeypatch.setenv("CAPTURE_TICKER", "CHWY") + monkeypatch.delenv("CAPTURE_CALL_ID", raising=False) + with pytest.raises(ValueError, match="CAPTURE_CALL_ID"): + capture_job.main() + + +def test_capture_main_fails_loud_without_subscription_or_override( + monkeypatch: pytest.MonkeyPatch, +) -> None: + # Neither the operator-override CAPTURE_TICKER nor CAPTURE_JOBS_SUBSCRIPTION is + # set: the deployed path resolver fails loud naming the subscription var. 
monkeypatch.delenv("CAPTURE_TICKER", raising=False) - with pytest.raises(ValueError, match="CAPTURE_TICKER"): + monkeypatch.delenv("CAPTURE_JOBS_SUBSCRIPTION", raising=False) + with pytest.raises(ValueError, match="CAPTURE_JOBS_SUBSCRIPTION"): + capture_job.main() + + +def test_capture_main_pulls_subscription_and_uploads_handoff( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """DEPLOYED path: pull ONE capture-job message, capture, upload to handoff bucket, ack.""" + out_dir = tmp_path / "cap" + out_dir.mkdir() + audio_path = str(out_dir / "audio.wav") + (out_dir / "audio.wav").write_bytes(b"fake-audio") + + # --- fake google-cloud-pubsub: one message carrying the per-call spec -------- + acked: dict[str, object] = {} + + class _FakeMessage: + def __init__(self, data: bytes) -> None: + self.data = data + + class _FakeReceived: + def __init__(self, data: bytes) -> None: + self.ack_id = "ack-123" + self.message = _FakeMessage(data) + + class _FakePullResponse: + def __init__(self, data: bytes) -> None: + self.received_messages = [_FakeReceived(data)] + + class _FakeSubscriber: + def pull(self, *, subscription, max_messages, return_immediately): + acked["subscription_pulled"] = subscription + spec = { + "ticker": "CHWY", + "call_id": "evt-1", + "webcast_url": "https://static.events.q4inc.com/x/y.mp4", + } + return _FakePullResponse(json.dumps(spec).encode("utf-8")) + + def acknowledge(self, *, subscription, ack_ids): + acked["ack_subscription"] = subscription + acked["ack_ids"] = list(ack_ids) + + fake_pubsub = types.SimpleNamespace(SubscriberClient=lambda: _FakeSubscriber()) + + # --- fake google-cloud-storage: capture the uploaded handoff object ---------- + uploaded: dict[str, object] = {} + + class _FakeBlob: + def __init__(self, name: str) -> None: + self._name = name + + def upload_from_filename(self, filename: str) -> None: + uploaded["blob"] = self._name + uploaded["local"] = filename + + class _FakeBucket: + def __init__(self, name: str) -> None: + 
self._name = name + + def blob(self, name: str) -> _FakeBlob: + uploaded["bucket"] = self._name + return _FakeBlob(name) + + class _FakeStorageClient: + def bucket(self, name: str) -> _FakeBucket: + return _FakeBucket(name) + + fake_storage = types.SimpleNamespace(Client=lambda: _FakeStorageClient()) + + # google.cloud.{pubsub_v1,storage} are lazy-imported inside capture.py; inject + # the fakes into sys.modules so the `from google.cloud import ...` picks them up. + monkeypatch.setitem(sys.modules, "google.cloud.pubsub_v1", fake_pubsub) + monkeypatch.setitem(sys.modules, "google.cloud.storage", fake_storage) + + class _FakeArtifactLocal: + def __init__(self) -> None: + self.audio_path = audio_path + self.ticker = "CHWY" + self.call_id = "evt-1" + self.source_media_url = "https://static.events.q4inc.com/x/y.mp4" + self.is_transient = True + + captured: dict[str, object] = {} + + class _FakeAdapter: + def capture(self, event, *, tmp_dir=None, **_): + captured["event"] = event + return _FakeArtifactLocal() + + import mostlyright.weather.earnings.capture.q4 as q4mod + + monkeypatch.setattr(q4mod, "Q4CaptureAdapter", _FakeAdapter) + # DEPLOYED env: no CAPTURE_TICKER override; subscription + handoff bucket set. + monkeypatch.delenv("CAPTURE_TICKER", raising=False) + monkeypatch.setenv("CAPTURE_JOBS_SUBSCRIPTION", "projects/p/subscriptions/capture-jobs") + monkeypatch.setenv("AUDIO_HANDOFF_BUCKET", "earnings-audio-handoff-123") + monkeypatch.setenv("CAPTURE_OUT_DIR", str(out_dir)) + + assert capture_job.main() == 0 + # The per-call spec was pulled off the subscription and rode into capture. + assert acked["subscription_pulled"] == "projects/p/subscriptions/capture-jobs" + assert captured["event"]["media_url"] == "https://static.events.q4inc.com/x/y.mp4" + # The transient audio was uploaded to the PRIVATE handoff bucket (never R2). 
+ assert uploaded["bucket"] == "earnings-audio-handoff-123" + assert uploaded["blob"] == "handoff/CHWY/evt-1.wav" + assert uploaded["local"] == audio_path + # The message was acked AFTER the successful capture + handoff. + assert acked["ack_ids"] == ["ack-123"] + + +def test_capture_main_missing_message_fails_loud(monkeypatch: pytest.MonkeyPatch) -> None: + """An empty subscription pull fails loud rather than silently no-op'ing.""" + + class _FakeSubscriber: + def pull(self, *, subscription, max_messages, return_immediately): + return types.SimpleNamespace(received_messages=[]) + + fake_pubsub = types.SimpleNamespace(SubscriberClient=lambda: _FakeSubscriber()) + monkeypatch.setitem(sys.modules, "google.cloud.pubsub_v1", fake_pubsub) + + monkeypatch.delenv("CAPTURE_TICKER", raising=False) + monkeypatch.setenv("CAPTURE_JOBS_SUBSCRIPTION", "projects/p/subscriptions/capture-jobs") + monkeypatch.setenv("AUDIO_HANDOFF_BUCKET", "earnings-audio-handoff-123") + + with pytest.raises(RuntimeError, match="no capture-job message"): capture_job.main() @@ -250,6 +391,134 @@ def test_stt_main_fails_loud_when_audio_missing(monkeypatch: pytest.MonkeyPatch, stt_job.main() +def test_stt_main_downloads_gs_handoff_before_transcribing( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """DEPLOYED path: STT_AUDIO_PATH is a gs:// handoff object → downloaded, transcribed.""" + cache = tmp_path / "cache" + handoff_bytes = b"RIFF-handoff-audio" + + downloaded: dict[str, object] = {} + + class _FakeBlob: + def __init__(self, name: str) -> None: + self._name = name + + def download_to_filename(self, local_path: str) -> None: + downloaded["blob"] = self._name + downloaded["local_path"] = local_path + # The download must produce a real local file the transcriber reads. 
+ with open(local_path, "wb") as fh: + fh.write(handoff_bytes) + + class _FakeBucket: + def __init__(self, name: str) -> None: + self._name = name + + def blob(self, name: str) -> _FakeBlob: + downloaded["bucket"] = self._name + return _FakeBlob(name) + + class _FakeStorageClient: + def bucket(self, name: str) -> _FakeBucket: + return _FakeBucket(name) + + fake_storage = types.SimpleNamespace(Client=lambda: _FakeStorageClient()) + monkeypatch.setitem(sys.modules, "google.cloud.storage", fake_storage) + + seen: dict[str, object] = {} + + class _FakeTranscriber: + def __init__(self, *a, **k): + pass + + def transcribe(self, audio_path, *, initial_prompt=None): + # The transcriber must receive the LOCAL downloaded temp path (which + # must EXIST at transcription time), never the gs:// reference. + seen["audio_path"] = audio_path + seen["existed_during_transcribe"] = os.path.exists(audio_path) + return _FakeTranscriptResult() + + import mostlyright.weather.earnings.stt as sttmod + + monkeypatch.setattr(sttmod, "SttTranscriber", _FakeTranscriber) + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + monkeypatch.setenv("STT_AUDIO_PATH", "gs://earnings-audio-handoff-123/handoff/CHWY/evt-1.wav") + monkeypatch.setenv("AUDIO_HANDOFF_BUCKET", "earnings-audio-handoff-123") + monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + monkeypatch.setenv("STT_TIER", "small") + monkeypatch.setenv("STT_DEVICE", "cpu") + monkeypatch.delenv("EARNINGS_STREAMING_ENABLED", raising=False) + + assert stt_job.main() == 0 + assert downloaded["bucket"] == "earnings-audio-handoff-123" + assert downloaded["blob"] == "handoff/CHWY/evt-1.wav" + # The transcriber saw a LOCAL path (not the gs:// ref) and it existed. + assert not str(seen["audio_path"]).startswith("gs://") + assert seen["existed_during_transcribe"] is True + # The transient temp file is cleaned up after transcription (D-27.9). 
+ assert not os.path.exists(downloaded["local_path"]) + + from mostlyright.weather.earnings.ledger import TranscriptLedger + + rows = TranscriptLedger().read("CHWY", "evt-1") + assert len(rows) == 2 + + +def test_stt_transcribe_call_bare_key_resolves_against_handoff_bucket( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """A bare object key (not gs://) resolves against handoff_bucket + downloads.""" + cache = tmp_path / "cache" + + class _FakeBlob: + def __init__(self, name: str) -> None: + self._name = name + + def download_to_filename(self, local_path: str) -> None: + with open(local_path, "wb") as fh: + fh.write(b"audio") + + seen: dict[str, object] = {} + + class _FakeBucket: + def __init__(self, name: str) -> None: + seen["bucket"] = name + + def blob(self, name: str) -> _FakeBlob: + seen["blob"] = name + return _FakeBlob(name) + + fake_storage = types.SimpleNamespace( + Client=lambda: types.SimpleNamespace(bucket=lambda name: _FakeBucket(name)) + ) + monkeypatch.setitem(sys.modules, "google.cloud.storage", fake_storage) + + import mostlyright.weather.earnings.stt as sttmod + + monkeypatch.setattr( + sttmod, + "SttTranscriber", + lambda *a, **k: types.SimpleNamespace( + transcribe=lambda audio_path, *, initial_prompt=None: _FakeTranscriptResult() + ), + ) + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + + out = stt_job.transcribe_call( + "handoff/CHWY/evt-2.wav", + ticker="CHWY", + call_id="evt-2", + tier="small", + device="cpu", + handoff_bucket="earnings-audio-handoff-123", + ) + assert seen["bucket"] == "earnings-audio-handoff-123" + assert seen["blob"] == "handoff/CHWY/evt-2.wav" + assert out["ticker"] == "CHWY" + + def test_stt_main_live_publish_opt_in(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: cache = tmp_path / "cache" audio = tmp_path / "audio.wav" @@ -401,3 +670,33 @@ def _fake_upload(local_path, bucket, key, *, r2_target=None): assert uploaded["key"] == "earnings/facts/CHWY/evt-1.parquet" assert 
uploaded["local_path"].endswith("evt-1.parquet") assert "audio" not in uploaded["local_path"].lower() + + +def test_rolefact_main_uploads_via_infra_r2_bucket_env( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """The infra env name R2_BUCKET (not ROLEFACT_R2_BUCKET) enables + targets the upload.""" + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + _seed_transcript(cache, "CHWY", "evt-9", ["tariff here"]) + + uploaded: dict[str, object] = {} + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _fake_upload(local_path, bucket, key, *, r2_target=None): + uploaded["bucket"] = bucket + uploaded["key"] = key + return key + + monkeypatch.setattr(sink, "upload", _fake_upload) + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-9") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + # infra sets R2_BUCKET (NOT ROLEFACT_R2_BUCKET) on the rolefact Cloud Run Job. + monkeypatch.delenv("ROLEFACT_R2_BUCKET", raising=False) + monkeypatch.setenv("R2_BUCKET", "mostlyright-derived") + + assert rolefact_job.main() == 0 + assert uploaded["bucket"] == "mostlyright-derived" + assert uploaded["key"] == "earnings/facts/CHWY/evt-9.parquet" From 9a34fa5b9c31829aac0920c3d7cd7f409600eb25 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 20:02:21 +0200 Subject: [PATCH 07/17] =?UTF-8?q?fix(28):=20review=20round=203=20=E2=80=94?= =?UTF-8?q?=20roster=20HKO=20hole,=20STT=20handoff-audio=20delete,=20progr?= =?UTF-8?q?ess=20store?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round-3 (gpt-5.5) P1/P2: - P1: HKO (Hong Kong Observatory pseudo-station) has no satellite StationInfo, so its shard resolved to zero partitions (silent data hole). 
Exclude it via _roster._NON_SATELLITE_STATIONS -> the satellite roster is 65 (66 union minus HKO); task_count 66->65 in run-weather-backfill.yml + infra/batch.tf; a new test asserts EVERY roster station resolves (no empty shards). - P1: STT now DELETES the source handoff object in AUDIO_HANDOFF_BUCKET after a successful transcription (kept on failure for retry) — raw audio no longer accumulates in GCS (D-27.9). - P2: wire --progress-bucket to a durable GcsProgressStore (shard-disjoint marker URI) so preempted Spot slices rehydrate markers from GCS instead of reprocessing. --- .github/workflows/run-weather-backfill.yml | 12 ++- infra/batch.tf | 7 +- .../mostlyright/weather/satellite/__main__.py | 32 ++++---- .../mostlyright/weather/satellite/_roster.py | 73 ++++++++++++------- .../tests/satellite/test_cli_roster.py | 28 ++++++- .../weather/tests/satellite/test_roster.py | 67 ++++++++++++----- services/earnings/jobs/stt.py | 27 ++++++- 7 files changed, 173 insertions(+), 73 deletions(-) diff --git a/.github/workflows/run-weather-backfill.yml b/.github/workflows/run-weather-backfill.yml index aa7b948..6c44fe2 100644 --- a/.github/workflows/run-weather-backfill.yml +++ b/.github/workflows/run-weather-backfill.yml @@ -6,8 +6,9 @@ name: Run weather backfill fleet (28-21) # # ROLLOUT GATE (operator sequence): the DEFAULT run is a 1-STATION PILOT # (task_count=1) — cheap, proves the read→reduce→R2-upload loop + the big-bytes -# firewall end to end. The FULL 66-shard fleet (the ~28 TB Kalshi∪Polymarket -# roster, D-28.8) reduces ~28 TB of raw imagery in-region and is the phase's +# firewall end to end. The FULL 65-shard fleet (the ~28 TB Kalshi∪Polymarket +# roster minus the non-satellite HKO, D-28.8) reduces ~28 TB of raw imagery +# in-region and is the phase's # largest spend — it is BLOCKED unless the operator sets mode=full AND # confirm_cost_signoff=true (the H5 pilot cost sign-off). 
This encodes the # "serving + incremental first → 1-station pilot → stop at the 28 TB cost number @@ -108,8 +109,11 @@ jobs: set -euo pipefail IMAGE="${AR_HOST}/${AR_PROJECT}/${AR_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" if [ "$MODE" = "full" ]; then - TASK_COUNT=66 - # Roster mode: the CLI resolves + shards the 66-station roster by + # 65 = the Kalshi∪Polymarket union (66) minus the one non-satellite + # station HKO (_roster._NON_SATELLITE_STATIONS); one shard per + # satellite-resolvable station so no shard resolves to zero partitions. + TASK_COUNT=65 + # Roster mode: the CLI resolves + shards the 65-station roster by # BATCH_TASK_INDEX and supplies satellite/product/year defaults. COMMANDS=$(jq -nc --arg pb "$PROGRESS_BUCKET" --arg rb "$R2_BUCKET" \ '["--mirror","gcp","--roster","kalshi,polymarket","--progress-bucket",$pb,"--r2-bucket",$rb]') diff --git a/infra/batch.tf b/infra/batch.tf index 75d0118..614785a 100644 --- a/infra/batch.tf +++ b/infra/batch.tf @@ -80,7 +80,12 @@ resource "google_batch_job" "weather_backfill" { # a fresh job (with a run-scoped name) at execution time. This resource is the # canonical SPEC. Task count is the shard count (roster-driven). task_groups { - task_count = 66 # ~Kalshi∪Polymarket roster (D-28.8); one shard per station + # 65 = the Kalshi∪Polymarket roster (66) MINUS the one non-satellite station + # HKO (satellite/_roster._NON_SATELLITE_STATIONS: HKO has no satellite + # StationInfo, so a shard for it would resolve to zero partitions). One shard + # per satellite-resolvable station (D-28.8). run-weather-backfill.yml (the + # actual submitter) uses the SAME 65; keep them in lockstep with the roster. 
+ task_count = 65 parallelism = 16 # bounded concurrent Spot slices task_spec { diff --git a/packages/weather/src/mostlyright/weather/satellite/__main__.py b/packages/weather/src/mostlyright/weather/satellite/__main__.py index aafbf00..bb2151a 100644 --- a/packages/weather/src/mostlyright/weather/satellite/__main__.py +++ b/packages/weather/src/mostlyright/weather/satellite/__main__.py @@ -151,8 +151,8 @@ def _build_parser() -> argparse.ArgumentParser: dest="progress_bucket", metavar="BUCKET", help="Durable GCS completion-marker bucket for crash-safe resume (C4). " - "Accepted in roster/incremental mode; see the 28-21 C4 note in " - "_run_backfill for its current wiring status.", + "Wired to a durable GcsProgressStore in roster/incremental mode so preempted " + "Spot slices rehydrate markers from GCS and skip completed partitions.", ) bf.add_argument( "--incremental", @@ -335,7 +335,7 @@ def _run_backfill(args: argparse.Namespace) -> int: stations = list(shard_roster(roster, index, count)) satellites = args.satellites or list(_DEFAULT_ROSTER_SATELLITES) products = args.products or list(_DEFAULT_ROSTER_PRODUCTS) - # Coverage guard (no SILENT under-coverage): the 66-station roster spans + # Coverage guard (no SILENT under-coverage): the settlement-station roster spans # the globe, but the GOES-only default satellites see only the Americas / # E-Pacific. Warn LOUDLY (stderr, visible in Cloud Batch logs) for any # shard station outside the GOES footprint so a `mode=full` operator is not @@ -397,20 +397,20 @@ def _run_backfill(args: argparse.Namespace) -> int: if r2_enabled: kwargs["r2_target"] = args.r2_bucket - # C4 (28-21): --progress-bucket is the durable GCS completion-marker bucket - # for crash-safe resume. bulk_backfill takes a pluggable ``progress_store`` - # (not a bucket NAME), and the GCS-backed ProgressStore is NOT yet wired to a - # bucket string here, so we thread it only as far as validation and record - # the gap loudly rather than silently dropping it. 
- # TODO(28-21 C4): GCS marker bucket not yet consumed by bulk_backfill — - # construct a GCS-backed _progress.ProgressStore(args.progress_bucket) and - # pass it as progress_store= once the durable store lands. + # C4 (28-21): --progress-bucket is the durable GCS completion-marker bucket for + # crash-safe resume. Wire it to the GCS-backed progress store so a preempted + # Spot slice (or a replacement VM) rehydrates its markers from GCS and SKIPS + # already-uploaded partitions instead of reprocessing under a fresh local disk. + # Each shard writes a DISJOINT marker object (shard-index in the key) so array + # tasks never clobber each other's progress. if args.progress_bucket is not None: - print( - f"note: --progress-bucket {args.progress_bucket!r} accepted but not yet " - f"consumed by bulk_backfill (28-21 C4 follow-up); using the local JSON " - f"progress file under --out for this run." - ) + from ._progress import GcsProgressStore + + shard_index, _shard_count = _resolve_shard_index_count(args) + bucket = args.progress_bucket.rstrip("/") + gcs_uri = f"gs://{bucket}/progress/shard-{shard_index:05d}/progress.json" + kwargs["progress_store"] = GcsProgressStore(gcs_uri) + print(f"using durable GCS progress store for crash-safe resume: {gcs_uri}") result = bulk_backfill(**kwargs) print( diff --git a/packages/weather/src/mostlyright/weather/satellite/_roster.py b/packages/weather/src/mostlyright/weather/satellite/_roster.py index 73a4d81..9eeb1a1 100644 --- a/packages/weather/src/mostlyright/weather/satellite/_roster.py +++ b/packages/weather/src/mostlyright/weather/satellite/_roster.py @@ -1,34 +1,38 @@ """Committed Kalshi/Polymarket settlement-station roster for the fleet backfill. Phase 28 (28-21). The hosted weather backfill runs as a Cloud Batch ARRAY JOB: -``task_count = 66`` array tasks, one shard per settlement station (D-28.8). 
Each -array task resolves its shard from a STABLE, DETERMINISTIC roster so that shard -``N`` always maps to the same station across Spot retries — a stable -shard-index→station mapping is load-bearing for crash-safe resume (a retried +one shard per SATELLITE-RESOLVABLE settlement station (``task_count = 65``: the +66-station Kalshi∪Polymarket union MINUS the one non-satellite station HKO — see +``_NON_SATELLITE_STATIONS``). Each array task resolves its shard from a STABLE, +DETERMINISTIC roster so that shard +``N`` always maps to the same station across Spot retries — a stable +shard-index→station mapping is load-bearing for crash-safe resume (a retried shard must re-derive the SAME station's partitions, never a different one). **Source of truth (why this is a committed snapshot, NOT a runtime import).** The canonical roster is the union of two live ``markets``-package catalogs: - - ``markets.catalog.kalshi_stations.KALSHI_SETTLEMENT_STATIONS`` — the Kalshi + - ``markets.catalog.kalshi_stations.KALSHI_SETTLEMENT_STATIONS`` — the Kalshi NHIGH/NLOW settlement stations (values are ``StationCitation`` objects whose ``.station`` is the 4-letter ICAO). - - ``markets.polymarket.load_polymarket_city_stations()`` — the Polymarket - city→role→ICAO map (inner values are the ICAO strings). + - ``markets.polymarket.load_polymarket_city_stations()`` — the Polymarket + city→role→ICAO map (inner values are the ICAO strings). -The union of those two — sorted, deduped — is EXACTLY the 66 ICAOs frozen below -(D-28.8; matches ``infra/batch.tf`` ``task_count = 66``). We snapshot it here, -in ``packages/weather``, rather than importing ``markets`` at runtime because: +The union of those two — sorted, deduped — is 66 ICAOs; the satellite roster +frozen below is those 66 MINUS ``_NON_SATELLITE_STATIONS`` (HKO), i.e. the 65 +that resolve to a satellite ``StationInfo`` (D-28.8; ``infra/batch.tf`` +``task_count = 65``). 
We snapshot it here, in ``packages/weather``, rather than +importing ``markets`` at runtime because: 1. The satellite/weather deploy image MUST NOT pull the ``markets`` package - (dependency + audit isolation — the weather backfill has no business + (dependency + audit isolation — the weather backfill has no business importing the markets catalogs on the fleet). 2. A frozen roster is deterministic and reviewable; drift is caught in CI. ``tests/satellite/test_roster.py`` imports the LIVE ``markets`` catalogs and asserts this snapshot still equals their sorted union, so any catalog drift (a station added/removed upstream) fails CI loudly and forces a conscious -re-snapshot here — the roster can never silently diverge from the markets truth. +re-snapshot here — the roster can never silently diverge from the markets truth. """ from __future__ import annotations @@ -40,11 +44,13 @@ "shard_roster", ] -#: The canonical Kalshi/Polymarket settlement-station roster (D-28.8): the 66 +#: The canonical Kalshi/Polymarket SATELLITE-backfill roster (D-28.8): the 65 #: 4-letter ICAO codes that are the union of the live Kalshi + Polymarket -#: settlement catalogs, SORTED and deduped. Verified against the live ``markets`` -#: union by ``test_roster.py`` (drift fails CI). Sorted + frozen so the -#: shard-index→station mapping is stable across array-task retries. +#: settlement catalogs (66) MINUS ``_NON_SATELLITE_STATIONS`` (HKO, which has no +#: satellite StationInfo), SORTED and deduped. Verified against the live +#: ``markets`` union by ``test_roster.py`` (both drift AND non-resolvable-station +#: regressions fail CI). Sorted + frozen so the shard-index→station mapping is +#: stable across array-task retries. SETTLEMENT_STATION_ROSTER: tuple[str, ...] = ( "CYYZ", "EDDM", @@ -53,7 +59,6 @@ "EHAM", "EPWA", "FACT", - "HKO", "KATL", "KAUS", "KBKF", @@ -116,10 +121,10 @@ # Committed sub-snapshot: the Kalshi-only membership of the union. 
This is NOT a -# second source of truth — ``test_roster.py`` asserts (a) the full union equals +# second source of truth — ``test_roster.py`` asserts (a) the full union equals # the live Kalshi/Polymarket catalog and (b) this set equals every live Kalshi # settlement ICAO, so drift in either fails CI. The two markets overlap (many -# stations appear in both), which is why the union — not the sum — is 66. The +# stations appear in both), which is why the union — not the sum — is 66 (the satellite roster is 65: that's 66 minus HKO). The # split rosters below are derived from this set so they stay in lockstep. _KALSHI_STATIONS: frozenset[str] = frozenset( { @@ -150,9 +155,9 @@ # Committed sub-snapshot: the Polymarket-only membership of the union (the inner # ICAOs of ``load_polymarket_city_stations()``). Like ``_KALSHI_STATIONS`` this is -# NOT a second source of truth — ``test_roster.py`` asserts it equals the live -# Polymarket catalog. Kalshi + Polymarket == the 66 (the two overlap), which is -# why ``len(_KALSHI_STATIONS)`` (21) + ``len(_POLYMARKET_STATIONS)`` (51) != 66. +# NOT a second source of truth — ``test_roster.py`` asserts it equals the live +# Polymarket catalog. Kalshi + Polymarket == 66 (the two overlap); the satellite +# roster is those 66 minus HKO (65). len(_KALSHI) 21 + len(_POLYMARKET) 51 != 66. _POLYMARKET_STATIONS: frozenset[str] = frozenset( { "CYYZ", @@ -210,9 +215,23 @@ ) +#: Settlement stations that appear in the market catalogs but have NO satellite +#: ``StationInfo`` in the SDK station registry, so a satellite backfill shard for +#: them would resolve to ZERO partitions (a silent data hole in the fleet). They +#: are EXCLUDED from the satellite roster. ``HKO`` is the Hong Kong Observatory +#: pseudo-identifier (NOT a standard ICAO — Hong Kong's airport is ``VHHH``); its +#: markets settle against the HKO Open Data API, not satellite imagery, so it is a +#: deferred non-satellite station (project HKO-unblock). 
``test_roster.py`` asserts +#: every ROSTER station DOES resolve, so this list stays honest against the +#: registry — a newly-unresolvable catalog station fails CI rather than adding a +#: silent empty shard. +_NON_SATELLITE_STATIONS: frozenset[str] = frozenset({"HKO"}) + + #: The CLI ``--roster NAME`` registry. ``batch.tf`` passes the literal -#: ``"kalshi,polymarket"`` (the full union). The split names are provided as a -#: convenience; every value is a sorted slice of the committed snapshot. +#: ``"kalshi,polymarket"`` (the full union MINUS non-satellite stations). The split +#: names are provided as a convenience; every value is a sorted slice of the +#: committed snapshot. ROSTERS: dict[str, tuple[str, ...]] = { "kalshi,polymarket": SETTLEMENT_STATION_ROSTER, "kalshi": tuple(s for s in SETTLEMENT_STATION_ROSTER if s in _KALSHI_STATIONS), @@ -247,9 +266,9 @@ def shard_roster(roster: tuple[str, ...], index: int, count: int) -> tuple[str, Round-robin slice (``roster[index::count]``). Round-robin keeps every shard NON-EMPTY whenever ``count <= len(roster)`` (each of the first ``len(roster)`` - shards gets ≥1 station), and — because it is a pure function of - ``(index, count)`` — shard ``index`` maps to the SAME stations across retries. - For ``count == len(roster)`` (the batch.tf ``task_count = 66`` case) each shard + shards gets ≥1 station), and — because it is a pure function of + ``(index, count)`` — shard ``index`` maps to the SAME stations across retries. + For ``count == len(roster)`` (the batch.tf ``task_count = 65`` case) each shard is exactly one station. 
Args: diff --git a/packages/weather/tests/satellite/test_cli_roster.py b/packages/weather/tests/satellite/test_cli_roster.py index 1af7864..fdd8a9b 100644 --- a/packages/weather/tests/satellite/test_cli_roster.py +++ b/packages/weather/tests/satellite/test_cli_roster.py @@ -176,8 +176,24 @@ def test_roster_and_stations_mutually_exclusive(captured): assert captured == [] -def test_progress_bucket_accepted(captured): - """--progress-bucket is accepted without error (C4 threading TODO).""" +def test_progress_bucket_wires_gcs_progress_store(captured, monkeypatch): + """--progress-bucket constructs a durable GcsProgressStore and threads it in (C4). + + Each shard gets a DISJOINT marker object (shard-index in the key) so array + tasks never clobber. The store is faked (no real GCS I/O at construction). + """ + built: list[str] = [] + + class _FakeStore: + def __init__(self, gcs_uri, *, fs=None): + built.append(gcs_uri) + + # The CLI does `from ._progress import GcsProgressStore` inside _run_backfill. + # Patch on the module object (the string path would resolve the `satellite` + # public function, not the submodule). + from mostlyright.weather.satellite import _progress + + monkeypatch.setattr(_progress, "GcsProgressStore", _FakeStore) rc = cli.main( [ "backfill", @@ -190,13 +206,17 @@ def test_progress_bucket_accepted(captured): "--r2-bucket", "b", "--shard-index", - "0", + "3", "--shard-count", - "66", + "65", ] ) assert rc == 0 assert len(captured) == 1 + # A durable GCS progress store was wired into bulk_backfill's kwargs... + assert isinstance(captured[0]["progress_store"], _FakeStore) + # ...at a shard-disjoint marker URI. 
+ assert built == ["gs://marker-bkt/progress/shard-00003/progress.json"] def test_explicit_mode_unchanged(captured, tmp_path): diff --git a/packages/weather/tests/satellite/test_roster.py b/packages/weather/tests/satellite/test_roster.py index fe864f2..cca45c8 100644 --- a/packages/weather/tests/satellite/test_roster.py +++ b/packages/weather/tests/satellite/test_roster.py @@ -11,6 +11,7 @@ import pytest from mostlyright.weather.satellite._roster import ( + _NON_SATELLITE_STATIONS, ROSTERS, SETTLEMENT_STATION_ROSTER, resolve_roster, @@ -30,26 +31,52 @@ def _live_polymarket_stations() -> set[str]: return {icao for roles in load_polymarket_city_stations().values() for icao in roles.values()} -def test_roster_equals_live_union() -> None: - """The committed roster == the sorted live Kalshi/Polymarket union (drift gate).""" - live_union = sorted(_live_kalshi_stations() | _live_polymarket_stations()) - assert list(SETTLEMENT_STATION_ROSTER) == live_union +def _live_satellite_union() -> list[str]: + """The live Kalshi/Polymarket union MINUS the non-satellite stations (sorted).""" + union = _live_kalshi_stations() | _live_polymarket_stations() + return sorted(union - _NON_SATELLITE_STATIONS) -def test_roster_count_is_66() -> None: - """D-28.8 / batch.tf task_count = 66.""" - assert len(SETTLEMENT_STATION_ROSTER) == 66 +def test_roster_equals_live_satellite_union() -> None: + """The committed roster == live Kalshi/Polymarket union minus non-sat (drift gate).""" + assert list(SETTLEMENT_STATION_ROSTER) == _live_satellite_union() + + +def test_roster_count_is_65() -> None: + """D-28.8 / batch.tf task_count = 65 (66-station union minus the non-satellite HKO).""" + assert len(SETTLEMENT_STATION_ROSTER) == 65 + + +def test_non_satellite_stations_excluded_but_in_live_catalog() -> None: + """HKO is a REAL live settlement station but excluded (no satellite StationInfo).""" + assert "HKO" in _NON_SATELLITE_STATIONS + # It IS a live catalog station (so this is a conscious exclusion, 
not drift)... + assert "HKO" in _live_polymarket_stations() + # ...but it is NOT in the satellite roster (would be a zero-partition shard). + for station in _NON_SATELLITE_STATIONS: + assert station not in SETTLEMENT_STATION_ROSTER + + +def test_every_roster_station_resolves_to_a_satellite_station() -> None: + """No silent empty shards: every roster station resolves to a StationInfo.""" + from mostlyright.weather.satellite._backfill import _resolve_station_infos + + for station in SETTLEMENT_STATION_ROSTER: + infos = _resolve_station_infos([station]) + assert infos, ( + f"roster station {station!r} resolves to NO satellite StationInfo (empty shard)" + ) def test_every_kalshi_settlement_station_present() -> None: - """Every live Kalshi NHIGH/NLOW settlement station is in the roster.""" + """Every live Kalshi NHIGH/NLOW settlement station is in the roster (all satellite-visible).""" for station in _live_kalshi_stations(): assert station in SETTLEMENT_STATION_ROSTER -def test_every_polymarket_station_present() -> None: - """Every live Polymarket city ICAO is in the roster.""" - for station in _live_polymarket_stations(): +def test_every_satellite_polymarket_station_present() -> None: + """Every live Polymarket city ICAO EXCEPT the non-satellite ones is in the roster.""" + for station in _live_polymarket_stations() - _NON_SATELLITE_STATIONS: assert station in SETTLEMENT_STATION_ROSTER @@ -60,16 +87,18 @@ def test_roster_sorted_and_deduped() -> None: def test_resolve_roster_kalshi_polymarket() -> None: - """The batch.tf literal 'kalshi,polymarket' resolves to the 66.""" + """The batch.tf literal 'kalshi,polymarket' resolves to the 65 satellite stations.""" resolved = resolve_roster("kalshi,polymarket") assert resolved == SETTLEMENT_STATION_ROSTER - assert len(resolved) == 66 + assert len(resolved) == 65 def test_resolve_roster_splits_match_live_catalogs() -> None: - """The convenience split rosters match the live per-market catalogs.""" - assert 
set(resolve_roster("kalshi")) == _live_kalshi_stations() - assert set(resolve_roster("polymarket")) == _live_polymarket_stations() + """The convenience split rosters match the live per-market catalogs (minus non-sat).""" + assert set(resolve_roster("kalshi")) == _live_kalshi_stations() - _NON_SATELLITE_STATIONS + assert ( + set(resolve_roster("polymarket")) == _live_polymarket_stations() - _NON_SATELLITE_STATIONS + ) # Split rosters stay sorted slices of the union. for name in ("kalshi", "polymarket"): r = resolve_roster(name) @@ -83,7 +112,7 @@ def test_resolve_roster_unknown_raises() -> None: resolve_roster("nope") -@pytest.mark.parametrize("count", [66, 8]) +@pytest.mark.parametrize("count", [65, 8]) def test_shard_roster_partitions_with_no_overlap_full_coverage(count: int) -> None: """Sharding partitions the roster: no overlap, full coverage across all shards.""" roster = SETTLEMENT_STATION_ROSTER @@ -96,7 +125,7 @@ def test_shard_roster_partitions_with_no_overlap_full_coverage(count: int) -> No assert len(seen) == len(set(seen)) # no overlap -def test_shard_roster_count_66_gives_one_station_each() -> None: +def test_shard_roster_full_count_gives_one_station_each() -> None: """With count == len(roster) every shard is exactly one station (batch.tf case).""" roster = SETTLEMENT_STATION_ROSTER for index in range(len(roster)): @@ -114,7 +143,7 @@ def test_shard_roster_deterministic() -> None: def test_shard_roster_nonempty_when_count_le_len() -> None: """Round-robin keeps every shard non-empty when count <= len(roster).""" roster = SETTLEMENT_STATION_ROSTER - for count in (8, 66): + for count in (8, 65): for index in range(count): assert len(shard_roster(roster, index, count)) >= 1 diff --git a/services/earnings/jobs/stt.py b/services/earnings/jobs/stt.py index 8ba6b1c..233fbd6 100644 --- a/services/earnings/jobs/stt.py +++ b/services/earnings/jobs/stt.py @@ -115,15 +115,38 @@ def _resolve_audio_reference(reference: str, *, handoff_bucket: str | None) -> I # Lazy 
import: google-cloud-storage only when a GCS handoff object is fetched. from google.cloud import storage + blob = storage.Client().bucket(bucket).blob(key) ext = os.path.splitext(key)[1] or ".audio" fd, local_path = tempfile.mkstemp(prefix="earnings-stt-handoff-", suffix=ext) os.close(fd) try: _LOG.info("stt: downloading handoff audio gs://%s/%s -> %s", bucket, key, local_path) - storage.Client().bucket(bucket).blob(key).download_to_filename(local_path) + blob.download_to_filename(local_path) yield local_path + except BaseException: + # Transcription (or download) FAILED — KEEP the source handoff object so a + # job/service retry can re-fetch it. Only the local temp is cleaned below. + raise + else: + # Transcription SUCCEEDED — the raw handoff audio is transient and must NOT + # accumulate in the private bucket (D-27.9 firewall). Delete the SOURCE + # object; a delete failure is logged loudly (the bucket lifecycle policy is + # the backstop) but does not fail the already-successful transcription. + try: + blob.delete() + _LOG.info( + "stt: deleted transient handoff object gs://%s/%s (post-transcription)", bucket, key + ) + except Exception: + _LOG.exception( + "stt: FAILED to delete transient handoff object gs://%s/%s — raw audio " + "must not linger (D-27.9); ensure the AUDIO_HANDOFF_BUCKET lifecycle " + "policy reaps it", + bucket, + key, + ) finally: - # The audio is transient — delete the temp file once transcription is done. + # The local temp is transient — delete it once transcription is done. 
with contextlib.suppress(FileNotFoundError): os.remove(local_path) From 61898c7c3ccb4d0bd07ee850f7b9adff0c2c94b4 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 20:10:18 +0200 Subject: [PATCH 08/17] =?UTF-8?q?fix(28):=20review=20round=204=20=E2=80=94?= =?UTF-8?q?=20STT=20delete-after-ledger=20+=20cross-project=20subscription?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round-4 (gpt-5.5) 2 P1: - STT deletes the handoff audio object ONLY after the transcript is durably written (TranscriptLedger.append) + any live publish — moved out of the download context manager into _delete_handoff_source. A ledger-write failure now KEEPS the source audio for retry instead of stranding the call. New tests assert the download->transcribe->ledger->delete order and the skip-on-failure. - earnings-serving lifespan parses the INGEST project + bare id from the FULL subscription resource path the infra sets (projects//subscriptions/); the serving instance's GOOGLE_CLOUD_PROJECT is the wrong project for the cross-project earnings-streaming subscription. New parse tests cover both forms. --- services/earnings/app.py | 44 ++++++-- services/earnings/jobs/stt.py | 81 ++++++++------ .../tests/test_healthz_and_lifespan.py | 38 ++++++- .../earnings/tests/test_stt_handoff_delete.py | 104 ++++++++++++++++++ 4 files changed, 221 insertions(+), 46 deletions(-) create mode 100644 services/earnings/tests/test_stt_handoff_delete.py diff --git a/services/earnings/app.py b/services/earnings/app.py index 52dd40a..b6b0831 100644 --- a/services/earnings/app.py +++ b/services/earnings/app.py @@ -149,6 +149,33 @@ def close(self, call_id: str) -> object: return self._registry.get_or_create(call_id).close(call_id) # type: ignore[attr-defined] +def _parse_streaming_subscription(value: str) -> tuple[str, str]: + """Resolve ``(project, subscription_id)`` from ``EARNINGS_STREAMING_SUBSCRIPTION``. 
+ + The deployed infra sets this to the FULL cross-project resource path + ``projects//subscriptions/`` (the + ``google_pubsub_subscription.id``), so the INGEST project + bare id are parsed + from it — the serving instance's own ``GOOGLE_CLOUD_PROJECT`` is the serving + project, the WRONG one for this cross-project subscription. A BARE subscription + id (manual / test) still works when ``EARNINGS_INGEST_PROJECT`` (or + ``GOOGLE_CLOUD_PROJECT``) names the owning project. + """ + match = re.fullmatch(r"projects/([^/]+)/subscriptions/([^/]+)", value) + if match: + return match.group(1), match.group(2) + project = ( + os.environ.get(INGEST_PROJECT_ENV) or os.environ.get("GOOGLE_CLOUD_PROJECT") or "" + ).strip() + if not project: + raise RuntimeError( + f"{STREAMING_SUBSCRIPTION_ENV}={value!r} is a bare subscription id but no " + f"ingest project is configured — provide the full " + f"projects//subscriptions/ resource path (the infra default), " + f"or set {INGEST_PROJECT_ENV}." + ) + return project, value + + @contextlib.asynccontextmanager async def _streaming_lifespan(app: FastAPI) -> AsyncIterator[None]: """Start/stop the cross-project earnings-streaming subscriber (28-12 C2, H2). @@ -169,15 +196,12 @@ async def _streaming_lifespan(app: FastAPI) -> AsyncIterator[None]: if not subscription: yield return - project = ( - os.environ.get(INGEST_PROJECT_ENV) or os.environ.get("GOOGLE_CLOUD_PROJECT") or "" - ).strip() - if not project: - raise RuntimeError( - f"{STREAMING_SUBSCRIPTION_ENV} is set but no ingest project is configured — " - f"set {INGEST_PROJECT_ENV} (or GOOGLE_CLOUD_PROJECT) to the project that owns " - "the earnings-streaming topic (cross-project, C2)." 
- ) + # The deployed infra sets EARNINGS_STREAMING_SUBSCRIPTION to the FULL + # cross-project resource path (projects//subscriptions/) and does + # NOT set EARNINGS_INGEST_PROJECT — the serving instance runs in the SERVING + # project, so GOOGLE_CLOUD_PROJECT is the WRONG project for this cross-project + # subscription. Parse the ingest project + bare id from the resource path. + project, subscription_id = _parse_streaming_subscription(subscription) # Lazy-import the GCP-SDK-backed bridge factories so importing this module (and # the ledger-only default deploy) never needs google-cloud-pubsub. from .pubsub_bridge import ( @@ -192,7 +216,7 @@ async def _streaming_lifespan(app: FastAPI) -> AsyncIterator[None]: _RegistryBusAdapter(registry), # type: ignore[arg-type] run_on_loop=make_run_coroutine_threadsafe(registry.serving_loop), ) - streaming_pull = build_streaming_pull(project, subscription) + streaming_pull = build_streaming_pull(project, subscription_id) def _run() -> None: try: diff --git a/services/earnings/jobs/stt.py b/services/earnings/jobs/stt.py index 233fbd6..84de7ef 100644 --- a/services/earnings/jobs/stt.py +++ b/services/earnings/jobs/stt.py @@ -91,14 +91,19 @@ def _split_gs_uri(reference: str, *, handoff_bucket: str | None) -> tuple[str, s @contextlib.contextmanager -def _resolve_audio_reference(reference: str, *, handoff_bucket: str | None) -> Iterator[str]: - """Yield a LOCAL path for ``reference``, downloading a GCS handoff object if needed. +def _resolve_audio_reference( + reference: str, *, handoff_bucket: str | None +) -> Iterator[tuple[str, tuple[str, str] | None]]: + """Yield ``(local_path, source)``, downloading a GCS handoff object if needed. In the deployed topology STT receives a ``gs://`` reference to a private handoff-bucket object (capture and STT do not share a disk). 
This downloads it - to an ephemeral temp file, yields the local path, then DELETES the temp file on - exit (the audio is transient — it never persists past transcription; D-27.9). A - local path is yielded unchanged (operator / test / GCE MIG fallback). + to an ephemeral temp file, yields ``(local_path, (bucket, key))``, and DELETES + the LOCAL temp on exit (the local copy never outlives transcription; D-27.9). + It does NOT delete the SOURCE object — the caller deletes it only AFTER the + transcript is durably written (see :func:`transcribe_call`), so a ledger-write + failure still leaves the audio retryable. A local path yields + ``(reference, None)`` (operator / test / GCE MIG fallback). """ gs = _split_gs_uri(reference, handoff_bucket=handoff_bucket) if gs is None: @@ -108,49 +113,50 @@ def _resolve_audio_reference(reference: str, *, handoff_bucket: str | None) -> I "audio must be present on the shared ephemeral disk (local path), or the " "reference must be a gs:// handoff object in AUDIO_HANDOFF_BUCKET." ) - yield reference + yield reference, None return bucket, key = gs # Lazy import: google-cloud-storage only when a GCS handoff object is fetched. from google.cloud import storage - blob = storage.Client().bucket(bucket).blob(key) ext = os.path.splitext(key)[1] or ".audio" fd, local_path = tempfile.mkstemp(prefix="earnings-stt-handoff-", suffix=ext) os.close(fd) try: _LOG.info("stt: downloading handoff audio gs://%s/%s -> %s", bucket, key, local_path) - blob.download_to_filename(local_path) - yield local_path - except BaseException: - # Transcription (or download) FAILED — KEEP the source handoff object so a - # job/service retry can re-fetch it. Only the local temp is cleaned below. - raise - else: - # Transcription SUCCEEDED — the raw handoff audio is transient and must NOT - # accumulate in the private bucket (D-27.9 firewall). 
Delete the SOURCE - # object; a delete failure is logged loudly (the bucket lifecycle policy is - # the backstop) but does not fail the already-successful transcription. - try: - blob.delete() - _LOG.info( - "stt: deleted transient handoff object gs://%s/%s (post-transcription)", bucket, key - ) - except Exception: - _LOG.exception( - "stt: FAILED to delete transient handoff object gs://%s/%s — raw audio " - "must not linger (D-27.9); ensure the AUDIO_HANDOFF_BUCKET lifecycle " - "policy reaps it", - bucket, - key, - ) + storage.Client().bucket(bucket).blob(key).download_to_filename(local_path) + yield local_path, (bucket, key) finally: - # The local temp is transient — delete it once transcription is done. + # The local temp is transient — delete it once transcription is done + # (regardless of success/failure). The SOURCE object is NOT touched here. with contextlib.suppress(FileNotFoundError): os.remove(local_path) +def _delete_handoff_source(bucket: str, key: str) -> None: + """Delete the transient SOURCE handoff object after the transcript is durable. + + Called by :func:`transcribe_call` ONLY after ``TranscriptLedger.append`` + succeeds, so raw earnings audio does not accumulate in the private bucket + (D-27.9) yet a failed call still has retryable audio. A delete failure loses no + data (the ledger has the transcript) and is logged loudly (the bucket lifecycle + policy is the backstop) rather than failing the already-successful call. + """ + from google.cloud import storage + + try: + storage.Client().bucket(bucket).blob(key).delete() + _LOG.info("stt: deleted transient handoff object gs://%s/%s (post-ledger)", bucket, key) + except Exception: + _LOG.exception( + "stt: FAILED to delete transient handoff object gs://%s/%s — raw audio must " + "not linger (D-27.9); ensure the AUDIO_HANDOFF_BUCKET lifecycle policy reaps it", + bucket, + key, + ) + + #: Default STT tier — the hosted / our-infra source-of-truth model (D-27.5). 
_DEFAULT_TIER = "large-v3" #: Default device/compute for the L4 GPU image. @@ -251,7 +257,10 @@ def transcribe_call( language, duration) — NEVER audio. Raises on a transcription / ledger failure (fail loud). No audio reaches the ledger or wire. """ - with _resolve_audio_reference(audio_path, handoff_bucket=handoff_bucket) as local_audio: + with _resolve_audio_reference(audio_path, handoff_bucket=handoff_bucket) as ( + local_audio, + source, + ): result = _transcribe_local( local_audio, ticker=ticker, @@ -283,6 +292,12 @@ def transcribe_call( topic=streaming_topic, ) + # The transcript is now DURABLY written (and any live publish done), so it is + # safe to delete the transient source handoff object. Deleting it BEFORE the + # ledger write would strand a failed call with no retryable audio (Codex R4). + if source is not None: + _delete_handoff_source(*source) + return { "ticker": ticker, "call_id": call_id, diff --git a/services/earnings/tests/test_healthz_and_lifespan.py b/services/earnings/tests/test_healthz_and_lifespan.py index 6d535b7..9ae8d49 100644 --- a/services/earnings/tests/test_healthz_and_lifespan.py +++ b/services/earnings/tests/test_healthz_and_lifespan.py @@ -16,10 +16,15 @@ import asyncio import threading +import pytest from fastapi.testclient import TestClient from mostlyright.weather.earnings.streaming_transcriber import Segment -from services.earnings.app import _RegistryBusAdapter, create_app +from services.earnings.app import ( + _parse_streaming_subscription, + _RegistryBusAdapter, + create_app, +) from services.earnings.deps import BusRegistry _KEY = "test-key-abc" @@ -91,9 +96,36 @@ def test_registry_bus_adapter_close_routes() -> None: # --------------------------------------------------------------------------- # Lifespan — starts the subscriber when the subscription env is set # --------------------------------------------------------------------------- +# --------------------------------------------------------------------------- +# 
Subscription resource-path parsing (the deployed cross-project case) +# --------------------------------------------------------------------------- +def test_parse_full_subscription_resource_path(monkeypatch) -> None: + # The infra sets the FULL cross-project resource path; the INGEST project must + # be parsed FROM it (GOOGLE_CLOUD_PROJECT on the serving instance is wrong). + monkeypatch.delenv("EARNINGS_INGEST_PROJECT", raising=False) + monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "mr-serving") # the WRONG project + project, sub_id = _parse_streaming_subscription( + "projects/mr-earnings-ingest/subscriptions/earnings-streaming-serving" + ) + assert project == "mr-earnings-ingest" # parsed from the path, not GOOGLE_CLOUD_PROJECT + assert sub_id == "earnings-streaming-serving" + + +def test_parse_bare_subscription_requires_project(monkeypatch) -> None: + monkeypatch.delenv("EARNINGS_INGEST_PROJECT", raising=False) + monkeypatch.delenv("GOOGLE_CLOUD_PROJECT", raising=False) + with pytest.raises(RuntimeError, match="bare subscription id"): + _parse_streaming_subscription("earnings-streaming-serving") + + def test_lifespan_starts_subscriber_when_env_set(monkeypatch) -> None: - monkeypatch.setenv("EARNINGS_STREAMING_SUBSCRIPTION", "earnings-streaming-serving") - monkeypatch.setenv("EARNINGS_INGEST_PROJECT", "mr-earnings-ingest") + # The DEPLOYED form: a full cross-project subscription resource path, no + # EARNINGS_INGEST_PROJECT — the ingest project is parsed from the path. 
+ monkeypatch.setenv( + "EARNINGS_STREAMING_SUBSCRIPTION", + "projects/mr-earnings-ingest/subscriptions/earnings-streaming-serving", + ) + monkeypatch.delenv("EARNINGS_INGEST_PROJECT", raising=False) consumed = threading.Event() diff --git a/services/earnings/tests/test_stt_handoff_delete.py b/services/earnings/tests/test_stt_handoff_delete.py new file mode 100644 index 0000000..2a5b840 --- /dev/null +++ b/services/earnings/tests/test_stt_handoff_delete.py @@ -0,0 +1,104 @@ +"""Phase 28 (28-11) — the STT handoff-object delete ordering (Codex R4 P1). + +The transient source audio object in AUDIO_HANDOFF_BUCKET must be deleted ONLY +AFTER the transcript is durably written to the ledger — never before. Deleting it +at transcription time would strand a call whose ledger write then fails with no +retryable audio. These tests fake google-cloud-storage + the transcriber + the +ledger to assert the ordering and the skip-on-failure behavior. +""" + +from __future__ import annotations + +import sys +import types + +import pytest + + +class _FakeSegments: + def __init__(self) -> None: + self.segments = [{"text": "hello", "start": 0.0}] + self.language = "en" + self.duration = 1.0 + + +def _install_fakes(monkeypatch, events: list[str], *, ledger_raises: bool) -> dict: + """Wire fake storage + transcriber + ledger that record an ordered event log.""" + state: dict = {"deleted": False} + + class _FakeBlob: + def __init__(self, name: str) -> None: + self._name = name + + def download_to_filename(self, local_path: str) -> None: + events.append("download") + with open(local_path, "wb") as fh: + fh.write(b"fake-audio") + + def delete(self) -> None: + events.append("delete") + state["deleted"] = True + + class _FakeBucket: + def blob(self, name: str) -> _FakeBlob: + return _FakeBlob(name) + + class _FakeClient: + def bucket(self, name: str) -> _FakeBucket: + return _FakeBucket() + + monkeypatch.setitem( + sys.modules, "google.cloud.storage", types.SimpleNamespace(Client=lambda: 
_FakeClient()) + ) + + import mostlyright.weather.earnings.stt as engine_stt + + class _FakeTranscriber: + def __init__(self, *a, **k) -> None: ... + def transcribe(self, path, *, initial_prompt=None): + events.append("transcribe") + return _FakeSegments() + + monkeypatch.setattr(engine_stt, "SttTranscriber", _FakeTranscriber) + + import mostlyright.weather.earnings.ledger as engine_ledger + + class _FakeLedger: + def append(self, rows, *, ticker, call_id): + events.append("ledger") + if ledger_raises: + raise RuntimeError("ledger write failed") + return len(rows) + + monkeypatch.setattr(engine_ledger, "TranscriptLedger", _FakeLedger) + return state + + +def test_handoff_deleted_after_ledger_write(monkeypatch) -> None: + from services.earnings.jobs.stt import transcribe_call + + events: list[str] = [] + state = _install_fakes(monkeypatch, events, ledger_raises=False) + + out = transcribe_call( + "gs://handoff-bkt/handoff/CHWY/evt-1.wav", ticker="CHWY", call_id="evt-1", device="cpu" + ) + assert out["segments"] == 1 + # The source object was deleted, and ONLY after the ledger write. + assert state["deleted"] is True + assert events == ["download", "transcribe", "ledger", "delete"] + + +def test_handoff_kept_when_ledger_write_fails(monkeypatch) -> None: + from services.earnings.jobs.stt import transcribe_call + + events: list[str] = [] + state = _install_fakes(monkeypatch, events, ledger_raises=True) + + with pytest.raises(RuntimeError, match="ledger write failed"): + transcribe_call( + "gs://handoff-bkt/handoff/CHWY/evt-2.wav", ticker="CHWY", call_id="evt-2", device="cpu" + ) + # The ledger write failed, so the source audio is KEPT for a retry (NOT deleted). 
+ assert state["deleted"] is False + assert "delete" not in events From 784deb891a1d65f06703f53bd8dcf9d32a4b77d7 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 20:22:52 +0200 Subject: [PATCH 09/17] =?UTF-8?q?fix(28):=20review=20round=205=20=E2=80=94?= =?UTF-8?q?=20Batch=20secrets=20project=20+=20STT=20publisher=20grant?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round-5 (gpt-5.5) P1/P2: - P1: run-weather-backfill Batch secretVariables pointed at the satellite project, but the r2-*/eumetsat-* secrets live in the backend secrets project (var.secrets_project) — the submitted job would 404 on the secrets. Reference the backend project (AR_PROJECT) for the secret resource paths; drop the now-unused project-number var. - P2: the live SSE path publishes from the STT SA (28-GCE-ARCHITECTURE §3), but pubsub.tf granted pubsub.publisher only to the rolefact SA — live publish would 403. Grant the STT runtime SA publisher on the earnings-streaming topic (deploy_iam.tf). --- .github/workflows/run-weather-backfill.yml | 14 ++++++-------- infra/batch.tf | 2 +- infra/deploy_iam.tf | 17 +++++++++++++++++ infra/outputs.tf | 2 +- 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/.github/workflows/run-weather-backfill.yml b/.github/workflows/run-weather-backfill.yml index 6c44fe2..61cc025 100644 --- a/.github/workflows/run-weather-backfill.yml +++ b/.github/workflows/run-weather-backfill.yml @@ -24,7 +24,6 @@ name: Run weather backfill fleet (28-21) # DEPLOY_SA_SATELLITE = deploy@mostlyright-satellite... 
# AR_HOST = europe-west3-docker.pkg.dev # SATELLITE_PROJECT_ID = mostlyright-satellite -# SATELLITE_PROJECT_NUMBER = # RUNTIME_SA_WEATHER_BACKFILL= # R2_BUCKET = mostlyright-derived # PROGRESS_BUCKET = mostlyright-backfill-progress- @@ -101,7 +100,6 @@ jobs: IMAGE_TAG: ${{ inputs.image_tag }} MODE: ${{ inputs.mode }} PILOT_STATION: ${{ inputs.pilot_station }} - NUM: ${{ vars.SATELLITE_PROJECT_NUMBER }} R2_BUCKET: ${{ vars.R2_BUCKET }} PROGRESS_BUCKET: ${{ vars.PROGRESS_BUCKET }} RUNTIME_SA: ${{ vars.RUNTIME_SA_WEATHER_BACKFILL }} @@ -132,7 +130,7 @@ jobs: --arg img "${IMAGE}" \ --argjson tc "${TASK_COUNT}" \ --argjson cmds "${COMMANDS}" \ - --arg num "${NUM}" \ + --arg secrets_proj "${AR_PROJECT}" \ --arg rb "${R2_BUCKET}" \ --arg pb "${PROGRESS_BUCKET}" \ --arg sa "${RUNTIME_SA}" \ @@ -148,11 +146,11 @@ jobs: environment: { variables: { R2_BUCKET: $rb, R2_REGION: "auto", PROGRESS_BUCKET: $pb }, secretVariables: { - R2_ACCOUNT_ID: ("projects/" + $num + "/secrets/r2-account-id/versions/latest"), - R2_WRITE_ACCESS_KEY_ID: ("projects/" + $num + "/secrets/r2-write-access-key-id/versions/latest"), - R2_WRITE_SECRET_ACCESS_KEY: ("projects/" + $num + "/secrets/r2-write-secret-access-key/versions/latest"), - EUMETSAT_CONSUMER_KEY: ("projects/" + $num + "/secrets/eumetsat-consumer-key/versions/latest"), - EUMETSAT_CONSUMER_SECRET: ("projects/" + $num + "/secrets/eumetsat-consumer-secret/versions/latest") + R2_ACCOUNT_ID: ("projects/" + $secrets_proj + "/secrets/r2-account-id/versions/latest"), + R2_WRITE_ACCESS_KEY_ID: ("projects/" + $secrets_proj + "/secrets/r2-write-access-key-id/versions/latest"), + R2_WRITE_SECRET_ACCESS_KEY: ("projects/" + $secrets_proj + "/secrets/r2-write-secret-access-key/versions/latest"), + EUMETSAT_CONSUMER_KEY: ("projects/" + $secrets_proj + "/secrets/eumetsat-consumer-key/versions/latest"), + EUMETSAT_CONSUMER_SECRET: ("projects/" + $secrets_proj + "/secrets/eumetsat-consumer-secret/versions/latest") } } } diff --git a/infra/batch.tf 
b/infra/batch.tf index 614785a..8b5c911 100644 --- a/infra/batch.tf +++ b/infra/batch.tf @@ -86,7 +86,7 @@ resource "google_batch_job" "weather_backfill" { # per satellite-resolvable station (D-28.8). run-weather-backfill.yml (the # actual submitter) uses the SAME 65; keep them in lockstep with the roster. task_count = 65 - parallelism = 16 # bounded concurrent Spot slices + parallelism = 16 # bounded concurrent Spot slices task_spec { # Bounded maxRunDuration caps a runaway slice (T-28.21-02). diff --git a/infra/deploy_iam.tf b/infra/deploy_iam.tf index 61ef6f6..6af8b17 100644 --- a/infra/deploy_iam.tf +++ b/infra/deploy_iam.tf @@ -146,3 +146,20 @@ resource "google_artifact_registry_repository_iam_member" "writer_satellite" { role = "roles/artifactregistry.writer" member = "serviceAccount:${google_service_account.deploy_satellite.email}" } + +# ===================================================================== +# 5. STT runtime SA → pubsub.publisher on earnings-streaming (live SSE path) +# ===================================================================== +# The LIVE path publishes transcript segments to the earnings-streaming topic +# straight from the STT service as they are transcribed (28-GCE-ARCHITECTURE §3: +# "live: STT publishes segments -> in-process bus -> SSE /stream"), gated by the +# opt-in EARNINGS_STREAMING_ENABLED / publish_live. pubsub.tf grants +# roles/pubsub.publisher to the ROLE/FACT SA (the post-call fact publisher) but +# NOT the STT SA, so the live publish would 403. Grant the STT runtime SA +# publisher on the SAME topic (least privilege: publisher, one topic). 
+resource "google_pubsub_topic_iam_member" "stt_earnings_streaming_publisher" { + project = google_project.ingest.project_id + topic = google_pubsub_topic.earnings_streaming.name + role = "roles/pubsub.publisher" + member = local.sa_earnings_stt +} diff --git a/infra/outputs.tf b/infra/outputs.tf index b696766..d69b304 100644 --- a/infra/outputs.tf +++ b/infra/outputs.tf @@ -90,7 +90,7 @@ output "budget_notification_channels" { # project_ids / deploy_service_accounts / wif_provider_name above. output "satellite_project_number" { - description = "mostlyright-satellite project number (H1) — used by run-weather-backfill.yml to build the Batch secret resource paths. Set as SATELLITE_PROJECT_NUMBER." + description = "mostlyright-satellite project number (H1) — general-purpose output (the Batch secrets now resolve via the backend secrets project, not this number)." value = var.satellite_project_number } From fad955f21bcd2fe7b8889c7c623ab78e02251e88 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 20:41:23 +0200 Subject: [PATCH 10/17] =?UTF-8?q?fix(28):=20review=20round=206=20=E2=80=94?= =?UTF-8?q?=20capture->STT=20triggering,=20lease=20extension,=20s2s=20auth?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round-6 (gpt-5.5) 2 P1 + 1 P2 in the earnings live-ingest path: - P1: capture now TRIGGERS STT after the handoff upload (POST the gs:// ref to the private STT service /transcribe) and acks the capture message ONLY after a 2xx — no more captured-but-never-transcribed orphans; unset STT_SERVICE_URL fails loud. Adds STT_SERVICE_URL env (cloud_run.tf) + capture SA run.invoker on STT + a metadata-server ID token (audience=STT_SERVICE_URL) for the private-service call. - P1: capture holds the Pub/Sub lease across 60-90min captures (background modify_ack_deadline loop) so a long call is not redelivered/duplicated. - P2: STT image adds google-cloud-pubsub (the live-publish path imports pubsub_v1). 
Note: the capture->STT->rolefact live orchestration is now wired end-to-end but its live validation stays operator-gated (28-10/11/13 Task 3/4), per the plans. --- deploy/earnings/stt.Dockerfile | 1 + infra/cloud_run.tf | 18 +- infra/deploy_iam.tf | 17 ++ services/earnings/jobs/capture.py | 241 +++++++++++++++--- .../earnings/tests/test_jobs_entrypoints.py | 206 ++++++++++++++- 5 files changed, 438 insertions(+), 45 deletions(-) diff --git a/deploy/earnings/stt.Dockerfile b/deploy/earnings/stt.Dockerfile index 901056f..4912dcc 100644 --- a/deploy/earnings/stt.Dockerfile +++ b/deploy/earnings/stt.Dockerfile @@ -66,6 +66,7 @@ RUN python -m pip install --break-system-packages \ ./packages/core \ "./packages/weather[earnings]" \ "google-cloud-storage>=2.10,<4" \ + "google-cloud-pubsub>=2.18,<3" \ "fastapi>=0.115,<1" \ "uvicorn[standard]>=0.30" diff --git a/infra/cloud_run.tf b/infra/cloud_run.tf index b6220f0..c8ac4a0 100644 --- a/infra/cloud_run.tf +++ b/infra/cloud_run.tf @@ -29,11 +29,11 @@ locals { ar_image_base = var.artifact_registry image = { - capture = "${local.ar_image_base}/${var.image_earnings_capture}:${var.image_tag}" - stt = "${local.ar_image_base}/${var.image_earnings_stt}:${var.image_tag}" - rolefact = "${local.ar_image_base}/${var.image_earnings_rolefact}:${var.image_tag}" - serving = "${local.ar_image_base}/${var.image_earnings_serving}:${var.image_tag}" - wx_serving = "${local.ar_image_base}/${var.image_weather_serving}:${var.image_tag}" + capture = "${local.ar_image_base}/${var.image_earnings_capture}:${var.image_tag}" + stt = "${local.ar_image_base}/${var.image_earnings_stt}:${var.image_tag}" + rolefact = "${local.ar_image_base}/${var.image_earnings_rolefact}:${var.image_tag}" + serving = "${local.ar_image_base}/${var.image_earnings_serving}:${var.image_tag}" + wx_serving = "${local.ar_image_base}/${var.image_weather_serving}:${var.image_tag}" } # R2 endpoint host: https://.r2.cloudflarestorage.com. 
The account @@ -358,6 +358,14 @@ resource "google_cloud_run_v2_job" "capture" { name = "CAPTURE_JOBS_SUBSCRIPTION" value = google_pubsub_subscription.capture_jobs.id } + # Capture triggers STT after the handoff upload by POSTing the gs:// ref to + # the STT service's /transcribe. STT is a PRIVATE Cloud Run service, so the + # capture SA holds run.invoker on it (deploy_iam.tf) and the POST carries a + # metadata-server ID token minted for this URL as its audience. + env { + name = "STT_SERVICE_URL" + value = google_cloud_run_v2_service.stt.uri + } } } } diff --git a/infra/deploy_iam.tf b/infra/deploy_iam.tf index 6af8b17..523e1cb 100644 --- a/infra/deploy_iam.tf +++ b/infra/deploy_iam.tf @@ -163,3 +163,20 @@ resource "google_pubsub_topic_iam_member" "stt_earnings_streaming_publisher" { role = "roles/pubsub.publisher" member = local.sa_earnings_stt } + +# ===================================================================== +# 6. Capture SA → run.invoker on the PRIVATE STT service (capture->STT trigger) +# ===================================================================== +# After the handoff upload, the capture Job POSTs the gs:// audio ref to the STT +# service's /transcribe to schedule transcription (capture->STT end-to-end). STT +# is deliberately NOT public (audio-side, no allUsers invoker), so the capture +# runtime SA needs run.invoker on it — and the capture POST carries a +# metadata-server ID token (audience = the STT URL). Least privilege: invoker on +# the ONE service. 
+resource "google_cloud_run_v2_service_iam_member" "capture_invokes_stt" { + project = google_cloud_run_v2_service.stt.project + location = google_cloud_run_v2_service.stt.location + name = google_cloud_run_v2_service.stt.name + role = "roles/run.invoker" + member = local.sa_earnings_capture +} diff --git a/services/earnings/jobs/capture.py b/services/earnings/jobs/capture.py index 4353889..49968bb 100644 --- a/services/earnings/jobs/capture.py +++ b/services/earnings/jobs/capture.py @@ -5,14 +5,24 @@ 1. pulls ONE capture-job spec message off the ``CAPTURE_JOBS_SUBSCRIPTION`` Pub/Sub subscription (the per-call ticker / call_id / webcast_url), then - 2. invokes the SHIPPED capture surface + 2. starts a background lease-extension loop (the capture can run 60-90 min, far + past the subscription ack deadline; without extension Pub/Sub would redeliver + the job mid-capture → duplicate captures + DLQ exhaustion), then + 3. invokes the SHIPPED capture surface (:class:`mostlyright.weather.earnings.capture.q4.Q4CaptureAdapter`) to - cold-fetch the webcast media into an EPHEMERAL dir, then - 3. UPLOADS the transient audio to the private, in-firewall GCS handoff bucket + cold-fetch the webcast media into an EPHEMERAL dir, then stops the lease loop, + then + 4. UPLOADS the transient audio to the private, in-firewall GCS handoff bucket ``AUDIO_HANDOFF_BUCKET`` (``earnings-audio-handoff-``) so the SEPARATE STT Cloud Run service — which does NOT share this job's ephemeral filesystem — can fetch it, then - 4. acks the message. + 5. TRIGGERS STT by POSTing the gs:// handoff URI to ``STT_SERVICE_URL/transcribe`` + (the shipped :mod:`services.earnings.jobs.stt_server` surface accepts a gs:// + ref), then + 6. acks the message ONLY after STT was successfully triggered (2xx). On a trigger + failure the message is NOT acked and Pub/Sub redelivers — re-upload to the + same handoff key is idempotent, so no audio is orphaned (captured but never + transcribed). 
**Why the GCS handoff (not a shared local path).** capture and STT are separate Cloud Run resources with NO shared disk. The audio therefore crosses the two @@ -21,6 +31,12 @@ (inside the firewall) reads it, transcribes, and deletes it. The audio bytes never reach the ledger, the wire, or R2 (D-27.9). +**Trigger seam (hardening).** The synchronous HTTP trigger to STT is the MVP that +closes capture→STT end-to-end. A decoupled trigger — an ``stt-jobs`` Pub/Sub topic +capture publishes to, or a GCS object-finalize notification on the handoff bucket +fanning out to STT — is a hardening seam that removes capture's dependency on STT +being reachable at that instant. Not wired here (MVP first). + **Audio firewall (D-27.9, legal).** The captured audio is a TRANSIENT artifact. This job asserts ``is_transient`` and that the local path stays under the capture dir before the handoff. The handoff target is the private GCS bucket the infra @@ -33,22 +49,30 @@ ``webcast_url``/``media_url``). One message is pulled + acked per run. * ``AUDIO_HANDOFF_BUCKET`` (required) — the private GCS bucket the transient audio is uploaded to for the cross-service handoff to STT. +* ``STT_SERVICE_URL`` (required in the deployed subscription path) — the + base URL of the STT Cloud Run service; capture POSTs ``STT_SERVICE_URL/transcribe`` + to trigger transcription. UNSET in the deployed path is a FAIL-LOUD deploy + misconfiguration (raises, non-zero exit, message NOT acked) rather than silently + orphaning the uploaded audio. * ``CAPTURE_OUT_DIR`` (optional) — the ephemeral dir the transient audio is written under (default: a fresh ``tempfile`` dir, still ephemeral). **Operator-override / manual single-call path.** For a manual single-call run (no subscription), the per-call spec may instead be supplied directly via env — ``CAPTURE_TICKER`` / ``CAPTURE_CALL_ID`` / ``CAPTURE_WEBCAST_URL`` — which takes -precedence over the subscription pull. 
This is the operator-gated override; the -DEFAULT deployed path reads ``CAPTURE_JOBS_SUBSCRIPTION`` + ``AUDIO_HANDOFF_BUCKET``. -The handoff upload is skipped ONLY when ``AUDIO_HANDOFF_BUCKET`` is unset (a bare -local operator run), and then the transient path is emitted on stdout instead. +precedence over the subscription pull. This is the operator-gated override (no +lease loop, no ack, and the STT trigger is skipped when no handoff bucket is set); +the DEFAULT deployed path reads ``CAPTURE_JOBS_SUBSCRIPTION`` + +``AUDIO_HANDOFF_BUCKET`` + ``STT_SERVICE_URL``. The handoff upload is skipped ONLY +when ``AUDIO_HANDOFF_BUCKET`` is unset (a bare local operator run), and then the +transient path is emitted on stdout instead. **Lazy imports.** ffmpeg/PyAV (``av``) and httpx are pulled in only by the shipped -capture surface (lazy inside its own methods). ``google-cloud-pubsub`` and +capture surface (lazy inside its own methods). ``google-cloud-pubsub`` / ``google-cloud-storage`` are lazy-imported inside :func:`_pull_capture_job` / -:func:`_upload_handoff` — so this module and its ``main`` import nothing heavy at -module load (the tests stub the capture surface + fake pubsub/GCS). +:func:`_upload_handoff`, and ``httpx`` inside :func:`_trigger_stt` — so this module +and its ``main`` import nothing heavy at module load (the tests stub the capture +surface + fake pubsub / GCS / httpx). **Live IVS capture seam (27-09, OPERATOR-GATED).** The live-during-call path (:meth:`CaptureAdapter.live` → the Amazon-IVS HLS edge for a real in-progress @@ -61,15 +85,24 @@ from __future__ import annotations +import contextlib import json import logging import os import tempfile +import threading from services.earnings.jobs._env import optional_env, require_env _LOG = logging.getLogger("services.earnings.jobs.capture") +#: The Pub/Sub ack-deadline (seconds) the lease loop resets the message to on each +#: extension. 
Pub/Sub caps modify_ack_deadline at 600s; a 60-90 min capture needs +#: many extensions, so we re-lease at 600 and refresh well before it lapses. +_LEASE_DEADLINE_SECONDS = 600 +#: Refresh at 70% of the deadline so a slow extension still lands before expiry. +_LEASE_REFRESH_FRACTION = 0.7 + def _assert_audio_local(audio_path: str, out_dir: str) -> None: """Fail loud if the captured audio is not a local file under ``out_dir`` (D-27.9). @@ -88,14 +121,83 @@ def _assert_audio_local(audio_path: str, out_dir: str) -> None: ) -def _pull_capture_job(subscription: str) -> tuple[dict[str, str], object]: - """Pull ONE capture-job spec message off the subscription; return (spec, ack). +class _MessageHandle: + """A pulled Pub/Sub message: ack + lease-extension over a long capture. + + Wraps the ``SubscriberClient`` + ``(subscription, ack_id)`` so ``main`` can + (a) hold the lease during the whole capture via a background daemon thread and + (b) ack ONLY after the full capture→handoff→STT-trigger succeeds. The + operator-override path uses :class:`_NoopHandle` instead (no message to lease). + """ + + def __init__(self, client: object, subscription: str, ack_id: str) -> None: + self._client = client + self._subscription = subscription + self._ack_id = ack_id + + def ack(self) -> None: + self._client.acknowledge(subscription=self._subscription, ack_ids=[self._ack_id]) + + def _extend_once(self) -> None: + self._client.modify_ack_deadline( + subscription=self._subscription, + ack_ids=[self._ack_id], + ack_deadline_seconds=_LEASE_DEADLINE_SECONDS, + ) + + @contextlib.contextmanager + def hold_lease(self): + """Context manager: keep the message leased for its whole body. + + A daemon thread re-leases the message to ``_LEASE_DEADLINE_SECONDS`` every + ``deadline * _LEASE_REFRESH_FRACTION`` seconds so a 60-90 min capture never + lets the ack deadline lapse (which would trigger redelivery + a duplicate + capture). The thread is stopped BEFORE the caller acks/nacks. 
The first + extension is issued immediately so a slow capture is protected from second 0. + """ + stop = threading.Event() + interval = _LEASE_DEADLINE_SECONDS * _LEASE_REFRESH_FRACTION + + def _loop() -> None: + # Extend immediately, then every ``interval`` until stopped. Any + # transient extension error is logged but must not kill the capture — + # a lapse only risks a duplicate (idempotent) capture, never data loss. + while not stop.is_set(): + try: + self._extend_once() + except Exception: # pragma: no cover - defensive; transient RPC error + _LOG.warning("lease extension failed (will retry next tick)", exc_info=True) + stop.wait(interval) + + thread = threading.Thread(target=_loop, name="capture-lease-extender", daemon=True) + thread.start() + try: + yield + finally: + stop.set() + thread.join(timeout=5.0) + + +class _NoopHandle: + """The operator-override message handle: no lease, no ack (no Pub/Sub message).""" + + def ack(self) -> None: + return None + + @contextlib.contextmanager + def hold_lease(self): + yield + + +def _pull_capture_job(subscription: str) -> tuple[dict[str, str], _MessageHandle]: + """Pull ONE capture-job spec message off the subscription; return (spec, handle). Returns the decoded per-call spec (``ticker`` / ``call_id`` / - ``webcast_url``/``media_url``) plus a zero-arg ``ack`` callable the caller - invokes AFTER a successful capture+handoff (so a crash mid-capture leaves the - message un-acked and the job is retried). ``google-cloud-pubsub`` is - lazy-imported here (never at module load). + ``webcast_url``/``media_url``) plus a :class:`_MessageHandle` the caller uses to + hold the lease during the capture and ack AFTER the full pipeline (capture → + handoff → STT trigger) succeeds (so a crash mid-capture leaves the message + un-acked and the job is retried). ``google-cloud-pubsub`` is lazy-imported here + (never at module load). 
Raises: RuntimeError: no message is available on the subscription, or the message @@ -127,10 +229,8 @@ def _pull_capture_job(subscription: str) -> tuple[dict[str, str], object]: f"{missing}; cannot capture a settlement-adjacent call from a partial spec." ) - def _ack() -> None: - client.acknowledge(subscription=subscription, ack_ids=[msg.ack_id]) - - return {"ticker": ticker, "call_id": call_id, "webcast_url": webcast_url}, _ack + handle = _MessageHandle(client, subscription, msg.ack_id) + return {"ticker": ticker, "call_id": call_id, "webcast_url": webcast_url}, handle def _upload_handoff(audio_path: str, bucket: str, *, ticker: str, call_id: str) -> str: @@ -153,12 +253,73 @@ def _upload_handoff(audio_path: str, bucket: str, *, ticker: str, call_id: str) return uri -def _resolve_spec() -> tuple[dict[str, str], object]: - """Resolve the per-call capture spec + an ack callable. +def _trigger_stt(handoff_uri: str, *, ticker: str, call_id: str) -> None: + """POST the gs:// handoff URI to the STT service to trigger transcription. + + The synchronous HTTP trigger that closes capture→STT: the shipped + :mod:`services.earnings.jobs.stt_server` ``POST /transcribe`` accepts a gs:// + reference, downloads it from the handoff bucket, transcribes, and writes the + audio-free transcript ledger. ``httpx`` is lazy-imported here. + + The caller acks the Pub/Sub message ONLY after this returns (2xx). A non-2xx or + connection error raises, so ``main`` leaves the message un-acked → Pub/Sub + redelivers → capture re-runs (re-upload to the same handoff key is idempotent), + never orphaning the audio. + + Raises: + RuntimeError: ``STT_SERVICE_URL`` is unset (deploy misconfiguration — fail + loud rather than upload audio nobody will transcribe). 
+ """ + stt_url = optional_env("STT_SERVICE_URL") + if not stt_url: + raise RuntimeError( + "STT_SERVICE_URL is unset in the deployed subscription path — capture " + "uploaded the audio handoff but cannot trigger STT, which would ORPHAN " + "the audio (captured, never transcribed). Set STT_SERVICE_URL to the STT " + "Cloud Run service base URL (fail loud rather than silently orphan)." + ) - Operator-override precedence: if ``CAPTURE_TICKER`` is set, read the whole - spec from env (the manual single-call path) with a no-op ack. Otherwise pull - ONE message off ``CAPTURE_JOBS_SUBSCRIPTION`` (the DEFAULT deployed path). + import httpx + + # Service-to-service auth contract: the STT Cloud Run service is PRIVATE (no + # public invoker), so a bare POST would 403. The capture SA holds + # roles/run.invoker on STT (infra/deploy_iam.tf), and Cloud Run authenticates + # by verifying a Google-signed ID token whose AUDIENCE is the receiving + # service's URL. So mint an ID token with audience = STT_SERVICE_URL and send + # it as `Authorization: Bearer `. + try: + import google.auth.transport.requests + from google.oauth2 import id_token as _id_token + + token = _id_token.fetch_id_token(google.auth.transport.requests.Request(), stt_url) + headers = {"Authorization": f"Bearer {token}"} + except Exception: + # No metadata server (local/test) or token mint failed — POST without a + # token. On Cloud Run the metadata server is always present; a mint failure + # there is a real misconfig that STT will reject 401/403, so we still POST + # (don't silently succeed) and let the non-2xx raise + NACK for redelivery. 
+ headers = {} + + endpoint = stt_url.rstrip("/") + "/transcribe" + payload = {"audio_path": handoff_uri, "ticker": ticker, "call_id": call_id} + resp = httpx.post(endpoint, json=payload, headers=headers, timeout=60.0) + resp.raise_for_status() + _LOG.info( + "capture triggered STT: POST %s -> %s (%s/%s)", + endpoint, + resp.status_code, + ticker, + call_id, + ) + + +def _resolve_spec() -> tuple[dict[str, str], _MessageHandle | _NoopHandle]: + """Resolve the per-call capture spec + a message handle. + + Operator-override precedence: if ``CAPTURE_TICKER`` is set, read the whole spec + from env (the manual single-call path) with a :class:`_NoopHandle` (no lease, + no ack). Otherwise pull ONE message off ``CAPTURE_JOBS_SUBSCRIPTION`` (the + DEFAULT deployed path) and return its lease-capable :class:`_MessageHandle`. """ if optional_env("CAPTURE_TICKER"): spec = { @@ -166,12 +327,8 @@ def _resolve_spec() -> tuple[dict[str, str], object]: "call_id": require_env("CAPTURE_CALL_ID"), "webcast_url": require_env("CAPTURE_WEBCAST_URL"), } - - def _noop_ack() -> None: - return None - _LOG.info("capture: using operator-override env spec (manual single-call path)") - return spec, _noop_ack + return spec, _NoopHandle() subscription = require_env("CAPTURE_JOBS_SUBSCRIPTION") return _pull_capture_job(subscription) @@ -190,7 +347,7 @@ def main(argv: list[str] | None = None) -> int: """ logging.basicConfig(level=logging.INFO) - spec, ack = _resolve_spec() + spec, handle = _resolve_spec() ticker = spec["ticker"] call_id = spec["call_id"] webcast_url = spec["webcast_url"] @@ -215,9 +372,11 @@ def main(argv: list[str] | None = None) -> int: adapter = Q4CaptureAdapter() event = {"ticker": ticker, "call_id": call_id, "media_url": webcast_url} - # The AudioArtifact's audio is a transient local file; we assert the firewall - # invariants, hand it off to STT via the private GCS bucket, then ack. 
- artifact = adapter.capture(event, tmp_dir=out_dir) + # Hold the Pub/Sub lease for the WHOLE capture (60-90 min > ack deadline): a + # daemon thread re-leases the message so it is never redelivered mid-capture. + # The lease is released the instant capture returns, BEFORE the handoff/trigger. + with handle.hold_lease(): + artifact = adapter.capture(event, tmp_dir=out_dir) if not artifact.is_transient: raise RuntimeError( @@ -228,10 +387,13 @@ def main(argv: list[str] | None = None) -> int: if handoff_bucket: # DEFAULT deployed path: hand the transient audio to STT via the private - # in-firewall GCS bucket, THEN ack (a crash before the ack retries the job). + # in-firewall GCS bucket, TRIGGER STT, and ONLY THEN ack. If the STT trigger + # fails we do NOT ack — Pub/Sub redelivers and capture re-runs (re-upload to + # the same handoff key is idempotent), so the audio is never orphaned. handoff_uri = _upload_handoff( artifact.audio_path, handoff_bucket, ticker=ticker, call_id=call_id ) + _trigger_stt(handoff_uri, ticker=ticker, call_id=call_id) _LOG.info( "capture job done: ticker=%s call_id=%s handoff=%s source=%s", artifact.ticker, @@ -242,9 +404,12 @@ def main(argv: list[str] | None = None) -> int: # The gs:// handoff URI is the STT reference — the transient local file # dies with the ephemeral task; only STT (in-firewall) reads the object. print(handoff_uri) + # Ack ONLY after the STT trigger succeeded (2xx) — closes capture→STT. + handle.ack() else: # Bare local operator run (no handoff bucket): emit the transient path for - # a co-located STT run off the same disk. Audio is still never served. + # a co-located STT run off the same disk. Audio is still never served. No + # STT trigger + no ack (the _NoopHandle ack is a no-op anyway). 
_LOG.info( "capture job done (local, no handoff bucket): ticker=%s call_id=%s transient_audio=%s", artifact.ticker, @@ -252,8 +417,8 @@ def main(argv: list[str] | None = None) -> int: artifact.audio_path, ) print(artifact.audio_path) + handle.ack() - ack() return 0 diff --git a/services/earnings/tests/test_jobs_entrypoints.py b/services/earnings/tests/test_jobs_entrypoints.py index 7020bec..c6ade8d 100644 --- a/services/earnings/tests/test_jobs_entrypoints.py +++ b/services/earnings/tests/test_jobs_entrypoints.py @@ -180,12 +180,34 @@ def pull(self, *, subscription, max_messages, return_immediately): } return _FakePullResponse(json.dumps(spec).encode("utf-8")) + def modify_ack_deadline(self, *, subscription, ack_ids, ack_deadline_seconds): + acked.setdefault("lease_extensions", []).append(ack_deadline_seconds) + def acknowledge(self, *, subscription, ack_ids): + # Record the ORDER: ack must come AFTER the STT trigger. acked["ack_subscription"] = subscription acked["ack_ids"] = list(ack_ids) + acked["ack_after_trigger"] = "stt_endpoint" in triggered fake_pubsub = types.SimpleNamespace(SubscriberClient=lambda: _FakeSubscriber()) + # --- fake httpx: capture the STT trigger POST (2xx) --------------------------- + triggered: dict[str, object] = {} + + class _FakeResponse: + status_code = 200 + + def raise_for_status(self) -> None: + return None + + def _fake_post(url, *, json, headers, timeout): + triggered["stt_endpoint"] = url + triggered["stt_payload"] = json + return _FakeResponse() + + fake_httpx = types.SimpleNamespace(post=_fake_post) + monkeypatch.setitem(sys.modules, "httpx", fake_httpx) + # --- fake google-cloud-storage: capture the uploaded handoff object ---------- uploaded: dict[str, object] = {} @@ -234,10 +256,11 @@ def capture(self, event, *, tmp_dir=None, **_): import mostlyright.weather.earnings.capture.q4 as q4mod monkeypatch.setattr(q4mod, "Q4CaptureAdapter", _FakeAdapter) - # DEPLOYED env: no CAPTURE_TICKER override; subscription + handoff bucket 
set. + # DEPLOYED env: no CAPTURE_TICKER override; subscription + handoff + STT URL set. monkeypatch.delenv("CAPTURE_TICKER", raising=False) monkeypatch.setenv("CAPTURE_JOBS_SUBSCRIPTION", "projects/p/subscriptions/capture-jobs") monkeypatch.setenv("AUDIO_HANDOFF_BUCKET", "earnings-audio-handoff-123") + monkeypatch.setenv("STT_SERVICE_URL", "https://earnings-stt-abc.a.run.app") monkeypatch.setenv("CAPTURE_OUT_DIR", str(out_dir)) assert capture_job.main() == 0 @@ -248,8 +271,16 @@ def capture(self, event, *, tmp_dir=None, **_): assert uploaded["bucket"] == "earnings-audio-handoff-123" assert uploaded["blob"] == "handoff/CHWY/evt-1.wav" assert uploaded["local"] == audio_path - # The message was acked AFTER the successful capture + handoff. + # STT was triggered with the gs:// handoff URI (closes capture→STT). + assert triggered["stt_endpoint"] == "https://earnings-stt-abc.a.run.app/transcribe" + assert triggered["stt_payload"] == { + "audio_path": "gs://earnings-audio-handoff-123/handoff/CHWY/evt-1.wav", + "ticker": "CHWY", + "call_id": "evt-1", + } + # The message was acked ONLY AFTER the STT trigger succeeded. assert acked["ack_ids"] == ["ack-123"] + assert acked["ack_after_trigger"] is True def test_capture_main_missing_message_fails_loud(monkeypatch: pytest.MonkeyPatch) -> None: @@ -270,6 +301,177 @@ def pull(self, *, subscription, max_messages, return_immediately): capture_job.main() +def _install_deployed_capture_fakes(monkeypatch, out_dir, *, stt_post): + """Wire pubsub + storage + httpx fakes for a deployed-path capture run. + + ``stt_post`` is the fake ``httpx.post`` callable (a test controls whether it + 2xx's or raises). Returns a ``record`` dict the fakes write into (lease + extensions, ack ids, uploaded blob) so the test can assert ordering. 
+ """ + record: dict[str, object] = {"lease_extensions": [], "acked": False} + audio_path = str(out_dir / "audio.wav") + (out_dir / "audio.wav").write_bytes(b"fake-audio") + + class _FakeSubscriber: + def pull(self, *, subscription, max_messages, return_immediately): + spec = { + "ticker": "CHWY", + "call_id": "evt-1", + "webcast_url": "https://static.events.q4inc.com/x/y.mp4", + } + data = json.dumps(spec).encode("utf-8") + msg = types.SimpleNamespace(ack_id="ack-xyz", message=types.SimpleNamespace(data=data)) + return types.SimpleNamespace(received_messages=[msg]) + + def modify_ack_deadline(self, *, subscription, ack_ids, ack_deadline_seconds): + record["lease_extensions"].append(ack_deadline_seconds) # type: ignore[union-attr] + + def acknowledge(self, *, subscription, ack_ids): + record["acked"] = True + record["ack_ids"] = list(ack_ids) + + class _FakeBlob: + def __init__(self, name): + self._name = name + + def upload_from_filename(self, filename): + record["uploaded_blob"] = self._name + + class _FakeStorageClient: + def bucket(self, name): + return types.SimpleNamespace(blob=lambda n: _FakeBlob(n)) + + class _FakeArtifactLocal: + def __init__(self) -> None: + self.audio_path = audio_path + self.ticker = "CHWY" + self.call_id = "evt-1" + self.source_media_url = "https://static.events.q4inc.com/x/y.mp4" + self.is_transient = True + + class _FakeAdapter: + def capture(self, event, *, tmp_dir=None, **_): + return _FakeArtifactLocal() + + import mostlyright.weather.earnings.capture.q4 as q4mod + + monkeypatch.setattr(q4mod, "Q4CaptureAdapter", _FakeAdapter) + monkeypatch.setitem( + sys.modules, + "google.cloud.pubsub_v1", + types.SimpleNamespace(SubscriberClient=lambda: _FakeSubscriber()), + ) + monkeypatch.setitem( + sys.modules, + "google.cloud.storage", + types.SimpleNamespace(Client=lambda: _FakeStorageClient()), + ) + monkeypatch.setitem(sys.modules, "httpx", types.SimpleNamespace(post=stt_post)) + monkeypatch.delenv("CAPTURE_TICKER", raising=False) + 
+ monkeypatch.setenv("CAPTURE_JOBS_SUBSCRIPTION", "projects/p/subscriptions/capture-jobs") + monkeypatch.setenv("AUDIO_HANDOFF_BUCKET", "earnings-audio-handoff-123") + monkeypatch.setenv("CAPTURE_OUT_DIR", str(out_dir)) + return record + + +def test_capture_main_extends_lease_for_long_capture( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """A long capture keeps the Pub/Sub lease alive (modify_ack_deadline is called).""" + + class _OkResponse: + status_code = 200 + + def raise_for_status(self): + return None + + def _ok_post(url, *, json, headers, timeout): + return _OkResponse() + + out_dir = tmp_path / "cap" + out_dir.mkdir() + record = _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_ok_post) + + # Force the lease loop to tick during capture: a small (non-zero) refresh + # interval + a capture that blocks long enough for at least one extension. + # The re-lease deadline stays 600 (Pub/Sub's cap) so the assertion is realistic. + monkeypatch.setattr(capture_job, "_LEASE_DEADLINE_SECONDS", 600) + monkeypatch.setattr(capture_job, "_LEASE_REFRESH_FRACTION", 0.05 / 600) + + import time as _time + + import mostlyright.weather.earnings.capture.q4 as q4mod + + class _SlowArtifact: + audio_path = str(out_dir / "audio.wav") + ticker = "CHWY" + call_id = "evt-1" + source_media_url = "https://static.events.q4inc.com/x/y.mp4" + is_transient = True + + class _SlowAdapter: + def capture(self, event, *, tmp_dir=None, **_): + # Block long enough that the (small, non-zero interval) lease loop fires >=1 extension. + _time.sleep(0.15) + return _SlowArtifact() + + monkeypatch.setattr(q4mod, "Q4CaptureAdapter", _SlowAdapter) + monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app") + + assert capture_job.main() == 0 + # The lease was extended at least once during the long capture, at 600s each. 
+ assert len(record["lease_extensions"]) >= 1 # type: ignore[arg-type] + assert all(d == 600 for d in record["lease_extensions"]) # type: ignore[union-attr] + assert record["acked"] is True + + +def test_capture_main_fails_loud_when_stt_service_url_unset( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """STT_SERVICE_URL unset in the deployed path fails loud; message NOT acked.""" + + def _unused_post(url, *, json, headers, timeout): # pragma: no cover - never called + raise AssertionError("STT must not be POSTed when STT_SERVICE_URL is unset") + + out_dir = tmp_path / "cap" + out_dir.mkdir() + record = _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_unused_post) + monkeypatch.delenv("STT_SERVICE_URL", raising=False) + + with pytest.raises(RuntimeError, match="STT_SERVICE_URL is unset"): + capture_job.main() + # The audio was uploaded, but the message was NOT acked (redeliver → retry). + assert record["acked"] is False + assert record.get("uploaded_blob") == "handoff/CHWY/evt-1.wav" + + +def test_capture_main_does_not_ack_when_stt_trigger_fails( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """A non-2xx STT trigger raises and the message is NOT acked (idempotent retry).""" + + class _HttpError(Exception): + pass + + class _ErrResponse: + status_code = 503 + + def raise_for_status(self): + raise _HttpError("503 from STT") + + def _err_post(url, *, json, headers, timeout): + return _ErrResponse() + + out_dir = tmp_path / "cap" + out_dir.mkdir() + record = _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_err_post) + monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app") + + with pytest.raises(_HttpError): + capture_job.main() + assert record["acked"] is False + + def test_capture_main_rejects_non_local_audio(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: out_dir = tmp_path / "cap" out_dir.mkdir() From 808f7cb458099c93af0edfa3c4fc4168d1c74e68 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Fri, 3 Jul 2026 
23:44:31 +0200 Subject: [PATCH 11/17] =?UTF-8?q?fix(28):=20review=20round=207=20=E2=80=94?= =?UTF-8?q?=20transcript=20R2=20durability=20+=20capture->STT=20trigger=20?= =?UTF-8?q?timeout/lease?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two P1s from the final Codex gate on the deployed split-container topology: 1. Transcript durability across containers. STT, role/fact, and serving run as SEPARATE Cloud Run resources with isolated ephemeral disks, so STT's local TranscriptLedger write was invisible to the role/fact Job (it would fail "no persisted transcript"). STT now publishes the audio-free transcript parquet to the R2 data plane (the architecture's durable text store); role/fact rehydrates it from R2 on a local-cache miss. Opt-in on R2_BUCKET (a co-located operator run with a shared disk is unchanged). Adds the STT R2-write IAM/env + boto3 in the STT image; only TEXT crosses to R2, never audio (D-27.9). New _r2_sink.download. 2. capture->STT trigger timeout + lease. The synchronous /transcribe transcribes the whole call before responding, but capture gave the POST only 60s — every real call timed out mid-transcription -> NACK -> duplicate recapture. The timeout now covers a full transcription (STT_TRIGGER_TIMEOUT_SECONDS, default 3600s = the Cloud Run max the STT service is now pinned to), and the Pub/Sub lease is held ACROSS the trigger (it was released before it). Serving reading transcripts/facts from R2 (Codex also noted serving) remains a separate services-layer follow-up: the earnings serving read path uses the local ledger and would need an R2 list+get wrapper across its routes — out of scope for this deploy-runtime PR and tracked separately. Tests: STT R2 publish (+no-op when unset), role/fact R2 download-on-miss (+miss still fails loud), capture long/overridable timeout + lease-held-during-trigger. 
--- deploy/earnings/stt.Dockerfile | 8 +- infra/cloud_run.tf | 55 +++- infra/service_accounts.tf | 21 +- .../mostlyright/weather/satellite/_r2_sink.py | 26 +- services/earnings/jobs/capture.py | 81 ++++-- services/earnings/jobs/rolefact.py | 59 ++++- services/earnings/jobs/stt.py | 59 +++++ services/earnings/jobs/stt_server.py | 5 + .../earnings/tests/test_jobs_entrypoints.py | 247 ++++++++++++++++++ 9 files changed, 528 insertions(+), 33 deletions(-) diff --git a/deploy/earnings/stt.Dockerfile b/deploy/earnings/stt.Dockerfile index 4912dcc..477f6cf 100644 --- a/deploy/earnings/stt.Dockerfile +++ b/deploy/earnings/stt.Dockerfile @@ -60,13 +60,17 @@ COPY packages/weather/ packages/weather/ # SERVICE (google_cloud_run_v2_service.stt), so the container MUST serve $PORT. # google-cloud-storage downloads the transient audio HANDOFF object from the # private AUDIO_HANDOFF_BUCKET (capture + STT are separate Cloud Run resources -# with NO shared disk) — lazy-imported inside stt._resolve_audio_reference. NO -# torch anywhere (D-27.5): the STT engine is CTranslate2/faster-whisper only. +# with NO shared disk) — lazy-imported inside stt._resolve_audio_reference. boto3 +# backs the R2 write sink (mostlyright.weather.satellite._r2_sink, lazy-imported): +# STT publishes the durable TEXT transcript parquet to R2 so the SEPARATE role/fact +# Job can read it across containers (Codex R7 P1) — never audio (D-27.9). NO torch +# anywhere (D-27.5): the STT engine is CTranslate2/faster-whisper only. 
RUN python -m pip install --break-system-packages \ ./packages/core \ "./packages/weather[earnings]" \ "google-cloud-storage>=2.10,<4" \ "google-cloud-pubsub>=2.18,<3" \ + "boto3>=1.34,<2.0" \ "fastapi>=0.115,<1" \ "uvicorn[standard]>=0.30" diff --git a/infra/cloud_run.tf b/infra/cloud_run.tf index c8ac4a0..a3308f5 100644 --- a/infra/cloud_run.tf +++ b/infra/cloud_run.tf @@ -292,6 +292,14 @@ resource "google_cloud_run_v2_service" "stt" { # One request per instance: GPU transcription is not multiplexed. max_instance_request_concurrency = 1 + # /transcribe is SYNCHRONOUS — it transcribes the whole call before responding, + # and the capture Job holds its Pub/Sub lease waiting on that response. The + # default 300s request timeout would cut a real (multi-minute → up-to-~an-hour) + # transcription short (→ 5xx → capture NACK → duplicate recapture, Codex R7 P1), + # so pin it to the Cloud Run maximum. capture's STT_TRIGGER_TIMEOUT_SECONDS + # default (3600s, services/earnings/jobs/capture.py) matches this ceiling. + timeout = "3600s" + node_selector { accelerator = var.stt_gpu_type } @@ -311,10 +319,55 @@ resource "google_cloud_run_v2_service" "stt" { name = "AUDIO_HANDOFF_BUCKET" value = "earnings-audio-handoff-${google_project.ingest.number}" } + + # R2 (text data plane) so STT publishes the durable transcript parquet the + # SEPARATE role/fact Job reads across containers (Codex R7 P1). This is the + # TEXT transcript, never audio — the audio firewall (audio never gets an R2 + # key) is unchanged; STT is on the ingest/write side (write token, D-28.9). 
+ env { + name = "R2_BUCKET" + value = var.r2_bucket + } + env { + name = "R2_REGION" + value = local.r2_region + } + env { + name = "R2_WRITE_ACCESS_KEY_ID" + value_source { + secret_key_ref { + secret = data.google_secret_manager_secret.r2_write_access_key_id.id + version = "latest" + } + } + } + env { + name = "R2_WRITE_SECRET_ACCESS_KEY" + value_source { + secret_key_ref { + secret = data.google_secret_manager_secret.r2_write_secret_access_key.id + version = "latest" + } + } + } + env { + name = "R2_ACCOUNT_ID" + value_source { + secret_key_ref { + secret = data.google_secret_manager_secret.r2_account_id.id + version = "latest" + } + } + } } } - depends_on = [google_project_service.enabled] + # The STT SA needs its R2-write + account secret bindings (secrets.tf) before the + # revision can mount them. + depends_on = [ + google_project_service.enabled, + google_secret_manager_secret_iam_member.access, + ] } # ===================================================================== diff --git a/infra/service_accounts.tf b/infra/service_accounts.tf index 8e46b9b..d0047a9 100644 --- a/infra/service_accounts.tf +++ b/infra/service_accounts.tf @@ -72,17 +72,20 @@ resource "google_service_account" "weather_incremental" { locals { # Convenience member strings for the firewall bindings downstream. 
- sa_earnings_capture = "serviceAccount:${google_service_account.earnings_capture.email}" - sa_earnings_stt = "serviceAccount:${google_service_account.earnings_stt.email}" - sa_earnings_rolefact = "serviceAccount:${google_service_account.earnings_rolefact.email}" - sa_serving = "serviceAccount:${google_service_account.serving.email}" - sa_weather_backfill = "serviceAccount:${google_service_account.weather_backfill.email}" - sa_weather_incremental = "serviceAccount:${google_service_account.weather_incremental.email}" + sa_earnings_capture = "serviceAccount:${google_service_account.earnings_capture.email}" + sa_earnings_stt = "serviceAccount:${google_service_account.earnings_stt.email}" + sa_earnings_rolefact = "serviceAccount:${google_service_account.earnings_rolefact.email}" + sa_serving = "serviceAccount:${google_service_account.serving.email}" + sa_weather_backfill = "serviceAccount:${google_service_account.weather_backfill.email}" + sa_weather_incremental = "serviceAccount:${google_service_account.weather_incremental.email}" - # The SHARED R2 write token members (v1 honest posture): ingest role/fact + - # BOTH satellite weather SAs. R2 tokens are bucket-scoped, not prefix-scoped — - # there is NO per-zone write isolation in v1 (Task 4 v1.x hardening splits it). + # The SHARED R2 write token members (v1 honest posture): ingest STT (publishes + # the durable transcript parquet so the SEPARATE role/fact Job can read it across + # containers — text only, never audio, D-27.9) + ingest role/fact + BOTH + # satellite weather SAs. R2 tokens are bucket-scoped, not prefix-scoped — there is + # NO per-zone write isolation in v1 (Task 4 v1.x hardening splits it). 
r2_write_members = [ + local.sa_earnings_stt, local.sa_earnings_rolefact, local.sa_weather_backfill, local.sa_weather_incremental, diff --git a/packages/weather/src/mostlyright/weather/satellite/_r2_sink.py b/packages/weather/src/mostlyright/weather/satellite/_r2_sink.py index 137d5f2..31ed55f 100644 --- a/packages/weather/src/mostlyright/weather/satellite/_r2_sink.py +++ b/packages/weather/src/mostlyright/weather/satellite/_r2_sink.py @@ -110,4 +110,28 @@ def upload(local_path: Path | str, bucket: str, key: str, *, r2_target: str | No return key -__all__ = ["upload"] +def download(bucket: str, key: str, local_path: Path | str, *, r2_target: str | None = None) -> str: + """Download one R2 object (``s3.download_file``) to ``local_path``; return the path. + + The read complement of :func:`upload`, used by the hosted ingest chain to + rehydrate a durable partition into a fresh Cloud Run container's local cache + (e.g. the role/fact Job pulling the STT-written transcript parquet — capture, + STT, and role/fact run in SEPARATE Cloud Run resources with isolated ephemeral + disks, so the transcript crosses between them only via the R2 data plane). Uses + the SAME write-token client as :func:`upload` (an R2 read-and-write token + grants ``GetObject``); the ingest SAs already hold that token. + + ``r2_target`` is accepted for signature symmetry with :func:`upload`; the + effective bucket is the explicit ``bucket`` argument. The parent directory of + ``local_path`` must already exist (the ledger path resolver creates it). + + Raises: + Whatever ``botocore`` raises on a missing key / transport error — the + caller decides whether a miss is fatal (fail loud) or a soft fallback. 
+ """ + client = _get_r2_client() + client.download_file(bucket, key, str(local_path)) + return str(local_path) + + +__all__ = ["download", "upload"] diff --git a/services/earnings/jobs/capture.py b/services/earnings/jobs/capture.py index 49968bb..b4be387 100644 --- a/services/earnings/jobs/capture.py +++ b/services/earnings/jobs/capture.py @@ -103,6 +103,37 @@ #: Refresh at 70% of the deadline so a slow extension still lands before expiry. _LEASE_REFRESH_FRACTION = 0.7 +#: HTTP timeout (seconds) for the SYNCHRONOUS capture->STT ``/transcribe`` trigger. +#: ``/transcribe`` transcribes the WHOLE call before it responds, so the trigger +#: must wait out a real (multi-minute → up-to-~an-hour) GPU transcription — NOT a +#: 60s guess. A 60s cap made every real earnings call time out mid-transcription → +#: capture raised → the message was NOT acked → Pub/Sub redelivered → duplicate +#: recapture while STT was still running (Codex R7 P1). Default 3600s = the Cloud +#: Run max request timeout the STT service is pinned to (``infra/cloud_run.tf``); +#: override with ``STT_TRIGGER_TIMEOUT_SECONDS``. A call whose transcription exceeds +#: this needs the decoupled (fire-and-forget) trigger seam documented above. +_DEFAULT_STT_TRIGGER_TIMEOUT_SECONDS = 3600.0 + + +def _stt_trigger_timeout() -> float: + """Resolve the capture->STT trigger HTTP timeout (seconds). + + Defaults to :data:`_DEFAULT_STT_TRIGGER_TIMEOUT_SECONDS` (cover a full real + transcription); ``STT_TRIGGER_TIMEOUT_SECONDS`` overrides it. A non-numeric or + non-positive value fails loud (a bad timeout that silently fell back to a short + default would reintroduce the mid-transcription-timeout bug). 
+ """ + raw = optional_env("STT_TRIGGER_TIMEOUT_SECONDS") + if not raw: + return _DEFAULT_STT_TRIGGER_TIMEOUT_SECONDS + try: + value = float(raw) + except ValueError as exc: + raise ValueError(f"STT_TRIGGER_TIMEOUT_SECONDS={raw!r} is not a number") from exc + if value <= 0: + raise ValueError(f"STT_TRIGGER_TIMEOUT_SECONDS must be > 0; got {value}") + return value + def _assert_audio_local(audio_path: str, out_dir: str) -> None: """Fail loud if the captured audio is not a local file under ``out_dir`` (D-27.9). @@ -302,7 +333,10 @@ def _trigger_stt(handoff_uri: str, *, ticker: str, call_id: str) -> None: endpoint = stt_url.rstrip("/") + "/transcribe" payload = {"audio_path": handoff_uri, "ticker": ticker, "call_id": call_id} - resp = httpx.post(endpoint, json=payload, headers=headers, timeout=60.0) + # /transcribe blocks until the full call is transcribed + the transcript ledger + # is written, so the timeout must cover the whole transcription (default 3600s), + # NOT 60s — the caller holds the Pub/Sub lease across this wait (see main()). + resp = httpx.post(endpoint, json=payload, headers=headers, timeout=_stt_trigger_timeout()) resp.raise_for_status() _LOG.info( "capture triggered STT: POST %s -> %s (%s/%s)", @@ -372,28 +406,39 @@ def main(argv: list[str] | None = None) -> int: adapter = Q4CaptureAdapter() event = {"ticker": ticker, "call_id": call_id, "media_url": webcast_url} - # Hold the Pub/Sub lease for the WHOLE capture (60-90 min > ack deadline): a - # daemon thread re-leases the message so it is never redelivered mid-capture. - # The lease is released the instant capture returns, BEFORE the handoff/trigger. + # Hold the Pub/Sub lease for the WHOLE pipeline — the capture (60-90 min) AND + # the SYNCHRONOUS STT handoff+trigger, which blocks until STT transcribes the + # full call (multi-minute → up-to-~an-hour). A daemon thread re-leases the + # message throughout, so it is never redelivered mid-capture OR + # mid-transcription. 
(Codex R7 P1: the trigger previously ran AFTER the lease + # released, so a multi-minute /transcribe let the ack deadline lapse → + # redelivery → duplicate capture.) The message is acked ONLY after the whole + # pipeline succeeds, BELOW the leased block. + handoff_uri: str | None = None with handle.hold_lease(): artifact = adapter.capture(event, tmp_dir=out_dir) - if not artifact.is_transient: - raise RuntimeError( - f"capture returned a NON-transient artifact for {ticker}/{call_id} — " - "captured earnings audio must always be transient (D-27.9)." - ) - _assert_audio_local(artifact.audio_path, out_dir) + if not artifact.is_transient: + raise RuntimeError( + f"capture returned a NON-transient artifact for {ticker}/{call_id} — " + "captured earnings audio must always be transient (D-27.9)." + ) + _assert_audio_local(artifact.audio_path, out_dir) + + if handoff_bucket: + # DEFAULT deployed path: hand the transient audio to STT via the private + # in-firewall GCS bucket and TRIGGER STT (a synchronous /transcribe that + # blocks until the transcript ledger is written). Still under the lease, + # so a long transcription never lets the message redeliver. If the + # trigger fails we do NOT ack — Pub/Sub redelivers and capture re-runs + # (re-upload to the same handoff key is idempotent), never orphaning the + # audio. + handoff_uri = _upload_handoff( + artifact.audio_path, handoff_bucket, ticker=ticker, call_id=call_id + ) + _trigger_stt(handoff_uri, ticker=ticker, call_id=call_id) if handoff_bucket: - # DEFAULT deployed path: hand the transient audio to STT via the private - # in-firewall GCS bucket, TRIGGER STT, and ONLY THEN ack. If the STT trigger - # fails we do NOT ack — Pub/Sub redelivers and capture re-runs (re-upload to - # the same handoff key is idempotent), so the audio is never orphaned. 
- handoff_uri = _upload_handoff( - artifact.audio_path, handoff_bucket, ticker=ticker, call_id=call_id - ) - _trigger_stt(handoff_uri, ticker=ticker, call_id=call_id) _LOG.info( "capture job done: ticker=%s call_id=%s handoff=%s source=%s", artifact.ticker, diff --git a/services/earnings/jobs/rolefact.py b/services/earnings/jobs/rolefact.py index a0593ef..14134b3 100644 --- a/services/earnings/jobs/rolefact.py +++ b/services/earnings/jobs/rolefact.py @@ -116,11 +116,17 @@ def main(argv: list[str] | None = None) -> int: "rolefact job start: ticker=%s call_id=%s terms=%d", ticker, call_id, len(market_terms) ) - transcript_rows = TranscriptLedger().read(ticker, call_id) + # STT and rolefact run in SEPARATE Cloud Run resources with isolated ephemeral + # disks, so STT's transcript is NOT on this container's local disk. Rehydrate it + # from the R2 data plane on a local miss (Codex R7 P1) before reading. + transcript_ledger = TranscriptLedger() + _maybe_download_transcript_r2(transcript_ledger, ticker=ticker, call_id=call_id) + transcript_rows = transcript_ledger.read(ticker, call_id) if not transcript_rows: raise RuntimeError( f"no persisted transcript for {ticker}/{call_id} — the STT job must run " - "before rolefact (fail loud rather than build zero facts)." + "(and publish its transcript to R2, or write it to this container's " + "cache) before rolefact (fail loud rather than build zero facts)." ) transcript = _transcript_text(transcript_rows) @@ -204,6 +210,55 @@ def _parse_roster(raw: str | None) -> list[tuple[str, str]]: return roster +def _maybe_download_transcript_r2(ledger: object, *, ticker: str, call_id: str) -> None: + """Rehydrate STT's durable transcript parquet from R2 on a local-cache miss. + + STT, role/fact, and serving run in SEPARATE Cloud Run resources with isolated + ephemeral disks, so the transcript STT wrote via its local + :class:`TranscriptLedger` is NOT on THIS container's disk (Codex R7 P1). 
If the + local partition is absent AND an R2 bucket is configured (``ROLEFACT_R2_BUCKET`` + override, else ``R2_BUCKET`` — the infra env), download + ``earnings/transcripts//.parquet`` (the key STT published under) + into the local ledger path so the read that follows sees it. Opt-in on the + bucket: a co-located / operator run with a shared disk already has it locally. + + A miss (no such object) is logged, NOT raised — the caller then fails loud on + the empty read with a clear "STT must run first" error, which is the correct + signal whether the transcript is missing locally or in R2. + """ + path = ledger.path(ticker, call_id) # type: ignore[attr-defined] + if path.exists(): + return + bucket = optional_env("ROLEFACT_R2_BUCKET") or optional_env("R2_BUCKET") + if not bucket: + return + key = f"earnings/transcripts/{ticker}/{call_id}.parquet" + + from mostlyright.weather.satellite._r2_sink import download + + path.parent.mkdir(parents=True, exist_ok=True) + try: + download(bucket, key, str(path), r2_target=bucket) + except Exception: + _LOG.warning( + "rolefact: no durable transcript in R2 (bucket=%s key=%s) for %s/%s — " + "will fail loud if this container's local ledger is also empty", + bucket, + key, + ticker, + call_id, + exc_info=True, + ) + return + _LOG.info( + "rolefact: rehydrated transcript parquet from R2 bucket=%s key=%s (%s/%s)", + bucket, + key, + ticker, + call_id, + ) + + def _maybe_upload_r2(fact_path: str, *, ticker: str, call_id: str) -> None: """Opt-in upload of the derived fact parquet to R2 via the shipped write sink. diff --git a/services/earnings/jobs/stt.py b/services/earnings/jobs/stt.py index 84de7ef..792b7f3 100644 --- a/services/earnings/jobs/stt.py +++ b/services/earnings/jobs/stt.py @@ -157,6 +157,57 @@ def _delete_handoff_source(bucket: str, key: str) -> None: ) +#: R2 key namespace for the durable transcript parquet the STT stage publishes so +#: the SEPARATE role/fact Cloud Run Job (isolated ephemeral disk) can read it. 
The +#: role/fact job downloads ``earnings/transcripts/{ticker}/{call_id}.parquet`` back +#: into its local ledger before role-attributing (see +#: :func:`services.earnings.jobs.rolefact._maybe_download_transcript_r2`). +_TRANSCRIPT_R2_KEY_FMT = "earnings/transcripts/{ticker}/{call_id}.parquet" + + +def _maybe_upload_transcript_r2( + ledger: object, *, ticker: str, call_id: str, r2_bucket: str | None +) -> None: + """Publish the durable transcript parquet to R2 for the cross-container handoff. + + STT, role/fact, and serving run in SEPARATE Cloud Run resources with isolated + ephemeral filesystems, so the local :class:`TranscriptLedger` write above is NOT + visible to the downstream role/fact Job — it would fail with "no persisted + transcript" (Codex R7 P1). When ``R2_BUCKET`` is configured (the deployed + ingest path) the transcript parquet is uploaded to the R2 data plane, the + architecture's durable text/fact store; role/fact then rehydrates it. Opt-in on + the bucket: a bare local / co-located operator run (no bucket) keeps the + prior local-only behavior. ONLY the audio-free transcript parquet crosses + (D-27.9); the write-token creds come from the env by NAME via the shipped sink. + + The ledger partition path is resolved ONLY when a bucket is set — a no-bucket + run never touches ``ledger.path`` (so a fake ledger without it stays supported). + """ + if not r2_bucket: + return + ledger_path = str(ledger.path(ticker, call_id)) # type: ignore[attr-defined] + if not os.path.exists(ledger_path): + # A legitimately empty transcript writes no parquet — nothing to publish. 
+ _LOG.info( + "stt: no transcript parquet at %s for %s/%s — skipping R2 publish", + ledger_path, + ticker, + call_id, + ) + return + from mostlyright.weather.satellite._r2_sink import upload + + key = _TRANSCRIPT_R2_KEY_FMT.format(ticker=ticker, call_id=call_id) + upload(ledger_path, r2_bucket, key, r2_target=r2_bucket) + _LOG.info( + "stt: published durable transcript parquet to R2 bucket=%s key=%s (%s/%s)", + r2_bucket, + key, + ticker, + call_id, + ) + + #: Default STT tier — the hosted / our-infra source-of-truth model (D-27.5). _DEFAULT_TIER = "large-v3" #: Default device/compute for the L4 GPU image. @@ -244,6 +295,7 @@ def transcribe_call( streaming_project: str | None = None, streaming_topic: str = "earnings-streaming", handoff_bucket: str | None = None, + r2_bucket: str | None = None, ) -> dict[str, object]: """Transcribe one call's transient audio → transcript ledger (+ optional publish). @@ -283,6 +335,12 @@ def transcribe_call( "stt wrote transcript ledger: ticker=%s call_id=%s rows_now=%d", ticker, call_id, total ) + # Publish the durable transcript to R2 so the SEPARATE role/fact Cloud Run Job + # (isolated ephemeral disk) can read it — the local ledger write above never + # crosses containers (Codex R7 P1). Opt-in on R2_BUCKET; no-op for a local / + # co-located operator run. 
+ _maybe_upload_transcript_r2(ledger, ticker=ticker, call_id=call_id, r2_bucket=r2_bucket) + if publish_live: _maybe_publish_live( result.segments, @@ -342,6 +400,7 @@ def main(argv: list[str] | None = None) -> int: streaming_topic=optional_env("EARNINGS_STREAMING_TOPIC", "earnings-streaming") or "earnings-streaming", handoff_bucket=optional_env("AUDIO_HANDOFF_BUCKET"), + r2_bucket=optional_env("R2_BUCKET"), ) return 0 diff --git a/services/earnings/jobs/stt_server.py b/services/earnings/jobs/stt_server.py index 349ff6f..a552979 100644 --- a/services/earnings/jobs/stt_server.py +++ b/services/earnings/jobs/stt_server.py @@ -75,6 +75,10 @@ def transcribe(payload: Annotated[dict, Body(...)]) -> dict[str, object]: # gs:// reference in the body is resolved + downloaded against it. A request # may also override it explicitly. handoff_bucket = payload.get("handoff_bucket") or os.environ.get("AUDIO_HANDOFF_BUCKET") + # R2_BUCKET (the deployed STT service sets it) enables the durable transcript + # publish so the SEPARATE role/fact Job can read the transcript across + # containers; a request may override it explicitly. 
+ r2_bucket = payload.get("r2_bucket") or os.environ.get("R2_BUCKET") try: return transcribe_call( @@ -89,6 +93,7 @@ def transcribe(payload: Annotated[dict, Body(...)]) -> dict[str, object]: streaming_project=payload.get("streaming_project"), streaming_topic=str(payload.get("streaming_topic") or "earnings-streaming"), handoff_bucket=handoff_bucket, + r2_bucket=r2_bucket, ) except FileNotFoundError as exc: raise HTTPException(status_code=400, detail=str(exc)) from exc diff --git a/services/earnings/tests/test_jobs_entrypoints.py b/services/earnings/tests/test_jobs_entrypoints.py index c6ade8d..976fd44 100644 --- a/services/earnings/tests/test_jobs_entrypoints.py +++ b/services/earnings/tests/test_jobs_entrypoints.py @@ -902,3 +902,250 @@ def _fake_upload(local_path, bucket, key, *, r2_target=None): assert rolefact_job.main() == 0 assert uploaded["bucket"] == "mostlyright-derived" assert uploaded["key"] == "earnings/facts/CHWY/evt-9.parquet" + + +# --------------------------------------------------------------------------- +# Round-7 P1 fixes: +# (1) capture->STT trigger must wait out a full transcription (not 60s) AND +# stay under the Pub/Sub lease. +# (2) the transcript must be durable across the SEPARATE STT / role-fact +# Cloud Run containers (via the R2 data plane), not just local disk. 
+# --------------------------------------------------------------------------- +class _OkPost: + status_code = 200 + + def raise_for_status(self): + return None + + +def test_capture_stt_trigger_uses_long_default_timeout( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """The synchronous STT trigger waits out a full transcription (default 3600s) — + the old 60s cap timed out mid-transcription → NACK → duplicate recapture.""" + seen: dict[str, object] = {} + + def _record_post(url, *, json, headers, timeout): + seen["timeout"] = timeout + return _OkPost() + + out_dir = tmp_path / "cap" + out_dir.mkdir() + _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_record_post) + monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app") + monkeypatch.delenv("STT_TRIGGER_TIMEOUT_SECONDS", raising=False) + + assert capture_job.main() == 0 + assert seen["timeout"] == 3600.0 + + +def test_capture_stt_trigger_timeout_env_override( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + seen: dict[str, object] = {} + + def _record_post(url, *, json, headers, timeout): + seen["timeout"] = timeout + return _OkPost() + + out_dir = tmp_path / "cap" + out_dir.mkdir() + _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_record_post) + monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app") + monkeypatch.setenv("STT_TRIGGER_TIMEOUT_SECONDS", "1200") + + assert capture_job.main() == 0 + assert seen["timeout"] == 1200.0 + + +def test_capture_stt_trigger_bad_timeout_fails_loud( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + def _unused_post(url, *, json, headers, timeout): # pragma: no cover - never reached + raise AssertionError("must not POST with an invalid timeout") + + out_dir = tmp_path / "cap" + out_dir.mkdir() + record = _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_unused_post) + monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app") + monkeypatch.setenv("STT_TRIGGER_TIMEOUT_SECONDS", "notanumber") + + 
with pytest.raises(ValueError, match="STT_TRIGGER_TIMEOUT_SECONDS"): + capture_job.main() + assert record["acked"] is False + + +def test_capture_holds_lease_during_stt_trigger(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + """The lease is extended WHILE the (slow) STT trigger runs. Capture is instant + here, so extensions beyond the immediate one prove the lease covers the + trigger — previously the trigger ran AFTER the lease released (Codex R7 P1).""" + import time as _time + + def _slow_post(url, *, json, headers, timeout): + # Block long enough for the tiny-interval lease loop to tick during the + # trigger (capture itself returns immediately in the helper's fake adapter). + _time.sleep(0.25) + return _OkPost() + + out_dir = tmp_path / "cap" + out_dir.mkdir() + record = _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_slow_post) + # interval = 600 * (0.01/600) = 0.01s — many ticks across the 0.25s trigger. + monkeypatch.setattr(capture_job, "_LEASE_REFRESH_FRACTION", 0.01 / 600) + monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app") + + assert capture_job.main() == 0 + # >=2: the immediate extension at lease-start PLUS at least one during the + # trigger (instant capture leaves no other window for repeated extensions). 
+ assert len(record["lease_extensions"]) >= 2 # type: ignore[arg-type] + assert record["acked"] is True + + +def test_stt_publishes_transcript_to_r2_when_bucket_set( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """STT publishes the durable TEXT transcript parquet to R2 (never audio) so the + SEPARATE role/fact Job can read it across containers (Codex R7 P1).""" + cache = tmp_path / "cache" + audio = tmp_path / "audio.wav" + audio.write_bytes(b"RIFF-fake-not-real-audio") + + uploaded: dict[str, object] = {} + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _fake_upload(local_path, bucket, key, *, r2_target=None): + uploaded["local_path"] = str(local_path) + uploaded["bucket"] = bucket + uploaded["key"] = key + return key + + monkeypatch.setattr(sink, "upload", _fake_upload) + + class _FakeTranscriber: + def __init__(self, *a, **k): + pass + + def transcribe(self, audio_path, *, initial_prompt=None): + return _FakeTranscriptResult() + + import mostlyright.weather.earnings.stt as sttmod + + monkeypatch.setattr(sttmod, "SttTranscriber", _FakeTranscriber) + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + monkeypatch.setenv("STT_AUDIO_PATH", str(audio)) + monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + monkeypatch.setenv("STT_TIER", "small") + monkeypatch.setenv("STT_DEVICE", "cpu") + monkeypatch.delenv("EARNINGS_STREAMING_ENABLED", raising=False) + monkeypatch.setenv("R2_BUCKET", "mostlyright-derived") + + assert stt_job.main() == 0 + assert uploaded["bucket"] == "mostlyright-derived" + assert uploaded["key"] == "earnings/transcripts/CHWY/evt-1.parquet" + assert uploaded["local_path"].endswith("evt-1.parquet") + # Only the TEXT transcript parquet crosses to R2 — never an audio path. 
+ assert "audio" not in uploaded["local_path"].lower() + + +def test_stt_no_r2_publish_when_bucket_unset(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + """No R2 bucket → the transcript stays local (byte-identical to pre-R7).""" + cache = tmp_path / "cache" + audio = tmp_path / "audio.wav" + audio.write_bytes(b"RIFF-fake-not-real-audio") + + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _boom_upload(*a, **k): # pragma: no cover - must not be called + raise AssertionError("no R2 publish when R2_BUCKET is unset") + + monkeypatch.setattr(sink, "upload", _boom_upload) + + class _FakeTranscriber: + def __init__(self, *a, **k): + pass + + def transcribe(self, audio_path, *, initial_prompt=None): + return _FakeTranscriptResult() + + import mostlyright.weather.earnings.stt as sttmod + + monkeypatch.setattr(sttmod, "SttTranscriber", _FakeTranscriber) + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + monkeypatch.setenv("STT_AUDIO_PATH", str(audio)) + monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + monkeypatch.setenv("STT_TIER", "small") + monkeypatch.setenv("STT_DEVICE", "cpu") + monkeypatch.delenv("EARNINGS_STREAMING_ENABLED", raising=False) + monkeypatch.delenv("R2_BUCKET", raising=False) + + assert stt_job.main() == 0 + + +def test_rolefact_downloads_transcript_from_r2_on_local_miss( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """rolefact rehydrates STT's transcript from R2 when THIS container's local + cache is empty (separate Cloud Run containers, Codex R7 P1).""" + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + # NB: NO local seed — the transcript lives only in R2 (a fresh container). 
+ + downloaded: dict[str, object] = {} + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _fake_download(bucket, key, local_path, *, r2_target=None): + downloaded["bucket"] = bucket + downloaded["key"] = key + # Simulate the object landing in this container's ledger cache. + _seed_transcript(cache, "CHWY", "evt-2", ["tariff here"]) + return str(local_path) + + def _fake_upload(local_path, bucket, key, *, r2_target=None): + downloaded.setdefault("fact_uploads", []).append(key) # type: ignore[union-attr] + return key + + monkeypatch.setattr(sink, "download", _fake_download) + monkeypatch.setattr(sink, "upload", _fake_upload) # R2_BUCKET also enables fact upload + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-2") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + monkeypatch.delenv("ROLEFACT_R2_BUCKET", raising=False) + monkeypatch.setenv("R2_BUCKET", "mostlyright-derived") + + assert rolefact_job.main() == 0 + assert downloaded["bucket"] == "mostlyright-derived" + assert downloaded["key"] == "earnings/transcripts/CHWY/evt-2.parquet" + + from mostlyright.weather.earnings.ledger import FactLedger + + facts = FactLedger().read("CHWY", "evt-2") + assert len(facts) == 1 + assert facts[0]["term_canonical"] == "tariff" + + +def test_rolefact_r2_download_miss_still_fails_loud( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """A missing R2 transcript is logged, not swallowed — rolefact still fails loud + on the resulting empty read (the correct 'STT must run first' signal).""" + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _raise_download(bucket, key, local_path, *, r2_target=None): + raise RuntimeError("NoSuchKey") + + monkeypatch.setattr(sink, "download", _raise_download) + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + 
monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-absent") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + monkeypatch.delenv("ROLEFACT_R2_BUCKET", raising=False) + monkeypatch.setenv("R2_BUCKET", "mostlyright-derived") + + with pytest.raises(RuntimeError, match="no persisted transcript"): + rolefact_job.main() From 1cbcc6671c7b73dcc85a00d399649c2991f911b5 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Sat, 4 Jul 2026 00:07:13 +0200 Subject: [PATCH 12/17] =?UTF-8?q?fix(28):=20review=20round=207-2=20?= =?UTF-8?q?=E2=80=94=20provision=20handoff=20bucket,=20idempotent=20ledger?= =?UTF-8?q?=20writes,=20capture=20timeout?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three P1s from the round-2 Codex gate, all making the deployed capture->STT->rolefact pipeline actually runnable and correct: 1. Provision the audio handoff bucket + IAM. The infra referenced earnings-audio-handoff-{project-number} by env but never CREATED it, and granted no storage access — the first real capture upload / STT download would 403 and the pipeline could never run end to end. Adds the private, in-firewall GCS bucket (co-located with STT, uniform access, public-access-prevention, 1-day orphan reaper) + capture objectAdmin (write/overwrite) and STT objectAdmin (get + post-ledger delete). Corrects the now-inaccurate "read-only" STT SA description. 2. Idempotent ledger writes. TranscriptLedger/FactLedger.append concatenates, so a retried STT/rolefact run (redelivery, R2-upload error, capture timeout) DOUBLED the rows and made role/fact double-count mentions — corrupting settlement data. Adds an atomic TranscriptLedger/FactLedger.replace (overwrite under the same FileLock, same audio-free + fail-closed-Kalshi guards); STT and rolefact now REPLACE their complete-per-call artifact instead of appending. 3. Capture job timeout.
Round-7 made capture wait synchronously for STT, but the capture Cloud Run Job timeout was still 5400s (capture only) — a real capture + transcription could exceed it and be killed before ack -> redelivery -> duplicate recapture. Bumped to 9000s to cover capture + the synchronous STT wait (the decoupled trigger remains the real fix). Tests: ledger replace idempotency (overwrite / shrink / empty-removes / fail-closed-still-runs); STT + rolefact re-run-not-doubled end to end. --- infra/cloud_run.tf | 62 ++++++++++++++++++- infra/service_accounts.tf | 2 +- .../mostlyright/weather/earnings/ledger.py | 33 ++++++++++ .../earnings/test_ledger_kalshi_validation.py | 37 +++++++++++ services/earnings/jobs/rolefact.py | 7 ++- services/earnings/jobs/stt.py | 6 +- .../earnings/tests/test_jobs_entrypoints.py | 60 ++++++++++++++++++ .../earnings/tests/test_stt_handoff_delete.py | 4 +- 8 files changed, 205 insertions(+), 6 deletions(-) diff --git a/infra/cloud_run.tf b/infra/cloud_run.tf index a3308f5..d152af2 100644 --- a/infra/cloud_run.tf +++ b/infra/cloud_run.tf @@ -370,6 +370,60 @@ resource "google_cloud_run_v2_service" "stt" { ] } +# ===================================================================== +# Private in-firewall AUDIO HANDOFF bucket (28-10/28-13) — capture -> STT +# ===================================================================== +# capture and STT are SEPARATE Cloud Run resources with NO shared disk, so the +# transient audio crosses between them via this PRIVATE, in-firewall GCS object +# (never an R2 key, never served — D-27.9). Both jobs reference it by NAME via the +# AUDIO_HANDOFF_BUCKET env; it MUST exist and both SAs must be able to reach it, or +# the first real capture upload / STT download fails with a missing-bucket / 403 +# and the pipeline can never run end-to-end (Codex R7-2 P1). Co-located with STT +# (var.stt_region) so the GPU download is in-region. 
+# +# Lifecycle: the audio is transient — STT deletes each object right after the +# transcript is durably written (stt._delete_handoff_source), and this 1-day reaper +# is the BACKSTOP the code comments cite for any object a failed run orphans. +resource "google_storage_bucket" "earnings_audio_handoff" { + project = google_project.ingest.project_id + name = "earnings-audio-handoff-${google_project.ingest.number}" + location = upper(var.stt_region) + uniform_bucket_level_access = true + force_destroy = true # transient audio only; safe to empty on destroy + public_access_prevention = "enforced" + + lifecycle_rule { + condition { + age = 1 # days — transient audio; STT deletes post-ledger, this reaps orphans + } + action { + type = "Delete" + } + } + + labels = { + phase = "28" + role = "earnings-audio-handoff" + } + + depends_on = [google_project_service.enabled] +} + +# capture WRITES the transient audio (create + overwrite on an idempotent retry). +resource "google_storage_bucket_iam_member" "capture_handoff_writer" { + bucket = google_storage_bucket.earnings_audio_handoff.name + role = "roles/storage.objectAdmin" + member = local.sa_earnings_capture +} + +# STT READS the handoff object and DELETES it after the transcript is durable +# (post-ledger cleanup, D-27.9) — so it needs get+delete, not read-only. +resource "google_storage_bucket_iam_member" "stt_handoff_admin" { + bucket = google_storage_bucket.earnings_audio_handoff.name + role = "roles/storage.objectAdmin" + member = local.sa_earnings_stt +} + # ===================================================================== # Capture Job — mr-earnings-ingest / europe-west3 (28-10) # ===================================================================== @@ -378,7 +432,11 @@ resource "google_cloud_run_v2_service" "stt" { # NEVER an R2 key. 
Egress is pinned to one static IP via the VPC connector → # Cloud NAT (28-10 earnings_network.tf) so the Amazon-IVS session pin holds; # the connector is referenced by name via env so this file stays decoupled from -# the network plan. Long task timeout covers a 90-min call. +# the network plan. The task timeout must cover the capture (60-90 min) PLUS the +# SYNCHRONOUS STT trigger wait (capture blocks on /transcribe until the transcript +# is written, up to STT's 3600s ceiling) — otherwise the job is killed before it +# acks and Pub/Sub redelivers → duplicate recapture (Codex R7-2 P1). The decoupled +# (fire-and-forget) trigger seam removes this coupling; until then, size for both. resource "google_cloud_run_v2_job" "capture" { project = google_project.ingest.project_id name = "earnings-capture" @@ -387,7 +445,7 @@ resource "google_cloud_run_v2_job" "capture" { template { template { service_account = google_service_account.earnings_capture.email - timeout = "5400s" # 90 min + timeout = "9000s" # 150 min = ~90 min capture + up to 60 min synchronous STT wait # Scratch disk sized for a 90-min call; audio dies here or in the handoff # bucket (never R2). diff --git a/infra/service_accounts.tf b/infra/service_accounts.tf index d0047a9..ae691e4 100644 --- a/infra/service_accounts.tf +++ b/infra/service_accounts.tf @@ -29,7 +29,7 @@ resource "google_service_account" "earnings_stt" { project = google_project.ingest.project_id account_id = "earnings-stt" display_name = "Earnings STT (Cloud Run GPU L4) runtime SA" - description = "Runs the STT GPU workload (28-11). Read-only on the audio handoff bucket; emits transcript segments; no serving grant." + description = "Runs the STT GPU workload (28-11). Reads + deletes (post-ledger cleanup) the audio handoff bucket; publishes the text transcript to R2; no serving grant." 
depends_on = [google_project_service.enabled] } diff --git a/packages/weather/src/mostlyright/weather/earnings/ledger.py b/packages/weather/src/mostlyright/weather/earnings/ledger.py index fb7a969..4a4cb7f 100644 --- a/packages/weather/src/mostlyright/weather/earnings/ledger.py +++ b/packages/weather/src/mostlyright/weather/earnings/ledger.py @@ -192,6 +192,39 @@ def append(self, rows: Sequence[Mapping[str, object]], *, ticker: str, call_id: os.replace(tmp, path) return len(merged) + def replace(self, rows: Sequence[Mapping[str, object]], *, ticker: str, call_id: str) -> int: + """Idempotently OVERWRITE the ``(ticker, call_id)`` partition with ``rows``. + + Unlike :meth:`append` (read-modify-write concatenate), this REPLACES the + partition under the same ``FileLock`` — so a producer that writes the + COMPLETE artifact for a call (the batch STT transcript, the role/fact facts) + is idempotent across retries / Pub/Sub redelivery: re-running yields the same + partition, never DOUBLED rows that would make downstream counting + double-count (Codex R7-2 P1). Same audio-free normalization + cross-field + write-guard as :meth:`append`. An empty ``rows`` removes the partition (an + idempotent zero-row write). Returns the row count written. + """ + path = self.path(ticker, call_id) + lock = FileLock(str(path) + ".lock", timeout=LOCK_TIMEOUT_SECONDS) + with lock: + if not rows: + # Idempotent empty write: drop any stale partition so a re-run that + # legitimately produces zero rows does not leave prior rows behind. 
+ if path.exists(): + os.remove(path) + return 0 + normalized = [self._strip_to_schema(r) for r in rows] + self._validate_normalized(normalized) + table = pa.Table.from_pylist( + [{name: r.get(name) for name in self._column_names} for r in normalized], + schema=self._pa_schema, + ) + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(".tmp") + pq.write_table(table, tmp, version="2.6", coerce_timestamps="us") + os.replace(tmp, path) + return len(normalized) + def read(self, ticker: str, call_id: str) -> list[dict[str, object]]: """Return all persisted rows for ``(ticker, call_id)`` (empty on miss).""" path = self.path(ticker, call_id) diff --git a/packages/weather/tests/earnings/test_ledger_kalshi_validation.py b/packages/weather/tests/earnings/test_ledger_kalshi_validation.py index c3a5e66..b967b1e 100644 --- a/packages/weather/tests/earnings/test_ledger_kalshi_validation.py +++ b/packages/weather/tests/earnings/test_ledger_kalshi_validation.py @@ -108,3 +108,40 @@ def test_each_canonical_compound_type_persists(self, tmp_path) -> None: ledger = FactLedger(root=tmp_path) rows = [_row(compound_type=ct) for ct in COMPOUND_TYPE_VALUES] assert ledger.append(rows, ticker="ORCL", call_id="C1") == len(COMPOUND_TYPE_VALUES) + + +class TestReplaceIsIdempotent: + """``replace`` OVERWRITES a partition so a retried complete-artifact write is + idempotent — STT (transcript) and role/fact (facts) use it so a redelivered + call never doubles rows and double-counts (Codex R7-2 P1).""" + + def test_replace_overwrites_not_appends(self, tmp_path) -> None: + ledger = FactLedger(root=tmp_path) + rows = [_row(matched_surface_form="AI")] + assert ledger.replace(rows, ticker="ORCL", call_id="C1") == 1 + # A second identical replace (a retry) must NOT double the partition. 
+ assert ledger.replace(rows, ticker="ORCL", call_id="C1") == 1 + assert len(ledger.read("ORCL", "C1")) == 1 + + def test_replace_shrinks_partition(self, tmp_path) -> None: + ledger = FactLedger(root=tmp_path) + ledger.replace([_row(), _row(), _row()], ticker="ORCL", call_id="C1") + assert len(ledger.read("ORCL", "C1")) == 3 + # A re-run producing FEWER rows must not leave the stale extras behind. + assert ledger.replace([_row()], ticker="ORCL", call_id="C1") == 1 + assert len(ledger.read("ORCL", "C1")) == 1 + + def test_replace_empty_removes_partition(self, tmp_path) -> None: + ledger = FactLedger(root=tmp_path) + ledger.replace([_row()], ticker="ORCL", call_id="C1") + assert len(ledger.read("ORCL", "C1")) == 1 + assert ledger.replace([], ticker="ORCL", call_id="C1") == 0 + assert ledger.read("ORCL", "C1") == [] + + def test_replace_still_fails_closed_on_bad_kalshi_row(self, tmp_path) -> None: + # The fail-closed Kalshi guard must run on replace, exactly like append. + ledger = FactLedger(root=tmp_path) + bad = _row(role_source="diarization_advisory", kalshi_counted=True) + with pytest.raises(KalshiCountRuleViolation): + ledger.replace([bad], ticker="ORCL", call_id="C1") + assert ledger.read("ORCL", "C1") == [] diff --git a/services/earnings/jobs/rolefact.py b/services/earnings/jobs/rolefact.py index 14134b3..b0be4d6 100644 --- a/services/earnings/jobs/rolefact.py +++ b/services/earnings/jobs/rolefact.py @@ -153,7 +153,12 @@ def main(argv: list[str] | None = None) -> int: _LOG.info("rolefact built %d fact rows for %s/%s", len(fact_rows), ticker, call_id) fact_ledger = FactLedger() - total = fact_ledger.append(fact_rows, ticker=ticker, call_id=call_id) + # Idempotent REPLACE (not append): rolefact rebuilds the COMPLETE fact set for + # the call from the whole transcript, so a retry / redelivery must OVERWRITE — + # appending would double every fact row (Codex R7-2 P1). 
An empty fact set + # (zero-mention call) removes any stale partition; the R2-upload guard below + # then correctly skips (no parquet to upload). + total = fact_ledger.replace(fact_rows, ticker=ticker, call_id=call_id) fact_path = fact_ledger.path(ticker, call_id) _LOG.info( "rolefact wrote fact ledger: ticker=%s call_id=%s rows_now=%d path=%s", diff --git a/services/earnings/jobs/stt.py b/services/earnings/jobs/stt.py index 792b7f3..327fb0b 100644 --- a/services/earnings/jobs/stt.py +++ b/services/earnings/jobs/stt.py @@ -330,7 +330,11 @@ def transcribe_call( from mostlyright.weather.earnings.ledger import TranscriptLedger ledger = TranscriptLedger() - total = ledger.append(rows, ticker=ticker, call_id=call_id) + # Idempotent REPLACE (not append): STT writes the COMPLETE transcript for the + # call in one shot, so a retry / Pub/Sub redelivery must OVERWRITE the partition + # — appending would double the segments and make role/fact double-count the + # mentions (Codex R7-2 P1). + total = ledger.replace(rows, ticker=ticker, call_id=call_id) _LOG.info( "stt wrote transcript ledger: ticker=%s call_id=%s rows_now=%d", ticker, call_id, total ) diff --git a/services/earnings/tests/test_jobs_entrypoints.py b/services/earnings/tests/test_jobs_entrypoints.py index 976fd44..a07fc9b 100644 --- a/services/earnings/tests/test_jobs_entrypoints.py +++ b/services/earnings/tests/test_jobs_entrypoints.py @@ -1149,3 +1149,63 @@ def _raise_download(bucket, key, local_path, *, r2_target=None): with pytest.raises(RuntimeError, match="no persisted transcript"): rolefact_job.main() + + +def test_stt_rerun_is_idempotent_not_doubled(monkeypatch: pytest.MonkeyPatch, tmp_path) -> None: + """Re-running STT for the SAME call overwrites (replaces) the transcript — a + redelivery/retry must not double the segments (Codex R7-2 P1).""" + cache = tmp_path / "cache" + audio = tmp_path / "audio.wav" + audio.write_bytes(b"RIFF-fake-not-real-audio") + + class _FakeTranscriber: + def __init__(self, *a, 
**k): + pass + + def transcribe(self, audio_path, *, initial_prompt=None): + return _FakeTranscriptResult() # two segments + + import mostlyright.weather.earnings.stt as sttmod + + monkeypatch.setattr(sttmod, "SttTranscriber", _FakeTranscriber) + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + monkeypatch.setenv("STT_AUDIO_PATH", str(audio)) + monkeypatch.setenv("STT_TICKER", "CHWY") + monkeypatch.setenv("STT_CALL_ID", "evt-1") + monkeypatch.setenv("STT_TIER", "small") + monkeypatch.setenv("STT_DEVICE", "cpu") + monkeypatch.delenv("EARNINGS_STREAMING_ENABLED", raising=False) + monkeypatch.delenv("R2_BUCKET", raising=False) + + assert stt_job.main() == 0 + assert stt_job.main() == 0 # retry / redelivery + + from mostlyright.weather.earnings.ledger import TranscriptLedger + + rows = TranscriptLedger().read("CHWY", "evt-1") + assert len(rows) == 2 # NOT 4 — the second run replaced, not appended + + +def test_rolefact_rerun_is_idempotent_not_doubled( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """Re-running role/fact for the SAME call replaces the fact partition — a + redelivery/retry must not double the fact rows (Codex R7-2 P1).""" + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + _seed_transcript(cache, "CHWY", "evt-1", ["we mentioned tariff twice", "tariff again here"]) + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-1") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') + monkeypatch.delenv("ROLEFACT_R2_BUCKET", raising=False) + monkeypatch.delenv("R2_BUCKET", raising=False) + + assert rolefact_job.main() == 0 + assert rolefact_job.main() == 0 # retry / redelivery + + from mostlyright.weather.earnings.ledger import FactLedger + + facts = FactLedger().read("CHWY", "evt-1") + # 'tariff' occurs once per segment -> 2 facts; a second run must not make it 4. 
+ assert len(facts) == 2 diff --git a/services/earnings/tests/test_stt_handoff_delete.py b/services/earnings/tests/test_stt_handoff_delete.py index 2a5b840..9825dbb 100644 --- a/services/earnings/tests/test_stt_handoff_delete.py +++ b/services/earnings/tests/test_stt_handoff_delete.py @@ -64,7 +64,9 @@ def transcribe(self, path, *, initial_prompt=None): import mostlyright.weather.earnings.ledger as engine_ledger class _FakeLedger: - def append(self, rows, *, ticker, call_id): + # STT writes the COMPLETE transcript via the idempotent ``replace`` (not + # ``append``) so a retry does not double rows (Codex R7-2 P1). + def replace(self, rows, *, ticker, call_id): events.append("ledger") if ledger_raises: raise RuntimeError("ledger write failed") From df919dfd8e3778751443d1470aa819564985987d Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Sat, 4 Jul 2026 00:26:30 +0200 Subject: [PATCH 13/17] =?UTF-8?q?fix(28):=20review=20round=207-3=20?= =?UTF-8?q?=E2=80=94=20GOES-footprint=20roster=20filter,=20yesterday-year,?= =?UTF-8?q?=20stale-R2-fact=20tombstone?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three findings from the round-3 Codex gate: 1. [P1] Roster backfill silently under-covered non-GOES stations. The full fleet (--roster kalshi,polymarket, no --satellites) sent every shard through GOES-16/18, whose footprint is only the Americas / E-Pacific — so ~half the 65-station roster (EDDM, RJTT, FACT, ...) fetched nothing and then marked the empty slice complete, leaving the advertised backfill unpopulated and wasting Spot. Roster mode now EXCLUDES (and loudly logs) stations outside the GOES footprint under the default satellites; an all-non-GOES shard cleanly no-ops. Global native-ring coverage (Himawari/Meteosat/VIIRS) stays the 28-26 follow-up, reached via --satellites (which bypasses the filter). batch.tf / the workflow docs updated to match. 2. 
[P2] --incremental yesterday keyed on TODAY's year, so on Jan 1 (UTC) it never refreshed the prior-year Dec 31 partition ("yesterday"). Now keys on yesterday's year. 3. [P2] A zero-fact role/fact rerun cleared the LOCAL partition (idempotent replace) but left the prior nonzero run's earnings/facts/{ticker}/{call_id}.parquet in R2 — serving reads R2 as the durable store, so it kept serving stale facts. The zero-row path now tombstones the R2 object (new _r2_sink.delete, idempotent). Tests: roster GOES-footprint filter (kept/dropped/all-non-GOES-noop), shard-routing tests isolated with explicit --satellites, yesterday-year across boundary, zero-fact R2 delete. --- .github/workflows/run-weather-backfill.yml | 4 +- infra/batch.tf | 6 ++ .../mostlyright/weather/satellite/__main__.py | 69 ++++++++++----- .../mostlyright/weather/satellite/_r2_sink.py | 21 ++++- .../tests/satellite/test_cli_roster.py | 85 +++++++++++++++++-- services/earnings/jobs/rolefact.py | 53 ++++++++++-- .../earnings/tests/test_jobs_entrypoints.py | 38 +++++++++ 7 files changed, 240 insertions(+), 36 deletions(-) diff --git a/.github/workflows/run-weather-backfill.yml b/.github/workflows/run-weather-backfill.yml index 61cc025..5f3cb45 100644 --- a/.github/workflows/run-weather-backfill.yml +++ b/.github/workflows/run-weather-backfill.yml @@ -37,7 +37,7 @@ on: default: "latest" type: string mode: - description: "pilot = 1 station (cheap, default). full = the 66-shard ~28 TB fleet (needs cost sign-off)." + description: "pilot = 1 station (cheap, default). full = the 65-shard ~28 TB fleet (needs cost sign-off; default GOES satellites cover Americas/E-Pacific stations — non-GOES shards no-op, native ring is 28-26)."
required: true default: "pilot" type: choice @@ -76,7 +76,7 @@ jobs: - name: Enforce the cost sign-off gate for a full run run: | if [ "${{ inputs.mode }}" = "full" ] && [ "${{ inputs.confirm_cost_signoff }}" != "true" ]; then - echo "::error::mode=full runs the 66-shard ~28 TB fleet (the phase's largest spend)." + echo "::error::mode=full runs the 65-shard ~28 TB fleet (the phase's largest spend)." echo "::error::Set confirm_cost_signoff=true (H5 pilot cost sign-off) to proceed, or use mode=pilot." exit 1 fi diff --git a/infra/batch.tf b/infra/batch.tf index 8b5c911..ca31602 100644 --- a/infra/batch.tf +++ b/infra/batch.tf @@ -85,6 +85,12 @@ resource "google_batch_job" "weather_backfill" { # StationInfo, so a shard for it would resolve to zero partitions). One shard # per satellite-resolvable station (D-28.8). run-weather-backfill.yml (the # actual submitter) uses the SAME 65; keep them in lockstep with the roster. + # NOTE: the container passes no --satellites, so each shard runs under the + # DEFAULT GOES-East/West satellites, which cover only the Americas / E-Pacific + # footprint. Shards for stations outside that footprint (EU/Asia/Africa) cleanly + # NO-OP with a logged exclusion (the CLI filters them rather than fetching empty + # + marking complete). Global native-ring coverage (Himawari/Meteosat/VIIRS) is + # the 28-26 follow-up — reached by adding --satellites to the container args. 
task_count = 65 parallelism = 16 # bounded concurrent Spot slices diff --git a/packages/weather/src/mostlyright/weather/satellite/__main__.py b/packages/weather/src/mostlyright/weather/satellite/__main__.py index bb2151a..b94c437 100644 --- a/packages/weather/src/mostlyright/weather/satellite/__main__.py +++ b/packages/weather/src/mostlyright/weather/satellite/__main__.py @@ -38,7 +38,7 @@ import argparse import os import sys -from datetime import UTC, datetime +from datetime import UTC, datetime, timedelta from pathlib import Path from ._backfill import bulk_backfill @@ -291,19 +291,32 @@ def _from(flag: int | None, env_name: str, default: int) -> int: _GOES_FOOTPRINT_ICAO_PREFIXES: tuple[str, ...] = ("K", "C", "M", "P", "S", "T") -def _warn_non_goes_stations(stations: list[str]) -> None: - """Warn (stderr) for shard stations the GOES-only default satellites can't see.""" - off_footprint = [ - s for s in stations if not s[:1].upper().startswith(_GOES_FOOTPRINT_ICAO_PREFIXES) - ] - if off_footprint: +def _filter_to_goes_footprint(stations: list[str]) -> list[str]: + """Drop (and loudly log) shard stations the GOES-only default satellites can't see. + + The default roster satellites are GOES-East/West, whose footprint is the + Americas / E-Pacific. A roster station outside that footprint (EDDM, RJTT, + FACT, ...) resolves to ZERO GOES coverage, so backfilling it under the default + satellites would fetch nothing and then mark the slice ``completed`` — a SILENT + empty "success" that leaves the advertised backfill unpopulated and wastes Spot + (Codex R7-3 P1). So EXCLUDE those stations from the default-satellite run + entirely (no empty slices, no misleading complete markers) and log WHY. Global + coverage is the native-ring path: pass ``--satellites`` (Himawari/Meteosat/VIIRS) + to back-fill them (the 28-26 native-ring roster backfill), which bypasses this + filter. Returns the GOES-coverable subset (possibly empty for a non-GOES shard). 
+ """ + kept = [s for s in stations if s[:1].upper().startswith(_GOES_FOOTPRINT_ICAO_PREFIXES)] + excluded = [s for s in stations if s not in kept] + if excluded: print( - "WARNING: roster stations outside the GOES footprint will produce NO " - f"coverage under the default GOES-only satellites: {', '.join(off_footprint)}. " - "Pass --satellites (Himawari/Meteosat/VIIRS) to cover them, or expect empty " - "partitions for these shards (28-26 native-ring backfill).", + "NOTE: excluding roster stations OUTSIDE the GOES footprint from this " + f"default-satellite run (no GOES coverage): {', '.join(excluded)}. " + "They are NOT back-filled here — pass --satellites " + "(Himawari/Meteosat/VIIRS) for native-ring global coverage (28-26). " + "Excluding them avoids empty slices being marked complete.", file=sys.stderr, ) + return kept def _run_backfill(args: argparse.Namespace) -> int: @@ -320,11 +333,14 @@ def _run_backfill(args: argparse.Namespace) -> int: # --incremental yesterday: year-granular resume window (28-22 deferred the # true day-granular incremental). Force resume so only new/missing partitions - # for the CURRENT UTC year are fetched. + # for yesterday's year are fetched. if args.incremental == "yesterday": - current_year = datetime.now(UTC).year - year_start = current_year - year_end = current_year + # Key on YESTERDAY's year, not today's: on Jan 1 (UTC) "yesterday" is + # Dec 31 of the PRIOR year, so using today's year would never refresh that + # prior-year December partition (Codex R7-3 P2). 
+ yesterday = datetime.now(UTC).date() - timedelta(days=1) + year_start = yesterday.year + year_end = yesterday.year resume = True if args.roster is not None: @@ -335,14 +351,23 @@ def _run_backfill(args: argparse.Namespace) -> int: stations = list(shard_roster(roster, index, count)) satellites = args.satellites or list(_DEFAULT_ROSTER_SATELLITES) products = args.products or list(_DEFAULT_ROSTER_PRODUCTS) - # Coverage guard (no SILENT under-coverage): the settlement-station roster spans - # the globe, but the GOES-only default satellites see only the Americas / - # E-Pacific. Warn LOUDLY (stderr, visible in Cloud Batch logs) for any - # shard station outside the GOES footprint so a `mode=full` operator is not - # blindsided by empty partitions + wasted Spot spend. Override --satellites - # (Himawari/Meteosat/VIIRS) to actually cover those stations. + # Coverage guard (no SILENT under-coverage): the settlement-station roster + # spans the globe, but the GOES-only default satellites see only the + # Americas / E-Pacific. EXCLUDE any shard station outside the GOES footprint + # from the default-satellite run (rather than fetching nothing and marking + # the empty slice complete — Codex R7-3 P1); the exclusion is logged loudly + # to the Cloud Batch logs. Global coverage = pass --satellites + # (Himawari/Meteosat/VIIRS), which bypasses this filter (28-26 native ring). if args.satellites is None: - _warn_non_goes_stations(stations) + stations = _filter_to_goes_footprint(stations) + if not stations: + print( + "no GOES-coverable stations in this shard under the default " + "satellites — nothing to back-fill (pass --satellites for " + "native-ring coverage). 
Exiting cleanly.", + file=sys.stderr, + ) + return 0 if year_start is None: year_start = _DEFAULT_ROSTER_YEAR_START if year_end is None: diff --git a/packages/weather/src/mostlyright/weather/satellite/_r2_sink.py b/packages/weather/src/mostlyright/weather/satellite/_r2_sink.py index 31ed55f..eb931d7 100644 --- a/packages/weather/src/mostlyright/weather/satellite/_r2_sink.py +++ b/packages/weather/src/mostlyright/weather/satellite/_r2_sink.py @@ -134,4 +134,23 @@ def download(bucket: str, key: str, local_path: Path | str, *, r2_target: str | return str(local_path) -__all__ = ["download", "upload"] +def delete(bucket: str, key: str, *, r2_target: str | None = None) -> None: + """Delete one R2 object (``s3.delete_object``); idempotent (a missing key is a no-op). + + Used by the hosted ingest chain to TOMBSTONE a durable partition that an + idempotent replace has emptied — e.g. a role/fact rerun that legitimately + produces zero facts must clear the previously-uploaded + ``earnings/facts/{ticker}/{call_id}.parquet`` so serving (which reads R2 as the + durable store) does not keep serving stale facts. ``delete_object`` is + idempotent on S3/R2 (deleting an absent key succeeds), so a first-time + zero-fact call is a safe no-op. Uses the same write-token client as + :func:`upload`. + + ``r2_target`` is accepted for signature symmetry; the effective bucket is the + explicit ``bucket`` argument. 
+ """ + client = _get_r2_client() + client.delete_object(Bucket=bucket, Key=key) + + +__all__ = ["delete", "download", "upload"] diff --git a/packages/weather/tests/satellite/test_cli_roster.py b/packages/weather/tests/satellite/test_cli_roster.py index fdd8a9b..7004451 100644 --- a/packages/weather/tests/satellite/test_cli_roster.py +++ b/packages/weather/tests/satellite/test_cli_roster.py @@ -95,8 +95,20 @@ def test_shard_from_batch_task_env(captured, monkeypatch): """Shard index/count come from BATCH_TASK_INDEX/COUNT when flags are absent.""" monkeypatch.setenv("BATCH_TASK_INDEX", "5") monkeypatch.setenv("BATCH_TASK_COUNT", "66") + # Explicit --satellites so this test isolates env-driven shard resolution from + # the default-satellite GOES-footprint filter (tested separately below). rc = cli.main( - ["backfill", "--mirror", "gcp", "--roster", "kalshi,polymarket", "--r2-bucket", "b"] + [ + "backfill", + "--mirror", + "gcp", + "--roster", + "kalshi,polymarket", + "--satellites", + "goes16", + "--r2-bucket", + "b", + ] ) assert rc == 0 kw = captured[0] @@ -113,6 +125,8 @@ def test_shard_index_flag_overrides_env(captured, monkeypatch): "backfill", "--roster", "kalshi,polymarket", + "--satellites", + "goes16", "--shard-index", "2", "--shard-count", @@ -128,17 +142,72 @@ def test_roster_no_shard_defaults_to_whole_roster(captured, monkeypatch): """No shard flags + no env -> whole roster (index=0, count=1), e.g. Cloud Run Job.""" monkeypatch.delenv("BATCH_TASK_INDEX", raising=False) monkeypatch.delenv("BATCH_TASK_COUNT", raising=False) - rc = cli.main(["backfill", "--roster", "kalshi,polymarket", "--r2-bucket", "b"]) + # Explicit --satellites so the whole roster is passed through unfiltered (the + # default-satellite GOES footprint filter is tested separately below). 
+ rc = cli.main( + ["backfill", "--roster", "kalshi,polymarket", "--satellites", "goes16", "--r2-bucket", "b"] + ) assert rc == 0 kw = captured[0] assert kw["stations"] == list(SETTLEMENT_STATION_ROSTER) +def test_roster_default_satellites_filter_non_goes_stations(captured, monkeypatch): + """Default (GOES-only) satellites EXCLUDE roster stations outside the GOES + footprint — rather than fetching them empty and marking the slice complete + (silent under-coverage, Codex R7-3 P1).""" + monkeypatch.delenv("BATCH_TASK_INDEX", raising=False) + monkeypatch.delenv("BATCH_TASK_COUNT", raising=False) + rc = cli.main(["backfill", "--roster", "kalshi,polymarket", "--r2-bucket", "b"]) + assert rc == 0 + passed = captured[0]["stations"] + assert passed # non-empty — the roster has many GOES-footprint stations + assert all(s[:1].upper().startswith(cli._GOES_FOOTPRINT_ICAO_PREFIXES) for s in passed) + # A GOES station survives; European/Asian stations are dropped. + assert "KNYC" in passed + assert "EDDM" not in passed and "RJTT" not in passed + # Exactly the GOES-footprint subset of the committed roster. + expected = [ + s + for s in SETTLEMENT_STATION_ROSTER + if s[:1].upper().startswith(cli._GOES_FOOTPRINT_ICAO_PREFIXES) + ] + assert passed == expected + + +def test_roster_all_non_goes_shard_noops(captured, monkeypatch): + """A shard whose stations are all outside the GOES footprint cleanly no-ops under + the default satellites — no bulk_backfill, no empty-complete markers.""" + monkeypatch.delenv("BATCH_TASK_INDEX", raising=False) + monkeypatch.delenv("BATCH_TASK_COUNT", raising=False) + # shard 5 of 66 resolves to EPWA (Warsaw) — outside the GOES footprint. 
+ non_goes = list(shard_roster(resolve_roster("kalshi,polymarket"), 5, 66)) + assert non_goes and all( + not s[:1].upper().startswith(cli._GOES_FOOTPRINT_ICAO_PREFIXES) for s in non_goes + ) + rc = cli.main( + [ + "backfill", + "--roster", + "kalshi,polymarket", + "--shard-index", + "5", + "--shard-count", + "66", + "--r2-bucket", + "b", + ] + ) + assert rc == 0 + assert captured == [] # filtered to empty -> nothing dispatched + + def test_incremental_yesterday_single_year_resume(captured, monkeypatch): """--incremental yesterday sets a single-year window and forces resume.""" - from datetime import UTC, datetime + from datetime import UTC, datetime, timedelta - year = datetime.now(UTC).year + # Yesterday's year (== today's except across the Jan-1 UTC boundary). + year = (datetime.now(UTC).date() - timedelta(days=1)).year monkeypatch.delenv("BATCH_TASK_INDEX", raising=False) monkeypatch.delenv("BATCH_TASK_COUNT", raising=False) rc = cli.main( @@ -194,6 +263,8 @@ def __init__(self, gcs_uri, *, fs=None): from mostlyright.weather.satellite import _progress monkeypatch.setattr(_progress, "GcsProgressStore", _FakeStore) + # Explicit --satellites so this test isolates progress-store wiring from the + # default-satellite GOES-footprint filter (shard 3 is a non-GOES station). 
rc = cli.main( [ "backfill", @@ -201,6 +272,8 @@ def __init__(self, gcs_uri, *, fs=None): "gcp", "--roster", "kalshi,polymarket", + "--satellites", + "goes16", "--progress-bucket", "marker-bkt", "--r2-bucket", @@ -256,9 +329,9 @@ def test_explicit_mode_missing_required_raises(captured): def test_incremental_explicit_mode(captured, tmp_path): """--incremental works in explicit mode too (year window override + resume).""" - from datetime import UTC, datetime + from datetime import UTC, datetime, timedelta - year = datetime.now(UTC).year + year = (datetime.now(UTC).date() - timedelta(days=1)).year rc = cli.main( [ "backfill", diff --git a/services/earnings/jobs/rolefact.py b/services/earnings/jobs/rolefact.py index b0be4d6..0f5d888 100644 --- a/services/earnings/jobs/rolefact.py +++ b/services/earnings/jobs/rolefact.py @@ -168,15 +168,18 @@ def main(argv: list[str] | None = None) -> int: fact_path, ) - # A legitimate zero-mention call builds no fact rows, so FactLedger.append - # writes NO parquet — uploading a non-existent path would crash an otherwise - # valid call. Only upload when a partition actually exists (guard on both the - # row count and the file, since the ledger may no-op an empty append). + # A legitimate zero-mention call builds no fact rows, so FactLedger.replace + # removes the LOCAL partition. Only upload when a partition actually exists. if fact_rows and fact_path.exists(): _maybe_upload_r2(str(fact_path), ticker=ticker, call_id=call_id) else: + # Zero fact rows: the idempotent replace cleared the local partition, so a + # PRIOR nonzero run's R2 object (which serving reads as the durable store) + # is now stale — DELETE it too, else /facts keeps serving facts the replace + # meant to clear (Codex R7-3 P2). 
+ _maybe_delete_r2_facts(ticker=ticker, call_id=call_id) _LOG.info( - "rolefact: no fact rows for %s/%s — skipping R2 upload (nothing to upload)", + "rolefact: no fact rows for %s/%s — cleared local partition + any stale R2 object", ticker, call_id, ) @@ -294,5 +297,45 @@ def _maybe_upload_r2(fact_path: str, *, ticker: str, call_id: str) -> None: ) +def _maybe_delete_r2_facts(*, ticker: str, call_id: str) -> None: + """Tombstone the R2 fact object when an idempotent rerun produced ZERO facts. + + A role/fact rerun that legitimately yields no facts (terms no longer match, a + corrected transcript) clears the LOCAL partition via ``FactLedger.replace`` — + but a PRIOR nonzero run may have uploaded + ``earnings/facts/{ticker}/{call_id}.parquet`` to R2, which serving reads as the + durable store. Delete that stale object so ``/facts`` stops serving facts the + replace meant to clear (Codex R7-3 P2). Opt-in on the same bucket as the upload; + ``delete_object`` is idempotent, so a first-time zero-fact call is a safe no-op. 
+ """ + bucket = optional_env("ROLEFACT_R2_BUCKET") or optional_env("R2_BUCKET") + if not bucket: + return + key = f"earnings/facts/{ticker}/{call_id}.parquet" + + from mostlyright.weather.satellite._r2_sink import delete + + try: + delete(bucket, key, r2_target=bucket) + except Exception: + _LOG.warning( + "rolefact: could not delete stale R2 fact object bucket=%s key=%s " + "(may already be absent) for %s/%s", + bucket, + key, + ticker, + call_id, + exc_info=True, + ) + return + _LOG.info( + "rolefact: deleted stale R2 fact object bucket=%s key=%s (zero-row replace) for %s/%s", + bucket, + key, + ticker, + call_id, + ) + + if __name__ == "__main__": raise SystemExit(main()) diff --git a/services/earnings/tests/test_jobs_entrypoints.py b/services/earnings/tests/test_jobs_entrypoints.py index a07fc9b..e50e8bf 100644 --- a/services/earnings/tests/test_jobs_entrypoints.py +++ b/services/earnings/tests/test_jobs_entrypoints.py @@ -1209,3 +1209,41 @@ def test_rolefact_rerun_is_idempotent_not_doubled( facts = FactLedger().read("CHWY", "evt-1") # 'tariff' occurs once per segment -> 2 facts; a second run must not make it 4. 
assert len(facts) == 2 + + +def test_rolefact_zero_facts_deletes_stale_r2_object( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """A zero-mention rerun clears the local partition AND deletes the stale R2 fact + object, so serving stops serving facts the replace cleared (Codex R7-3 P2).""" + cache = tmp_path / "cache" + monkeypatch.setenv("MOSTLYRIGHT_CACHE_DIR", str(cache)) + _seed_transcript(cache, "CHWY", "evt-3", ["nothing relevant is said here"]) + + deleted: dict[str, object] = {} + sink = importlib.import_module("mostlyright.weather.satellite._r2_sink") + + def _fake_delete(bucket, key, *, r2_target=None): + deleted["bucket"] = bucket + deleted["key"] = key + + def _boom_upload(*a, **k): # pragma: no cover - zero facts => must not upload + raise AssertionError("must not upload when there are zero facts") + + monkeypatch.setattr(sink, "delete", _fake_delete) + monkeypatch.setattr(sink, "upload", _boom_upload) + + monkeypatch.setenv("ROLEFACT_TICKER", "CHWY") + monkeypatch.setenv("ROLEFACT_CALL_ID", "evt-3") + monkeypatch.setenv("ROLEFACT_TERMS", '[{"term_canonical": "tariff"}]') # never mentioned + monkeypatch.delenv("ROLEFACT_R2_BUCKET", raising=False) + monkeypatch.setenv("R2_BUCKET", "mostlyright-derived") + + assert rolefact_job.main() == 0 + # The stale R2 fact object was tombstoned (not left behind). 
+ assert deleted["bucket"] == "mostlyright-derived" + assert deleted["key"] == "earnings/facts/CHWY/evt-3.parquet" + + from mostlyright.weather.earnings.ledger import FactLedger + + assert FactLedger().read("CHWY", "evt-3") == [] From a0c56f5704b36f69b0d05d9e3baa6588cca89eab Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Sat, 4 Jul 2026 00:40:54 +0200 Subject: [PATCH 14/17] =?UTF-8?q?fix(28):=20review=20round=207-4=20?= =?UTF-8?q?=E2=80=94=20serving=20reads=20earnings=20transcripts/facts=20fr?= =?UTF-8?q?om=20R2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two findings from the round-4 Codex gate, completing the hosted earnings data plane: 1. [P1] Serving read the container-local ledger, not R2. STT/role-fact publish the transcript + fact parquet to R2, but the earnings serving app built ServingState against its ephemeral local disk — so a fresh Cloud Run instance returned EMPTY /transcripts and /facts even after ingest succeeded. Adds services/earnings/r2_read.py (EarningsR2Reader + R2LedgerSource) mirroring the weather serving R2 read path: READ-ONLY token, list+get, settlement-safe NoSuchKey→[] vs real-error propagation, audio-free. ServingState.build now reads from R2 when the read token is present and no explicit ledger_root is given (local ledger for tests / on-device). The earnings_serving infra ALREADY injects the R2 read token + R2_BUCKET (no infra change); the serving image gains boto3. 2. [P2] A failed zero-fact R2 tombstone was swallowed. role/fact's zero-row branch caught+ignored delete() errors, so a real auth/network failure exited "success" while the stale fact object kept being served. Since delete_object is idempotent (missing key succeeds), any exception is a real failure — now propagated so the job fails and retries. 
Tests: EarningsR2Reader parse / miss-is-empty / real-error-propagates / list tickers+call_ids / unsafe-segment; ServingState R2-vs-local gating; /transcripts + /facts serve R2 rows end to end. --- deploy/earnings/serving.Dockerfile | 12 +- services/earnings/deps.py | 36 ++- services/earnings/jobs/rolefact.py | 19 +- services/earnings/r2_read.py | 235 ++++++++++++++++++ .../earnings/tests/test_serving_r2_read.py | 168 +++++++++++++ 5 files changed, 447 insertions(+), 23 deletions(-) create mode 100644 services/earnings/r2_read.py create mode 100644 services/earnings/tests/test_serving_r2_read.py diff --git a/deploy/earnings/serving.Dockerfile b/deploy/earnings/serving.Dockerfile index 23a4aaa..3dfc077 100644 --- a/deploy/earnings/serving.Dockerfile +++ b/deploy/earnings/serving.Dockerfile @@ -38,15 +38,19 @@ COPY packages/weather/ packages/weather/ # Install the two published distributions with the [parquet] extra ONLY (pandas # for the ledger read path) — deliberately NOT [earnings] (that pulls # faster-whisper + av, the audio toolchain the firewall forbids on serving) — plus -# the serving runtime (FastAPI + uvicorn) and google-cloud-pubsub (the SSE -# streaming subscriber; only started when EARNINGS_STREAMING_SUBSCRIPTION is set, -# lazy-imported so the ledger-only default deploy needs no Pub/Sub call). +# the serving runtime (FastAPI + uvicorn), google-cloud-pubsub (the SSE streaming +# subscriber; only started when EARNINGS_STREAMING_SUBSCRIPTION is set), and boto3 +# for the READ-ONLY R2 source (services/earnings/r2_read.py — a fresh serving +# instance reads the durable transcript/fact parquet the ingest jobs wrote to R2, +# not its empty local disk). All three are lazy-imported, so a local ledger-only +# tier needs none of them at import. 
RUN pip install \ ./packages/core \ "./packages/weather[parquet]" \ "fastapi>=0.115,<1" \ "uvicorn[standard]>=0.30" \ - "google-cloud-pubsub>=2.20,<3" + "google-cloud-pubsub>=2.20,<3" \ + "boto3>=1.34,<2.0" # --- App layer --------------------------------------------------------------- # The non-published serving app is imported as `services.earnings.*` (matching the diff --git a/services/earnings/deps.py b/services/earnings/deps.py index 09839eb..852be0e 100644 --- a/services/earnings/deps.py +++ b/services/earnings/deps.py @@ -16,6 +16,8 @@ from mostlyright.weather.earnings.ledger import FactLedger, TranscriptLedger from mostlyright.weather.earnings.segment_bus import SegmentBus +from .r2_read import EarningsR2Reader, R2LedgerSource, r2_read_configured + #: STT tier this deployment runs (RESEARCH-MARKETS §3.4 — large-v3 hosted #: source-of-truth). Reported by /capabilities. DEFAULT_STT_TIER = "large-v3" @@ -226,10 +228,19 @@ def release_and_maybe_evict(self, call_id: str) -> None: @dataclass(slots=True) class ServingState: - """The serving app's read-side state.""" + """The serving app's read-side state. + + ``transcripts`` / ``facts`` are the READ source. In the DEPLOYED hosted topology + they are R2-backed (:class:`R2LedgerSource`) — serving runs in a SEPARATE Cloud + Run container from the ingest jobs, so it reads the durable text/fact parquet + from R2, not a container-local ledger that would be empty (Codex R7-4 P1). In + the local / on-device / test tier they are the engine's local parquet ledgers. + Both expose the same read subset the routes use (``read`` / ``read_ticker`` / + ``list_call_ids`` / ``list_tickers``). 
+ """ - transcripts: TranscriptLedger - facts: FactLedger + transcripts: TranscriptLedger | R2LedgerSource + facts: FactLedger | R2LedgerSource stt_tier: str = DEFAULT_STT_TIER buses: BusRegistry = field(default_factory=BusRegistry) @@ -237,10 +248,23 @@ class ServingState: def build( cls, ledger_root: Path | str | None = None, *, stt_tier: str | None = None ) -> ServingState: - root = Path(ledger_root) if ledger_root is not None else None + # Deployed serving reads the durable R2 corpus the ingest jobs wrote (its + # container has ONLY the R2 READ token). An EXPLICIT ledger_root (tests / + # on-device) always wins → the local ledger; otherwise, when the read + # token is present, read from R2. + transcripts: TranscriptLedger | R2LedgerSource + facts: FactLedger | R2LedgerSource + if ledger_root is None and r2_read_configured(): + reader = EarningsR2Reader() + transcripts = R2LedgerSource(reader, "transcripts") + facts = R2LedgerSource(reader, "facts") + else: + root = Path(ledger_root) if ledger_root is not None else None + transcripts = TranscriptLedger(root=root) + facts = FactLedger(root=root) return cls( - transcripts=TranscriptLedger(root=root), - facts=FactLedger(root=root), + transcripts=transcripts, + facts=facts, stt_tier=stt_tier or DEFAULT_STT_TIER, buses=BusRegistry(), ) diff --git a/services/earnings/jobs/rolefact.py b/services/earnings/jobs/rolefact.py index 0f5d888..4cc71c5 100644 --- a/services/earnings/jobs/rolefact.py +++ b/services/earnings/jobs/rolefact.py @@ -315,19 +315,12 @@ def _maybe_delete_r2_facts(*, ticker: str, call_id: str) -> None: from mostlyright.weather.satellite._r2_sink import delete - try: - delete(bucket, key, r2_target=bucket) - except Exception: - _LOG.warning( - "rolefact: could not delete stale R2 fact object bucket=%s key=%s " - "(may already be absent) for %s/%s", - bucket, - key, - ticker, - call_id, - exc_info=True, - ) - return + # Do NOT swallow: R2/S3 delete_object is idempotent (a MISSING key succeeds), so + # a 
first-time zero-fact call is a safe no-op and raises nothing. An exception + # here is therefore a REAL auth/network/service failure — let it propagate so + # the job fails and is RETRIED, rather than exiting "success" while the stale + # fact object keeps being served from the durable store (Codex R7-4 P2). + delete(bucket, key, r2_target=bucket) _LOG.info( "rolefact: deleted stale R2 fact object bucket=%s key=%s (zero-row replace) for %s/%s", bucket, diff --git a/services/earnings/r2_read.py b/services/earnings/r2_read.py new file mode 100644 index 0000000..969fd06 --- /dev/null +++ b/services/earnings/r2_read.py @@ -0,0 +1,235 @@ +"""Read-only Cloudflare R2 access for the earnings serving app (Phase 28, 28-13). + +The earnings ingest jobs run in SEPARATE Cloud Run resources from serving and +publish their durable text/fact parquet to R2 (bucket ``mostlyright-derived``): + + earnings/transcripts/{ticker}/{call_id}.parquet (STT, jobs/stt.py) + earnings/facts/{ticker}/{call_id}.parquet (role/fact, jobs/rolefact.py) + +A fresh serving Cloud Run instance has an EMPTY ephemeral disk, so it must read +those objects from R2 — the durable data plane — rather than a container-local +ledger (which would return empty even after ingest succeeded; Codex R7-4 P1). +This module is the READ side of the R2 firewall for earnings: it signs with the +READ-ONLY token (list+get) and NEVER holds the write token — mirroring +``services/weather/r2_read.py`` (the satellite serving read path). The deployed +``earnings-serving`` container already carries the read-token env +(``infra/cloud_run.tf``: ``R2_ACCESS_KEY_ID`` / ``R2_SECRET_ACCESS_KEY`` / +``R2_ACCOUNT_ID`` / ``R2_BUCKET`` from the ``r2-read-*`` secrets). + +**Audio firewall (D-27.9).** ONLY the text transcript + derived-fact parquet is +ever read here — audio never got an R2 key, so there is nothing audio-shaped to +read. The rows are the canonical ``schema.earnings_transcript.v1`` / +``schema.earnings_fact.v1`` shapes. 
+ +**Read-only by construction.** The client exposes only ``get_object`` / +``list_objects`` — no put/delete. boto3 is lazy-imported (kept off module load so +a local/on-device serving tier that never touches R2 needs no boto3). +""" + +from __future__ import annotations + +import io +import os +import re +from typing import Any + +#: Object-store key prefixes the ingest jobs write under (kept in lockstep with +#: ``jobs/stt.py::_TRANSCRIPT_R2_KEY_FMT`` and ``jobs/rolefact.py``). +_EARNINGS_PREFIX = "earnings/" + +#: Environment-variable NAMES the READ-ONLY token credentials are read from — the +#: GENERIC ``R2_*`` names the deploy layer injects into the serving SA env (the +#: serving SA's ONLY R2 token is the read token; the ingest WRITE path uses the +#: disjoint ``R2_WRITE_*`` names). Mirrors ``services/weather/r2_read.py``. +_ENV_ACCOUNT_ID = "R2_ACCOUNT_ID" +_ENV_ACCESS_KEY_ID = "R2_ACCESS_KEY_ID" +_ENV_SECRET_ACCESS_KEY = "R2_SECRET_ACCESS_KEY" + +_ENV_BUCKET = "R2_BUCKET" +_DEFAULT_BUCKET = "mostlyright-derived" + +#: R2's fixed S3-compat pseudo-region (Cloudflare requires ``"auto"``). +_R2_REGION = "auto" + +#: A ticker / call_id path segment must be a safe single key component (no ``/`` +#: or ``..`` — the ingest write side only ever wrote safe segments, so an unsafe +#: query value cannot match any object and is rejected as "no such call"). +_SAFE_SEGMENT = re.compile(r"^[A-Za-z0-9._-]+$") + + +def r2_read_configured() -> bool: + """True when the READ-ONLY R2 token is present (the deployed serving path). + + The deployed ``earnings-serving`` container carries the read-token env; a + local / on-device / test serving tier does not, and reads the local ledger + instead. Gate on the access-key id (the token that unambiguously means + "serve from the durable R2 corpus"). 
+ """ + return bool(os.environ.get(_ENV_ACCESS_KEY_ID)) + + +def _require_env(name: str) -> str: + value = os.environ.get(name) + if not value: + raise ValueError( + f"the earnings R2 read client needs {name} set (the READ-ONLY-token " + "credential is injected into the serving SA env from Secret Manager). " + "It is unset or empty." + ) + return value + + +def _derived_bucket() -> str: + return os.environ.get(_ENV_BUCKET) or _DEFAULT_BUCKET + + +def _validate_segment(value: str, *, field: str) -> str: + if not _SAFE_SEGMENT.match(value): + raise ValueError(f"unsafe {field} segment {value!r} for an R2 key") + return value + + +class EarningsR2Reader: + """Read-only R2 accessor: list + fetch the earnings transcript/fact partitions. + + Constructed lazily against the injected READ-token env. No write surface — the + write side (``satellite/_r2_sink.py``) is a separate module bound to the + disjoint write token. + """ + + def __init__(self, bucket: str | None = None) -> None: + self._bucket = bucket or _derived_bucket() + self._client: Any | None = None + + @property + def bucket(self) -> str: + return self._bucket + + def _get_client(self) -> Any: + if self._client is not None: + return self._client + import boto3 + import botocore.config + + account_id = _require_env(_ENV_ACCOUNT_ID) + access_key_id = _require_env(_ENV_ACCESS_KEY_ID) + secret_access_key = _require_env(_ENV_SECRET_ACCESS_KEY) + + self._client = boto3.client( + "s3", + endpoint_url=f"https://{account_id}.r2.cloudflarestorage.com", + aws_access_key_id=access_key_id, + aws_secret_access_key=secret_access_key, + region_name=_R2_REGION, + config=botocore.config.Config(retries={"max_attempts": 5, "mode": "adaptive"}), + ) + return self._client + + def _list_child_dirs(self, prefix: str) -> list[str]: + """Return the immediate child "directory" names under ``prefix`` (deduped).""" + client = self._get_client() + names: set[str] = set() + token: str | None = None + while True: + kwargs: dict[str, Any] = 
{"Bucket": self._bucket, "Prefix": prefix, "Delimiter": "/"} + if token is not None: + kwargs["ContinuationToken"] = token + resp = client.list_objects_v2(**kwargs) + for cp in resp.get("CommonPrefixes", []) or []: + # e.g. "earnings/facts/GIS/" -> "GIS" + names.add(cp["Prefix"][len(prefix) :].rstrip("/")) + if not resp.get("IsTruncated"): + break + token = resp.get("NextContinuationToken") + return sorted(n for n in names if n) + + def _list_object_stems(self, prefix: str) -> list[str]: + """Return the ``*.parquet`` object stems directly under ``prefix`` (sorted).""" + client = self._get_client() + stems: list[str] = [] + token: str | None = None + while True: + kwargs: dict[str, Any] = {"Bucket": self._bucket, "Prefix": prefix} + if token is not None: + kwargs["ContinuationToken"] = token + resp = client.list_objects_v2(**kwargs) + for obj in resp.get("Contents", []) or []: + key = obj["Key"] + tail = key[len(prefix) :] + if "/" in tail or not tail.endswith(".parquet"): + continue + stems.append(tail[: -len(".parquet")]) + if not resp.get("IsTruncated"): + break + token = resp.get("NextContinuationToken") + return sorted(stems) + + def list_tickers(self, subdir: str) -> list[str]: + """List every ticker with at least one persisted partition under ``subdir``.""" + return self._list_child_dirs(f"{_EARNINGS_PREFIX}{subdir}/") + + def list_call_ids(self, subdir: str, ticker: str) -> list[str]: + """List every persisted ``call_id`` for ``ticker`` under ``subdir``.""" + safe = _validate_segment(ticker, field="ticker") + return self._list_object_stems(f"{_EARNINGS_PREFIX}{subdir}/{safe}/") + + def read_partition(self, subdir: str, ticker: str, call_id: str) -> list[dict[str, object]]: + """Fetch + parse one ``(ticker, call_id)`` partition; ``[]`` on a genuine miss. 
+ + Settlement safety (mirrors ``weather/r2_read``): a MISSING object + (``NoSuchKey`` — nothing ingested for this call yet) is a legitimate empty + (``[]``); a missing/typoed BUCKET or any other error is a CONFIG/read + failure that must propagate (never silently look like "no data"). + """ + from botocore.exceptions import ClientError + + safe_ticker = _validate_segment(ticker, field="ticker") + safe_call = _validate_segment(call_id, field="call_id") + key = f"{_EARNINGS_PREFIX}{subdir}/{safe_ticker}/{safe_call}.parquet" + client = self._get_client() + try: + resp = client.get_object(Bucket=self._bucket, Key=key) + except ClientError as exc: + response = getattr(exc, "response", None) or {} + code = str(response.get("Error", {}).get("Code", "")) + if code == "NoSuchKey": + return [] + raise + body = resp["Body"].read() + + import pyarrow.parquet as pq + + return pq.read_table(io.BytesIO(body)).to_pylist() + + +class R2LedgerSource: + """Duck-typed read facade over :class:`EarningsR2Reader` for one ledger subdir. + + Implements the read subset the serving routes call on ``ServingState.transcripts`` + / ``.facts`` (``read`` / ``read_ticker`` / ``list_call_ids`` / ``list_tickers``), + so ``ServingState`` can hold an R2-backed source in place of the local ledger + with NO route change. It is READ-ONLY (no ``append``/``replace``) — serving + never writes. 
+ """ + + def __init__(self, reader: EarningsR2Reader, subdir: str) -> None: + self._reader = reader + self._subdir = subdir + + def read(self, ticker: str, call_id: str) -> list[dict[str, object]]: + return self._reader.read_partition(self._subdir, ticker, call_id) + + def list_call_ids(self, ticker: str) -> list[str]: + return self._reader.list_call_ids(self._subdir, ticker) + + def list_tickers(self) -> list[str]: + return self._reader.list_tickers(self._subdir) + + def read_ticker(self, ticker: str) -> list[dict[str, object]]: + out: list[dict[str, object]] = [] + for call_id in self.list_call_ids(ticker): + out.extend(self.read(ticker, call_id)) + return out + + +__all__ = ["EarningsR2Reader", "R2LedgerSource", "r2_read_configured"] diff --git a/services/earnings/tests/test_serving_r2_read.py b/services/earnings/tests/test_serving_r2_read.py new file mode 100644 index 0000000..c66faa1 --- /dev/null +++ b/services/earnings/tests/test_serving_r2_read.py @@ -0,0 +1,168 @@ +"""Serving reads the durable R2 corpus the ingest jobs wrote (Codex R7-4 P1). + +STT / role-fact publish transcript + fact parquet to R2; the serving app runs in a +SEPARATE Cloud Run container with an empty local disk, so it must read those from +R2 (else /transcripts and /facts return empty after ingest succeeded). These tests +prove: the ``EarningsR2Reader`` parses R2 objects + distinguishes a genuine miss +from a real error; ``ServingState`` picks the R2 source only when the read token is +present and no explicit ledger_root is given; and the routes serve R2 rows. 
+""" + +from __future__ import annotations + +import io + +import pyarrow as pa +import pyarrow.parquet as pq +import pytest +from fastapi.testclient import TestClient + +from services.earnings.app import create_app +from services.earnings.deps import ServingState +from services.earnings.r2_read import EarningsR2Reader, R2LedgerSource + + +def _parquet_bytes(rows: list[dict]) -> bytes: + buf = io.BytesIO() + pq.write_table(pa.Table.from_pylist(rows), buf) + return buf.getvalue() + + +class _FakeBody: + def __init__(self, data: bytes) -> None: + self._data = data + + def read(self) -> bytes: + return self._data + + +class _FakeS3: + """A minimal in-memory S3/R2 stand-in: {key: parquet-bytes}.""" + + def __init__(self, objects: dict[str, bytes]) -> None: + self._objects = objects + + def get_object(self, *, Bucket, Key): + if Key not in self._objects: + from botocore.exceptions import ClientError + + raise ClientError({"Error": {"Code": "NoSuchKey"}}, "GetObject") + return {"Body": _FakeBody(self._objects[Key])} + + def list_objects_v2(self, *, Bucket, Prefix, Delimiter=None, ContinuationToken=None): + keys = [k for k in self._objects if k.startswith(Prefix)] + if Delimiter: + common = set() + for k in keys: + tail = k[len(Prefix) :] + if Delimiter in tail: + common.add(Prefix + tail.split(Delimiter, 1)[0] + Delimiter) + return {"CommonPrefixes": [{"Prefix": p} for p in sorted(common)], "IsTruncated": False} + return {"Contents": [{"Key": k} for k in sorted(keys)], "IsTruncated": False} + + +def _reader_over(objects: dict[str, bytes]) -> EarningsR2Reader: + reader = EarningsR2Reader(bucket="mostlyright-derived") + reader._client = _FakeS3(objects) # bypass boto3 (no creds needed) + return reader + + +def test_reader_read_partition_parses_rows() -> None: + rows = [{"ticker": "GIS", "call_id": "GIS-Q3", "segment_index": 0, "text": "hello"}] + reader = _reader_over({"earnings/transcripts/GIS/GIS-Q3.parquet": _parquet_bytes(rows)}) + got = 
reader.read_partition("transcripts", "GIS", "GIS-Q3") + assert got == rows + + +def test_reader_missing_object_is_empty_not_error() -> None: + reader = _reader_over({}) + # NoSuchKey -> [] (nothing ingested yet), never a raise. + assert reader.read_partition("facts", "GIS", "NOPE") == [] + + +def test_reader_real_error_propagates() -> None: + class _BoomS3: + def get_object(self, *, Bucket, Key): + from botocore.exceptions import ClientError + + raise ClientError({"Error": {"Code": "AccessDenied"}}, "GetObject") + + reader = EarningsR2Reader() + reader._client = _BoomS3() + with pytest.raises(Exception): # noqa: B017 - a non-NoSuchKey error must NOT be silently empty + reader.read_partition("facts", "GIS", "GIS-Q3") + + +def test_reader_lists_tickers_and_call_ids() -> None: + objects = { + "earnings/facts/GIS/GIS-Q3.parquet": _parquet_bytes([{"ticker": "GIS"}]), + "earnings/facts/GIS/GIS-Q2.parquet": _parquet_bytes([{"ticker": "GIS"}]), + "earnings/facts/ORCL/ORCL-Q1.parquet": _parquet_bytes([{"ticker": "ORCL"}]), + } + reader = _reader_over(objects) + assert reader.list_tickers("facts") == ["GIS", "ORCL"] + assert reader.list_call_ids("facts", "GIS") == ["GIS-Q2", "GIS-Q3"] + + +def test_reader_rejects_unsafe_segment() -> None: + reader = _reader_over({}) + with pytest.raises(ValueError, match="unsafe"): + reader.read_partition("facts", "../secret", "x") + + +def test_serving_state_uses_r2_when_read_token_present(monkeypatch) -> None: + monkeypatch.setenv("R2_ACCESS_KEY_ID", "read-key") + monkeypatch.setenv("R2_SECRET_ACCESS_KEY", "read-secret") + monkeypatch.setenv("R2_ACCOUNT_ID", "acct") + state = ServingState.build(ledger_root=None) + assert isinstance(state.transcripts, R2LedgerSource) + assert isinstance(state.facts, R2LedgerSource) + + +def test_serving_state_local_when_ledger_root_given(monkeypatch, tmp_path) -> None: + # An explicit ledger_root ALWAYS wins (tests / on-device), even with R2 env set. 
+ monkeypatch.setenv("R2_ACCESS_KEY_ID", "read-key") + state = ServingState.build(ledger_root=tmp_path) + assert not isinstance(state.transcripts, R2LedgerSource) + assert not isinstance(state.facts, R2LedgerSource) + + +def test_serving_state_local_when_no_r2_token(monkeypatch) -> None: + monkeypatch.delenv("R2_ACCESS_KEY_ID", raising=False) + state = ServingState.build(ledger_root=None) + assert not isinstance(state.transcripts, R2LedgerSource) + + +def test_routes_serve_r2_data(monkeypatch) -> None: + """/transcripts + /facts return the rows read from R2 (deployed serving path).""" + transcript_rows = [ + { + "ticker": "GIS", + "call_id": "GIS-Q3", + "segment_index": 0, + "text": "hi", + "delivery": "hosted", + } + ] + fact_rows = [{"ticker": "GIS", "call_id": "GIS-Q3", "term_canonical": "AI", "mention_count": 1}] + objects = { + "earnings/transcripts/GIS/GIS-Q3.parquet": _parquet_bytes(transcript_rows), + "earnings/facts/GIS/GIS-Q3.parquet": _parquet_bytes(fact_rows), + } + + import services.earnings.deps as deps + + # Force the R2 branch (no ledger_root) with a fake reader over the objects. 
+ monkeypatch.setenv("R2_ACCESS_KEY_ID", "read-key") + monkeypatch.setattr(deps, "EarningsR2Reader", lambda *a, **k: _reader_over(objects)) + + app = create_app(api_key=None) # keyless in-process; no ledger_root -> R2 path + client = TestClient(app) + + r = client.get("/transcripts", params={"ticker": "GIS", "call_id": "GIS-Q3"}) + assert r.status_code == 200 + assert r.json() == transcript_rows + + r = client.get("/facts", params={"ticker": "GIS"}) + assert r.status_code == 200 + assert r.json() == fact_rows From a80ff5b2badd15db86d99a786bbddd512df6a2b7 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Sat, 4 Jul 2026 00:47:19 +0200 Subject: [PATCH 15/17] =?UTF-8?q?fix(28):=20review=20round=207-5=20?= =?UTF-8?q?=E2=80=94=20fail=20loud=20on=20subscription=20capture=20with=20?= =?UTF-8?q?no=20handoff=20bucket?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [P1] The capture job pulled a real Pub/Sub message from CAPTURE_JOBS_SUBSCRIPTION but, when AUDIO_HANDOFF_BUCKET was unset (deploy misconfig), took the bare-local branch: printed the ephemeral path and ACKED the message. The job then exited and deleted the only audio copy without ever uploading it or triggering STT, and Pub/Sub — already acked — never redelivered → silent settlement-audio loss. Now fail loud EARLY (before the expensive capture), leaving the message un-acked so it redelivers once the env is fixed. The no-handoff-bucket path stays legitimate ONLY for the bare-local operator override (_NoopHandle: no Pub/Sub message, ack is a no-op). Test: subscription pull + AUDIO_HANDOFF_BUCKET unset raises + does not ack. 
--- services/earnings/jobs/capture.py | 17 ++++++++++++++ .../earnings/tests/test_jobs_entrypoints.py | 22 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/services/earnings/jobs/capture.py b/services/earnings/jobs/capture.py index b4be387..91c828f 100644 --- a/services/earnings/jobs/capture.py +++ b/services/earnings/jobs/capture.py @@ -387,6 +387,23 @@ def main(argv: list[str] | None = None) -> int: webcast_url = spec["webcast_url"] handoff_bucket = optional_env("AUDIO_HANDOFF_BUCKET") + + # A SUBSCRIPTION-pulled capture (real _MessageHandle) MUST have a handoff bucket + # to hand the audio to STT. Without it the run would capture, take the local + # branch, print the ephemeral path, and ACK the message — deleting the only + # audio copy on job exit while Pub/Sub never redelivers → SILENT settlement-audio + # loss (Codex R7-5 P1). Fail loud EARLY (before the expensive capture), NOT + # acked, so the message redelivers once AUDIO_HANDOFF_BUCKET is fixed. The + # no-bucket path is legitimate ONLY for the bare-local operator override + # (_NoopHandle: no Pub/Sub message, ack is a no-op). + if not handoff_bucket and not isinstance(handle, _NoopHandle): + raise RuntimeError( + "AUDIO_HANDOFF_BUCKET is unset but this capture was pulled from " + "CAPTURE_JOBS_SUBSCRIPTION — the deployed path MUST hand the transient " + "audio to STT via the handoff bucket. Refusing to run + ack (which would " + "ORPHAN the captured audio with no redelivery). Set AUDIO_HANDOFF_BUCKET." 
+ ) + out_dir = optional_env("CAPTURE_OUT_DIR") or tempfile.mkdtemp(prefix="earnings-capture-") os.makedirs(out_dir, exist_ok=True) diff --git a/services/earnings/tests/test_jobs_entrypoints.py b/services/earnings/tests/test_jobs_entrypoints.py index e50e8bf..d2540df 100644 --- a/services/earnings/tests/test_jobs_entrypoints.py +++ b/services/earnings/tests/test_jobs_entrypoints.py @@ -1247,3 +1247,25 @@ def _boom_upload(*a, **k): # pragma: no cover - zero facts => must not upload from mostlyright.weather.earnings.ledger import FactLedger assert FactLedger().read("CHWY", "evt-3") == [] + + +def test_capture_subscription_without_handoff_bucket_fails_loud( + monkeypatch: pytest.MonkeyPatch, tmp_path +) -> None: + """A SUBSCRIPTION-pulled capture with AUDIO_HANDOFF_BUCKET unset fails loud and + does NOT ack — else the audio is orphaned with no redelivery (Codex R7-5 P1).""" + + def _unused_post(url, *, json, headers, timeout): # pragma: no cover - never reached + raise AssertionError("must not reach the STT trigger") + + out_dir = tmp_path / "cap" + out_dir.mkdir() + record = _install_deployed_capture_fakes(monkeypatch, out_dir, stt_post=_unused_post) + # Deploy misconfiguration: a real subscription pull but no handoff bucket. + monkeypatch.delenv("AUDIO_HANDOFF_BUCKET", raising=False) + monkeypatch.setenv("STT_SERVICE_URL", "https://stt.run.app") + + with pytest.raises(RuntimeError, match="AUDIO_HANDOFF_BUCKET is unset"): + capture_job.main() + # The message was NOT acked (Pub/Sub redelivers once the env is fixed). 
+ assert record["acked"] is False From 1645d0dc66decd7d689b6c5f495233a7bcb2f33e Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Sat, 4 Jul 2026 00:54:47 +0200 Subject: [PATCH 16/17] =?UTF-8?q?fix(28):=20review=20round=207-6=20?= =?UTF-8?q?=E2=80=94=20STT=20service=20derives=20live-streaming=20config?= =?UTF-8?q?=20from=20env?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [P2] The deployed capture->STT trigger posts only {audio_path, ticker, call_id}, so the STT Cloud Run SERVICE (/transcribe) kept publish_live=False and never called _maybe_publish_live — yet the serving app ALWAYS starts the earnings-streaming subscriber (EARNINGS_STREAMING_SUBSCRIPTION is set unconditionally). So /stream had nothing to fan out for hosted calls unless every request manually supplied the streaming fields. stt_server now derives publish_live / streaming_project / streaming_topic from the service env (mirroring the one-shot jobs/stt.py main()), and the STT service infra sets EARNINGS_STREAMING_ENABLED=1 / _PROJECT / _TOPIC (the STT SA already holds pubsub.publisher on the topic). An explicit request field still overrides the env. Tests: env-derived publish (on), no-env (off), request-override. --- infra/cloud_run.tf | 20 +++++++ services/earnings/jobs/stt_server.py | 23 ++++++- services/earnings/tests/test_stt_server.py | 70 ++++++++++++++++++++++ 3 files changed, 110 insertions(+), 3 deletions(-) diff --git a/infra/cloud_run.tf b/infra/cloud_run.tf index d152af2..8c1cf04 100644 --- a/infra/cloud_run.tf +++ b/infra/cloud_run.tf @@ -359,6 +359,26 @@ resource "google_cloud_run_v2_service" "stt" { } } } + + # Live SSE publish: the serving app ALWAYS starts the earnings-streaming + # subscriber (EARNINGS_STREAMING_SUBSCRIPTION is set unconditionally on + # earnings_serving), so STT must PUBLISH its transcript segments to that + # topic or /stream has nothing to fan out for hosted calls (Codex R7-6 P2). 
+ # The capture->STT trigger posts no streaming fields, so the service derives + # them from these env vars. The STT SA holds pubsub.publisher on the topic + # (deploy_iam.tf). Set ENABLED=0 to turn hosted live-publish off. + env { + name = "EARNINGS_STREAMING_ENABLED" + value = "1" + } + env { + name = "EARNINGS_STREAMING_PROJECT" + value = google_project.ingest.project_id + } + env { + name = "EARNINGS_STREAMING_TOPIC" + value = google_pubsub_topic.earnings_streaming.name + } } } diff --git a/services/earnings/jobs/stt_server.py b/services/earnings/jobs/stt_server.py index a552979..54c05e9 100644 --- a/services/earnings/jobs/stt_server.py +++ b/services/earnings/jobs/stt_server.py @@ -80,6 +80,23 @@ def transcribe(payload: Annotated[dict, Body(...)]) -> dict[str, object]: # containers; a request may override it explicitly. r2_bucket = payload.get("r2_bucket") or os.environ.get("R2_BUCKET") + # Live-publish config DEFAULTS from the SERVICE env (mirrors the one-shot + # jobs/stt.py main()), because the deployed capture->STT trigger posts only + # {audio_path, ticker, call_id}. Without this, the STT SERVICE never publishes + # to earnings-streaming, so the serving /stream subscriber has nothing to fan + # out for hosted calls (Codex R7-6 P2). A request may still override any field. 
+ env_enabled = os.environ.get("EARNINGS_STREAMING_ENABLED") + env_publish = bool(env_enabled) and env_enabled.strip().lower() not in ("0", "false", "no") + publish_live = bool(payload.get("publish_live", env_publish)) + streaming_project = payload.get("streaming_project") or os.environ.get( + "EARNINGS_STREAMING_PROJECT" + ) + streaming_topic = str( + payload.get("streaming_topic") + or os.environ.get("EARNINGS_STREAMING_TOPIC") + or "earnings-streaming" + ) + try: return transcribe_call( str(audio_path), @@ -89,9 +106,9 @@ def transcribe(payload: Annotated[dict, Body(...)]) -> dict[str, object]: device=str(payload.get("device") or "cuda"), compute_type=str(payload.get("compute_type") or "float16"), initial_prompt=payload.get("initial_prompt"), - publish_live=bool(payload.get("publish_live", False)), - streaming_project=payload.get("streaming_project"), - streaming_topic=str(payload.get("streaming_topic") or "earnings-streaming"), + publish_live=publish_live, + streaming_project=streaming_project, + streaming_topic=streaming_topic, handoff_bucket=handoff_bucket, r2_bucket=r2_bucket, ) diff --git a/services/earnings/tests/test_stt_server.py b/services/earnings/tests/test_stt_server.py index 75883d9..12854df 100644 --- a/services/earnings/tests/test_stt_server.py +++ b/services/earnings/tests/test_stt_server.py @@ -58,3 +58,73 @@ def _fake_transcribe_call(audio_path, **kwargs): assert not any("audio" in k.lower() for k in body) assert captured["audio_path"] == "/tmp/a.wav" assert captured["tier"] == "small" + + +def test_transcribe_derives_streaming_from_env(monkeypatch) -> None: + """The STT SERVICE derives publish_live/project/topic from its env — the deployed + capture->STT trigger posts none of them, so without this the serving /stream + subscriber never gets events for hosted calls (Codex R7-6 P2).""" + import services.earnings.jobs.stt_server as server + + captured: dict = {} + + def _fake_transcribe_call(audio_path, **kwargs): + captured.update(kwargs) + 
return {"ticker": kwargs["ticker"], "call_id": kwargs["call_id"], "segments": 0} + + monkeypatch.setattr(server, "transcribe_call", _fake_transcribe_call) + monkeypatch.setenv("EARNINGS_STREAMING_ENABLED", "1") + monkeypatch.setenv("EARNINGS_STREAMING_PROJECT", "mr-ingest") + monkeypatch.setenv("EARNINGS_STREAMING_TOPIC", "earnings-streaming") + + client = TestClient(server.app) + resp = client.post( + "/transcribe", json={"audio_path": "/tmp/a.wav", "ticker": "GIS", "call_id": "c1"} + ) + assert resp.status_code == 200 + assert captured["publish_live"] is True + assert captured["streaming_project"] == "mr-ingest" + assert captured["streaming_topic"] == "earnings-streaming" + + +def test_transcribe_no_streaming_env_defaults_off(monkeypatch) -> None: + """No streaming env -> publish_live stays off (unchanged local/default behavior).""" + import services.earnings.jobs.stt_server as server + + captured: dict = {} + + def _fake_transcribe_call(audio_path, **kwargs): + captured.update(kwargs) + return {"ticker": "GIS", "call_id": "c1", "segments": 0} + + monkeypatch.setattr(server, "transcribe_call", _fake_transcribe_call) + monkeypatch.delenv("EARNINGS_STREAMING_ENABLED", raising=False) + + client = TestClient(server.app) + resp = client.post( + "/transcribe", json={"audio_path": "/tmp/a.wav", "ticker": "GIS", "call_id": "c1"} + ) + assert resp.status_code == 200 + assert captured["publish_live"] is False + + +def test_transcribe_request_overrides_env_streaming(monkeypatch) -> None: + """An explicit request field still overrides the env-derived default.""" + import services.earnings.jobs.stt_server as server + + captured: dict = {} + + def _fake_transcribe_call(audio_path, **kwargs): + captured.update(kwargs) + return {"ticker": "GIS", "call_id": "c1", "segments": 0} + + monkeypatch.setattr(server, "transcribe_call", _fake_transcribe_call) + monkeypatch.setenv("EARNINGS_STREAMING_ENABLED", "1") + + client = TestClient(server.app) + resp = client.post( + "/transcribe", 
+ json={"audio_path": "/tmp/a.wav", "ticker": "GIS", "call_id": "c1", "publish_live": False}, + ) + assert resp.status_code == 200 + assert captured["publish_live"] is False From d968421cce7c3bc2cdaf4bb61a29c4cefd41ac25 Mon Sep 17 00:00:00 2001 From: helloiamvu Date: Sat, 4 Jul 2026 01:04:22 +0200 Subject: [PATCH 17/17] docs(28): mark STT->rolefact trigger as deferred operator-gated orchestration seam MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex R7-6 flagged that STT does not auto-trigger the role/fact stage, so hosted /facts stays empty until role/fact runs. Documented as an intentional deferral in services/earnings/jobs/stt.py: role/fact needs the per-market TERM SPECS (ROLEFACT_TERMS) which STT does not have — they are market-specific and must be threaded capture->STT->role/fact (or fetched from the markets catalog), and WHO triggers role/fact is the same operator-gated orchestration decision the capture->STT entrypoint already documents. Tracked as a follow-up; the data plane (capture->STT->R2->role/fact->R2->serving) is fully wired, only the auto-trigger orchestration remains operator/scheduler-driven. --- services/earnings/jobs/stt.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/services/earnings/jobs/stt.py b/services/earnings/jobs/stt.py index 327fb0b..a039119 100644 --- a/services/earnings/jobs/stt.py +++ b/services/earnings/jobs/stt.py @@ -345,6 +345,19 @@ def transcribe_call( # co-located operator run. _maybe_upload_transcript_r2(ledger, ticker=ticker, call_id=call_id, r2_bucket=r2_bucket) + # DEFERRED ORCHESTRATION SEAM — STT -> role/fact trigger (Codex R7-6 P1). + # STT completes the TRANSCRIPT stage; the derived-FACT stage (earnings-rolefact + # Cloud Run Job) is NOT auto-triggered here, so hosted /facts stays empty until + # role/fact runs for this call. 
This is intentionally NOT wired in this + # deploy-runtime PR because role/fact needs the per-market TERM SPECS + # (ROLEFACT_TERMS — which strike words to count), which STT does NOT have: the + # terms are market-specific and must be threaded capture -> STT -> role/fact (or + # fetched from the markets catalog at role/fact time). WHO triggers role/fact + # (a scheduler, a transcript-ready Pub/Sub event fanning to a jobs.run, or an + # operator) is the same operator-gated orchestration decision the capture->STT + # entrypoint documents for capture. Tracked as a follow-up; until then role/fact + # is operator/scheduler-driven (env-supplied ticker/call_id/terms). + if publish_live: _maybe_publish_live( result.segments,