diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e5698e..19746b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,22 @@ All notable changes to `mostlyright`. The format follows [Keep a Changelog](http ## [Unreleased] +### Fixed + +- **Earnings venue-rule compound-word under-count (D-30).** Hyphenated compounds + (e.g. `supply-chain` for `chain`) were dropped from BOTH venue counts by an + inverted guard; they now count on both venues per each issuer's actual rule. + Adds the per-occurrence `compound_type` axis to `schema.earnings_fact.v1` + (`standalone`/`open`/`hyphenated`/`closed`/`affix_derivation` — additive, + nullable): closed compounds (`wildfire` for `fire`) are Kalshi-No + + Polymarket human-review candidates, affix derivations (`joyful` for `joy`) + count for neither venue, and an unknown value fails loud in every venue + tally, at the `FactLedger` durable-write boundary, and through the catalog + read boundary. Live-path parity: the streaming classifier emits one delta + per `(term, compound_type)`; its engine-relative timestamp crosses the SSE + wire as `offset_seconds` (never as the wallclock `spoken_at`), and the + stream consumer fail-louds on any naive/numeric temporal field. + ## [1.11.0] — 2026-07-02 — Earnings-mention markets engine + SDK surface (Phase 27) Minor release. Adds the earnings-mention markets vertical (Kalshi diff --git a/packages/core/src/mostlyright/core/schemas/earnings_fact.py b/packages/core/src/mostlyright/core/schemas/earnings_fact.py index 993b643..a479dad 100644 --- a/packages/core/src/mostlyright/core/schemas/earnings_fact.py +++ b/packages/core/src/mostlyright/core/schemas/earnings_fact.py @@ -121,6 +121,32 @@ "regex", ) +#: Compound-type axis (D-30; RESEARCH-MARKETS §2.1(3) "Compound words — REAL +#: CROSS-VENUE DIVERGENCE" + §2.2 "Closed-compound divergence"). This is a +#: SEPARATE per-occurrence axis from :data:`MATCH_RULE_VALUES` — do NOT overload +#: the match-rule enum with these values. A hyphenated compound counts for the +#: bare term on BOTH venues; a closed (unhyphenated) compound (``wildfire`` for +#: ``fire``) diverges: Kalshi No, Polymarket candidate-only in v1. An affix +#: derivation (``joyful`` for ``joy``) counts for NEITHER venue. +COMPOUND_TYPE_VALUES: tuple[str, ...] = ( + "standalone", + "open", + "hyphenated", + "closed", + "affix_derivation", +) + +#: Venue auto-count membership over :data:`COMPOUND_TYPE_VALUES` (D-30). Both +#: venues auto-count standalone/open/hyphenated in v1. ``closed`` is a +#: candidate-only type — it never auto-counts; instead it surfaces for human +#: review when it could flip a Polymarket outcome (fact_builder), and it is +#: excluded from the Kalshi count entirely (Kalshi says No on a closed compound). +#: ``affix_derivation`` is in NEITHER set (counts for no venue). +KALSHI_AUTOCOUNT_COMPOUND_TYPES: frozenset[str] = frozenset({"standalone", "open", "hyphenated"}) +POLYMARKET_AUTOCOUNT_COMPOUND_TYPES: frozenset[str] = frozenset( + {"standalone", "open", "hyphenated"} +) + #: Resolution status (RESEARCH-MARKETS §2.1(6)). ``"provisional"`` (27-10 Task 1) #: marks a LIVE early-signal fact delta — NOT a settlement/backtest source #: (D-27.16); authority is gated on this column + ``source``, never on @@ -320,6 +346,20 @@ class EarningsFactSchema(Schema): enum_values=MATCH_RULE_VALUES, notes="§2.1(3) term.match_rule (plural/possessive OK, NOT tense)", ), + ColumnSpec( + name="compound_type", + dtype="enum", + units=None, + nullable=True, + enum_values=COMPOUND_TYPE_VALUES, + notes=( + "D-30 §2.1(3) per-occurrence compound axis — SEPARATE from " + "term_match_rule (do NOT overload MATCH_RULE_VALUES). standalone/" + "open/hyphenated auto-count on both venues; closed is Polymarket " + "candidate-only + Kalshi-No; affix_derivation counts for neither. " + "Nullable: pre-fix rows omit it (default 'standalone')" + ), + ), ColumnSpec( name="matched_surface_form", dtype="string", @@ -498,11 +538,14 @@ class EarningsFactSchema(Schema): __all__ = [ + "COMPOUND_TYPE_VALUES", "COUNTING_MODE_VALUES", "DELIVERY_VALUES", + "KALSHI_AUTOCOUNT_COMPOUND_TYPES", "KALSHI_COUNTABLE_ROLE_SOURCES", "KALSHI_COUNTABLE_SPEAKER_ROLES", "MATCH_RULE_VALUES", + "POLYMARKET_AUTOCOUNT_COMPOUND_TYPES", "RESOLUTION_STATUS_VALUES", "ROLE_SOURCE_VALUES", "SEGMENT_VALUES", diff --git a/packages/core/tests/test_earnings_fact_schema.py b/packages/core/tests/test_earnings_fact_schema.py index 761c99d..8c8712f 100644 --- a/packages/core/tests/test_earnings_fact_schema.py +++ b/packages/core/tests/test_earnings_fact_schema.py @@ -140,6 +140,54 @@ def test_resolution_status_includes_provisional() -> None: assert "provisional" in BY_NAME["resolution_status"].enum_values +def test_compound_type_axis_and_venue_autocount_sets() -> None: + """D-30: compound_type is a NEW axis (not overloaded onto term_match_rule). + + A hyphenated compound counts for the bare term on both venues; a closed + (unhyphenated) compound diverges — Kalshi No, Polymarket candidate-only in v1. + The compound_type column carries this per-occurrence axis; the venue + auto-count membership sets diverge the settlement filters row-wise exactly as + kalshi_counted does for speaker-scope. + """ + from mostlyright.core.schemas.earnings_fact import ( + COMPOUND_TYPE_VALUES, + KALSHI_AUTOCOUNT_COMPOUND_TYPES, + POLYMARKET_AUTOCOUNT_COMPOUND_TYPES, + ) + + assert COMPOUND_TYPE_VALUES == ( + "standalone", + "open", + "hyphenated", + "closed", + "affix_derivation", + ) + # New axis — NOT overloaded onto the match-rule enum. + assert "closed" not in MATCH_RULE_VALUES + assert "compound_type" not in MATCH_RULE_VALUES + + ct = BY_NAME["compound_type"] + assert ct.dtype == "enum" + assert ct.nullable is True # back-compat: old rows omit it + assert ct.enum_values == COMPOUND_TYPE_VALUES + + # v1 auto-count sets (closed is candidate-only, NOT auto-counted). + expected = frozenset({"standalone", "open", "hyphenated"}) + assert expected == KALSHI_AUTOCOUNT_COMPOUND_TYPES + assert expected == POLYMARKET_AUTOCOUNT_COMPOUND_TYPES + + +def test_compound_type_names_exported() -> None: + import mostlyright.core.schemas.earnings_fact as ef + + for name in ( + "COMPOUND_TYPE_VALUES", + "KALSHI_AUTOCOUNT_COMPOUND_TYPES", + "POLYMARKET_AUTOCOUNT_COMPOUND_TYPES", + ): + assert name in ef.__all__ + + def test_no_columnspec_carries_regex_or_cross_field() -> None: # ColumnSpec only has name/dtype/units/nullable/enum_values/notes — assert no # rogue regex/cross-field attribute leaked onto any spec. diff --git a/packages/weather/src/mostlyright/weather/catalog/earnings.py b/packages/weather/src/mostlyright/weather/catalog/earnings.py index a4af7ca..cd453d4 100644 --- a/packages/weather/src/mostlyright/weather/catalog/earnings.py +++ b/packages/weather/src/mostlyright/weather/catalog/earnings.py @@ -97,6 +97,12 @@ "mention_count", "role_source", "kalshi_counted", + # D-30 compound axis (review R2-F1): stripping this at the read boundary + # laundered closed/affix occurrences into standalone auto-counts on BOTH + # venues (inverting Kalshi's closed-compound No and bypassing the fail-loud + # Polymarket review). Projected only-when-present, so transcript rows and + # pre-fix payloads are unaffected. + "compound_type", ) @@ -427,27 +433,50 @@ def _project_stream_row(event: str, payload: object, stream_seq: int | None) -> return row +def _as_aware_live_timestamp(value: object, *, field: str) -> pd.Timestamp | None: + """Parse one live temporal field to a tz-aware Timestamp (None passes). + + An absent value is a schema shape, not a contract violation. Any PRESENT + value must parse to a tz-aware wallclock: a naive value would mis-order + against the tz-aware as_of cutoff downstream, and a bare number (e.g. an + engine-relative ``spoken_at`` float leaked across the D-30 wire seam — + ``offset_seconds`` is the correct wire field) silently coerces into a + 1970-epoch wallclock. The 27-11 wire contract is tz-aware UTC. + """ + if value is None: + return None + try: + ts = pd.Timestamp(value) + except (TypeError, ValueError) as exc: + raise LiveStreamError( + f"earnings live stream: {field}={value!r} is not parseable as a " + "timestamp — the 27-11 wire contract is tz-aware UTC." + ) from exc + if ts.tz is None: + raise LiveStreamError( + f"earnings live stream: {field}={value!r} must be tz-aware (UTC). " + "A naive/numeric value would coerce into a 1970-epoch wallclock or " + "mis-order against the tz-aware as_of cutoff downstream; an " + "engine-relative offset belongs in offset_seconds, not here." + ) + return ts + + def _assert_live_temporal_contract(spoken_at: object, knowledge_time: object) -> None: """Enforce ``knowledge_time (published_at) >= spoken_at`` for a live row. - A no-op when either value is absent (a transcript segment may carry only one, - and an absent value is a schema shape, not a contract violation). When BOTH - are present, a publish wallclock that predates the aired instant is - impossible and signals a mis-paired feed — raise ``LiveStreamError`` so a - malformed feed fails loud rather than yielding a knowledge_time that - understates availability. + Each PRESENT value must be a tz-aware wallclock (validated independently — + a transcript segment may carry only one, and a fact delta may carry + neither, but a malformed value never passes just because its partner is + absent). When BOTH are present, a publish wallclock that predates the + aired instant is impossible and signals a mis-paired feed — raise + ``LiveStreamError`` so a malformed feed fails loud rather than yielding a + knowledge_time that understates availability. """ - if spoken_at is None or knowledge_time is None: + spoken = _as_aware_live_timestamp(spoken_at, field="spoken_at") + published = _as_aware_live_timestamp(knowledge_time, field="knowledge_time") + if spoken is None or published is None: return - spoken = pd.Timestamp(spoken_at) - published = pd.Timestamp(knowledge_time) - if spoken.tz is None or published.tz is None: - # A naive live timestamp would mis-order against the tz-aware as_of - # cutoff downstream. The 27-11 wire contract is tz-aware UTC. - raise LiveStreamError( - "earnings live stream: spoken_at / knowledge_time must be tz-aware " - f"(UTC); got spoken_at={spoken_at!r}, knowledge_time={knowledge_time!r}." - ) if published < spoken: raise LiveStreamError( "earnings live stream: knowledge_time (publish/STT-finalization " diff --git a/packages/weather/src/mostlyright/weather/earnings/fact_builder.py b/packages/weather/src/mostlyright/weather/earnings/fact_builder.py index b7c2bc2..d6da12f 100644 --- a/packages/weather/src/mostlyright/weather/earnings/fact_builder.py +++ b/packages/weather/src/mostlyright/weather/earnings/fact_builder.py @@ -23,9 +23,13 @@ from __future__ import annotations +from datetime import datetime from typing import TYPE_CHECKING from mostlyright.core.schemas.earnings_fact import ( + COMPOUND_TYPE_VALUES, + KALSHI_AUTOCOUNT_COMPOUND_TYPES, + POLYMARKET_AUTOCOUNT_COMPOUND_TYPES, validate_kalshi_counted_occurrence, ) @@ -44,6 +48,25 @@ _DEFAULT_SEGMENT = "qa" _SOURCE_IDENTITY = "earnings_call" +#: Default compound_type for an occurrence/row that does not carry one — a pre-fix +#: ``stt_counts`` record or a PR #89 persisted row (D-30 back-compat). ``standalone`` +#: counts for BOTH venues, so a legacy row settles exactly as it did before the +#: compound_type axis existed. +_DEFAULT_COMPOUND_TYPE = "standalone" + +#: The closed-compound candidate type (D-30). Never auto-counts on either venue, +#: but a closed candidate that could flip a Polymarket outcome surfaces for human +#: review (see :func:`resolve_status`) rather than settling ``resolved_no``. +_CLOSED_COMPOUND_TYPE = "closed" + +#: The Polymarket resolution status for a closed-candidate straddle (D-30 decision +#: 3). Reuses the existing schema ``RESOLUTION_STATUS_VALUES`` member ``"disputed"`` +#: (human adjudication) — no new schema enum value is invented in this quick fix. +#: A ``disputed`` Polymarket resolution ALWAYS carries a non-zero closed-candidate +#: count that straddles the threshold; a human reviewer adjudicates whether the +#: closed compounds count under Polymarket's "distinct component" wording. +_POLYMARKET_REVIEW_STATUS = "disputed" + def build_fact_rows( stt_counts: Sequence[Mapping[str, object]], @@ -92,6 +115,12 @@ def build_fact_rows( "term_canonical": term, "term_accepted_forms": spec.get("term_accepted_forms", "[]"), "term_match_rule": spec.get("term_match_rule", _DEFAULT_MATCH_RULE), + # Per-occurrence compound axis (D-30). Each classify_mentions record is + # already a SINGLE classified occurrence, so a row is naturally + # one-per-compound_type (never an aggregate mixing types). A pre-fix + # occurrence lacking the key defaults to standalone (counts for both); + # an out-of-enum value fails loud at write time (review F5). + "compound_type": _compound_type(occ), "matched_surface_form": occ.get("matched_surface_form", term), "mention_count": int(occ.get("mention_count", 1)), "counting_mode": spec.get("counting_mode", _DEFAULT_COUNTING_MODE), @@ -106,7 +135,7 @@ def build_fact_rows( "resolution_status": "pending", # kalshi_counted is set by apply_kalshi_filter below — NEVER hand-set. "is_final": occ.get("is_final", True), - "spoken_at": occ.get("spoken_at"), + "spoken_at": _validated_spoken_at(occ), "event_time": event_time, "as_of_time": occ.get("as_of_time"), "source": _SOURCE_IDENTITY, @@ -124,6 +153,42 @@ def _turn_at(turns: Sequence[Turn], index: object) -> Turn | None: return None +def _validated_spoken_at(occ: Mapping[str, object]) -> datetime | None: + """Validate an occurrence's ``spoken_at`` as a tz-aware wallclock (R2-F3). + + The schema's ``spoken_at`` column is ``timestamp_utc``. pyarrow silently + coerces a float (e.g. the streaming engine's stream-relative ``12.5`` + seconds) to microseconds-after-epoch — the ledger would persist + ``1970-01-01 00:00:00.000012+00:00`` with NO error, corrupting the temporal + audit marker on every row. This is the fail-loud write seam, consistent + with the tz-aware enforcement on the SSE projection path + (``catalog/earnings.py::_assert_live_temporal_contract``): + + * ``None`` / absent -> ``None`` (nullable column — the streaming seam maps + its engine-relative float into ``offset_seconds`` instead, see + ``FactDelta.to_stt_count``). + * a tz-AWARE :class:`datetime` (incl. ``pd.Timestamp``) -> passed through. + * anything else (float, int, string, NAIVE datetime) -> :class:`ValueError`. + """ + spoken = occ.get("spoken_at") + if spoken is None: + return None + if not isinstance(spoken, datetime): + raise ValueError( + f"spoken_at must be a tz-aware datetime wallclock or None, got " + f"{type(spoken).__name__} {spoken!r}. An engine-relative float belongs " + "in offset_seconds — writing it to the timestamp_utc spoken_at column " + "silently persists as 1970-01-01 (pyarrow float->epoch coercion)." + ) + if spoken.tzinfo is None: + raise ValueError( + f"spoken_at must be tz-aware (UTC), got naive {spoken!r}. A naive " + "wallclock would mis-order against the tz-aware as_of cutoff " + "downstream (same contract as the SSE projection path)." + ) + return spoken + + def apply_kalshi_filter(rows: Sequence[Mapping[str, object]]) -> list[dict[str, object]]: """Set ``kalshi_counted`` on every row via the fail-closed rule (D-27.11). @@ -147,30 +212,124 @@ def apply_kalshi_filter(rows: Sequence[Mapping[str, object]]) -> list[dict[str, # --------------------------------------------------------------------------- # Venue-rule derivations over the fact rows # --------------------------------------------------------------------------- +def _compound_type(row: Mapping[str, object]) -> str: + """Return a row's validated ``compound_type``, defaulting to ``standalone``. + + A pre-fix row (or a PR #89 persisted row) lacks the key or carries ``None`` + — it defaults to ``standalone`` so it counts for BOTH venues exactly as + before the axis existed (D-30 back-compat). + + An UNKNOWN non-null value raises :class:`ValueError` (review F5): a typo'd + value (``"Standalone"``) would otherwise VANISH from every tally — not + auto-counted, not a closed candidate — silently resolving a true-Yes market + ``resolved_no``. Fail loud instead, consistent with the fail-loud ethos of + ``count_mentions`` / ``apply_kalshi_filter``. + """ + ct = row.get("compound_type") + # Missing / None / empty -> default. ``ct != ct`` catches float NaN: a MIXED + # old/new frame round-tripped through pandas (EarningsAdapter.from_rows -> + # to_dict("records")) fills NaN for a pre-fix row's absent compound_type — + # that is "missing" (default standalone), NOT an out-of-enum value (R2-F1). + if ct is None or ct == "" or ct != ct: + return _DEFAULT_COMPOUND_TYPE + ct_str = str(ct) + if ct_str not in COMPOUND_TYPE_VALUES: + raise ValueError( + f"unknown compound_type {ct_str!r}; expected one of " + f"{sorted(COMPOUND_TYPE_VALUES)} (or None/missing -> " + f"{_DEFAULT_COMPOUND_TYPE!r}). An out-of-enum value would silently " + "vanish from every venue tally and mis-settle the market." + ) + return ct_str + + def kalshi_boolean_settles(rows: Sequence[Mapping[str, object]]) -> bool: """Boolean Kalshi settlement: ``sum(mention_count | kalshi_counted) >= 1``. - Only the Kalshi-counted (transcript-anchored company-rep) occurrences - contribute; analyst / un-anchorable occurrences never flip a boolean Kalshi - market (D-27.11 fail-closed). + Only the Kalshi-counted (transcript-anchored company-rep) occurrences whose + ``compound_type`` is auto-countable on Kalshi + (``standalone``/``open``/``hyphenated`` — :data:`KALSHI_AUTOCOUNT_COMPOUND_TYPES`) + contribute. A ``closed`` compound (``wildfire`` for ``fire``) is Kalshi-No — it + NEVER flips a boolean Kalshi market — and an ``affix_derivation`` counts for + neither venue. This is in ADDITION to the D-27.11 speaker-scope + ``kalshi_counted`` gate (analyst / un-anchorable occurrences already excluded). """ - return sum(int(r.get("mention_count", 0)) for r in rows if r.get("kalshi_counted")) >= 1 + return ( + sum( + int(r.get("mention_count", 0)) + for r in rows + if r.get("kalshi_counted") and _compound_type(r) in KALSHI_AUTOCOUNT_COMPOUND_TYPES + ) + >= 1 + ) def polymarket_count(rows: Sequence[Mapping[str, object]]) -> int: - """Polymarket any-speaker tally: ``sum(mention_count)`` over ALL rows. + """Polymarket any-speaker AUTO tally over auto-countable compound types. Every occurrence counts regardless of ``kalshi_counted`` — Polymarket's rule - is speaker-blind (§2.1(4)). + is speaker-blind (§2.1(4)) — BUT only ``compound_type`` in + :data:`POLYMARKET_AUTOCOUNT_COMPOUND_TYPES` (``standalone``/``open``/ + ``hyphenated``) auto-counts in v1. A ``closed`` compound is a candidate-only + type (surfaced for human review by :func:`resolve_status`, NOT auto-counted); + an ``affix_derivation`` counts for neither venue (D-30 decision 3/4). """ - return sum(int(r.get("mention_count", 0)) for r in rows) + return sum( + int(r.get("mention_count", 0)) + for r in rows + if _compound_type(r) in POLYMARKET_AUTOCOUNT_COMPOUND_TYPES + ) + + +def closed_candidate_count(rows: Sequence[Mapping[str, object]]) -> int: + """Sum of ``mention_count`` over ``closed``-compound candidate rows (D-30). + + These are NOT auto-counted for Polymarket, but they COULD flip an outcome + under Polymarket's "distinct component" wording — :func:`resolve_status` + surfaces them for human review rather than settling ``resolved_no`` when they + straddle the threshold. + """ + return sum( + int(r.get("mention_count", 0)) for r in rows if _compound_type(r) == _CLOSED_COMPOUND_TYPE + ) def polymarket_threshold_met(rows: Sequence[Mapping[str, object]], threshold_n: int) -> bool: - """Polymarket ``"say X N+ times"`` bracket: any-speaker count ``>= threshold_n``.""" + """Polymarket ``"say X N+ times"`` bracket: AUTO count ``>= threshold_n``. + + Uses the auto-countable :func:`polymarket_count` (closed candidates excluded). + A closed-candidate straddle is handled by :func:`resolve_status` (fail-loud), + not here. + """ return polymarket_count(rows) >= threshold_n +def resolve_polymarket_status( + rows: Sequence[Mapping[str, object]], + threshold_n: int, +) -> str: + """Fail-loud Polymarket resolution over the compound_type axis (D-30 decision 3). + + * ``auto >= threshold`` -> ``"resolved_yes"`` (closed candidates irrelevant). + * ``auto < threshold`` but ``auto + closed >= threshold`` -> the closed + candidates COULD flip the outcome, so return the review status + (:data:`_POLYMARKET_REVIEW_STATUS` == ``"disputed"``) — NEVER a silent + ``"resolved_no"``. A human adjudicates whether the closed compounds count. + * ``auto + closed < threshold`` -> ``"resolved_no"`` (candidates cannot change + the outcome, so no review is needed). + + ``auto`` is :func:`polymarket_count` (standalone/open/hyphenated); + ``affix_derivation`` rows count for neither venue and are ignored here. + """ + auto = polymarket_count(rows) + if auto >= threshold_n: + return "resolved_yes" + closed = closed_candidate_count(rows) + if auto + closed >= threshold_n: + return _POLYMARKET_REVIEW_STATUS + return "resolved_no" + + def resolve_status( rows: Sequence[Mapping[str, object]], *, @@ -179,10 +338,15 @@ def resolve_status( ) -> str: """Return a ``resolution_status`` for the venue given the counted rows. - ``venue`` is ``"kalshi"`` (boolean ``>=1`` on Kalshi-counted rows) or - ``"polymarket"`` (any-speaker count vs ``threshold_n``). Returns - ``"resolved_yes"`` / ``"resolved_no"`` per RESEARCH-MARKETS §2.1(6). A + ``venue`` is ``"kalshi"`` (boolean ``>=1`` on Kalshi-counted, auto-countable + compound-type rows) or ``"polymarket"`` (delegates to + :func:`resolve_polymarket_status` — fail-loud on closed candidates). Returns + ``"resolved_yes"`` / ``"resolved_no"`` per RESEARCH-MARKETS §2.1(6), plus + ``"disputed"`` for a Polymarket closed-candidate straddle (D-30 decision 3). A ``no_qualifying_event`` (no rows at all) surfaces as ``"no_qualifying_event"``. + + Kalshi resolution is unchanged except the compound-type restriction (a + ``closed``-only mention still resolves Kalshi-No, which is correct). """ if not rows: return "no_qualifying_event" @@ -190,15 +354,17 @@ def resolve_status( return "resolved_yes" if kalshi_boolean_settles(rows) else "resolved_no" if venue == "polymarket": n = threshold_n if threshold_n is not None else 1 - return "resolved_yes" if polymarket_threshold_met(rows, n) else "resolved_no" + return resolve_polymarket_status(rows, n) raise ValueError(f"unknown venue {venue!r}; expected 'kalshi' or 'polymarket'") __all__ = [ "apply_kalshi_filter", "build_fact_rows", + "closed_candidate_count", "kalshi_boolean_settles", "polymarket_count", "polymarket_threshold_met", + "resolve_polymarket_status", "resolve_status", ] diff --git a/packages/weather/src/mostlyright/weather/earnings/ledger.py b/packages/weather/src/mostlyright/weather/earnings/ledger.py index da463e0..fb7a969 100644 --- a/packages/weather/src/mostlyright/weather/earnings/ledger.py +++ b/packages/weather/src/mostlyright/weather/earnings/ledger.py @@ -37,7 +37,9 @@ from filelock import FileLock from mostlyright._internal._bounds import assert_path_under from mostlyright._internal._cache_dir import resolve_cache_root_without_v1 +from mostlyright.core.exceptions import EarningsFactCorruptError from mostlyright.core.schemas.earnings_fact import ( + COMPOUND_TYPE_VALUES, EarningsFactSchema, validate_kalshi_counted_rows, ) @@ -254,7 +256,23 @@ def _validate_normalized(self, rows: Sequence[Mapping[str, object]]) -> None: (``KalshiCountRuleViolation`` for a fabricated ``True``, ``EarningsFactCorruptError`` for an under-counted ``False``) BEFORE the parquet write — a corrupt count can never reach disk. + + ``compound_type`` (D-30) is re-validated here too: it is the venue-count + axis, and an out-of-enum value silently vanishes from every venue tally + downstream (``fact_builder`` fail-louds at tally time, but ``/facts`` + serves stored rows directly). ``None`` passes — pre-fix rows omit the + nullable column (default-standalone semantics). """ + for idx, row in enumerate(rows): + compound_type = row.get("compound_type") + if compound_type is not None and compound_type not in COMPOUND_TYPE_VALUES: + raise EarningsFactCorruptError( + f"fact row {idx}: compound_type={compound_type!r} is not a " + f"canonical value {sorted(COMPOUND_TYPE_VALUES)} (or None for " + "a pre-fix row). An out-of-enum value would vanish from every " + "venue tally and mis-settle the market (D-30) — refusing the " + "durable write." + ) validate_kalshi_counted_rows(rows, strict=True) diff --git a/packages/weather/src/mostlyright/weather/earnings/streaming_transcriber.py b/packages/weather/src/mostlyright/weather/earnings/streaming_transcriber.py index 61c84d0..eb634ab 100644 --- a/packages/weather/src/mostlyright/weather/earnings/streaming_transcriber.py +++ b/packages/weather/src/mostlyright/weather/earnings/streaming_transcriber.py @@ -26,11 +26,16 @@ 3. **partial → final.** An in-flight window emits a *revisable* PARTIAL ``Segment`` (``is_final=False``). On the speech-end boundary the segment is re-emitted FINAL (``is_final=True``) with the stabilised text. -4. **final-only counting.** :func:`~mostlyright.weather.earnings.stt.count_mentions` - runs over FINAL segments ONLY → :class:`FactDelta`s - (``resolution_status="provisional"``, ``is_final=True``, +4. **final-only counting.** :func:`~mostlyright.weather.earnings.stt.classify_mentions` + runs over FINAL segments ONLY → :class:`FactDelta`s aggregated per + ``(term, compound_type)`` (``resolution_status="provisional"``, + ``is_final=True``, ``kalshi_counted=validate_kalshi_counted_occurrence(...)``). A term in a PARTIAL is NEVER counted (else a retracted word settles a market — T-27-35). + classify_mentions (NOT count_mentions) is the production counter (D-30, + review F1): it additionally surfaces CLOSED-compound candidates + (``firefighter`` for ``fire``) that count_mentions never matches — without + them a Polymarket closed-straddle silently resolved ``resolved_no``. **Authority (D-27.16, codex P2).** ``is_final`` denotes STT-segment finality ONLY; a counted fact delta is ``is_final=True`` yet STILL @@ -63,7 +68,7 @@ validate_kalshi_counted_occurrence, ) -from .stt import count_mentions, seed_initial_prompt +from .stt import classify_mentions, seed_initial_prompt if TYPE_CHECKING: from collections.abc import Mapping, Sequence @@ -127,6 +132,13 @@ class FactDelta: finality — orthogonal to authority), ``kalshi_counted`` derived via the unchanged fail-closed :func:`validate_kalshi_counted_occurrence` (D-27.11, mode-agnostic). TEXT/facts only — no audio (D-27.9). + + ``compound_type`` (D-30, review F1) is the per-delta compound axis — one + delta per ``(term, compound_type)``, never an aggregate mixing types, so the + venue filters in ``fact_builder`` apply row-wise (closed candidates surface + for Polymarket human review; Kalshi-No on closed). ADDITIVE with default + ``"standalone"``: an existing SSE consumer or a persisted pre-fix delta is + unaffected (standalone counts for both venues exactly as before the axis). """ term_canonical: str @@ -139,9 +151,44 @@ class FactDelta: is_final: bool spoken_at: float stream_seq: int + compound_type: str = "standalone" resolution_status: str = "provisional" source: str = _SOURCE_IDENTITY + def to_stt_count(self, *, turn_index: int | None = None) -> dict[str, object]: + """Map this delta to a ``build_fact_rows`` ``stt_counts`` occurrence record. + + The propagation seam (review F1): ``compound_type`` survives from the + live classifier through the occurrence record into the fact row, so the + venue filters + the fail-loud closed-candidate resolution operate on the + SAME axis end-to-end. ``turn_index`` (when known) links the occurrence + to the role-parser turns list so ``build_fact_rows`` re-derives the + speaker scope from the authoritative turn. + + TEMPORAL MAPPING (review R2-F3): this delta's ``spoken_at`` is an + ENGINE-RELATIVE float (seconds into the stream, e.g. ``12.5``), NOT a + wallclock. It maps into ``offset_seconds`` — the schema's + engine-relative int audit field, which ``build_fact_rows`` already + accepts. It is NEVER emitted as the occurrence's ``spoken_at``: that + schema column is ``timestamp_utc`` and pyarrow silently coerces a float + to microseconds-after-epoch, persisting ``1970-01-01 + 00:00:00.000012+00:00`` as the temporal audit marker. ``spoken_at`` is + left ABSENT (the column is nullable); a caller holding a genuine + tz-aware wallclock sets it on the occurrence record explicitly — + ``build_fact_rows`` fail-louds on any non-tz-aware-datetime value. + """ + occ: dict[str, object] = { + "term": self.term_canonical, + "matched_surface_form": self.matched_surface_form, + "mention_count": self.mention_count, + "compound_type": self.compound_type, + "is_final": self.is_final, + "offset_seconds": int(self.spoken_at), + } + if turn_index is not None: + occ["turn_index"] = turn_index + return occ + def _rms(pcm: bytes) -> float: """Root-mean-square amplitude of s16le PCM (energy VAD signal).""" @@ -352,10 +399,20 @@ def _emit_segment(self, *, text: str, spoken_at: float, is_final: bool) -> Segme # -- final-only counting --------------------------------------------- def _count_final(self, segment: Segment) -> list[FactDelta]: - """Run the 27-03 counter over a FINAL segment → provisional fact deltas. - - Partials NEVER reach here (D-27.14). Each counted term becomes a - provisional delta carrying the fail-closed ``kalshi_counted`` flag. + """Run the 27-03 classifier over a FINAL segment → provisional fact deltas. + + Partials NEVER reach here (D-27.14). :func:`classify_mentions` (NOT + ``count_mentions``) is the production counter (D-30, review F1): each + occurrence carries a ``compound_type``, and occurrences are aggregated + into ONE delta per ``(term, compound_type)`` — a closed compound + (``firefighter`` for ``fire``) now produces a closed-candidate delta + that the fact_builder resolution surfaces for Polymarket human review + instead of silently resolving ``resolved_no``. The word-boundary + occurrences (standalone/open/hyphenated) tally IDENTICALLY to the old + ``count_mentions`` path, so existing consumers see the same counts, just + split per compound_type. Each delta carries the fail-closed + ``kalshi_counted`` flag (speaker-scope axis, unchanged — the + compound-type venue filtering is fact_builder's job). """ if not segment.is_final or not self._market_terms: return [] @@ -371,23 +428,31 @@ def _count_final(self, segment: Segment) -> list[FactDelta]: if not term: continue match_rule = str(spec.get("term_match_rule", _DEFAULT_MATCH_RULE)) - count, surfaces = count_mentions(segment.text, term, match_rule=match_rule) - if count <= 0: + records = classify_mentions(segment.text, term, match_rule=match_rule) + if not records: continue - deltas.append( - FactDelta( - term_canonical=term, - matched_surface_form=surfaces[0] if surfaces else term, - mention_count=count, - speaker_role=speaker_role, - role_source=role_source, - speaker_name=speaker_name, - kalshi_counted=kalshi_counted, - is_final=True, - spoken_at=segment.spoken_at, - stream_seq=segment.stream_seq, + # Aggregate per compound_type (insertion order = first occurrence + # order, deterministic): one delta per (term, compound_type) so the + # venue filters apply row-wise downstream — never mixed in one delta. + by_type: dict[str, list[dict[str, object]]] = {} + for rec in records: + by_type.setdefault(str(rec["compound_type"]), []).append(rec) + for compound_type, recs in by_type.items(): + deltas.append( + FactDelta( + term_canonical=term, + matched_surface_form=str(recs[0].get("surface", term)), + mention_count=len(recs), + speaker_role=speaker_role, + role_source=role_source, + speaker_name=speaker_name, + kalshi_counted=kalshi_counted, + is_final=True, + spoken_at=segment.spoken_at, + stream_seq=segment.stream_seq, + compound_type=compound_type, + ) ) - ) return deltas diff --git a/packages/weather/src/mostlyright/weather/earnings/stt.py b/packages/weather/src/mostlyright/weather/earnings/stt.py index 523f8c2..f47d822 100644 --- a/packages/weather/src/mostlyright/weather/earnings/stt.py +++ b/packages/weather/src/mostlyright/weather/earnings/stt.py @@ -524,17 +524,31 @@ def _form_to_pattern(form: str, *, case_sensitive: bool = False) -> re.Pattern[s so a form ending in an apostrophe (``companies'``, ``tariff's``) still anchors correctly — a literal trailing ``\\b`` after ``'`` never matches (``'`` is a non-word char), which silently killed every plural-possessive. - * Hyphenated-compound rule (LOCKED): a bare-word match must NOT be part of a - hyphenated compound — ``pre-tariff`` / ``anti-tariff`` are DISTINCT compounds, - not the bare word ``tariff`` (like a tense inflection, they are excluded per - the Kalshi exact/plural/possessive rule). The extra lookarounds ``(? appears separated from other words by a space or hyphen or in + a compound form with a space or hyphen, it counts." + - Polymarket Event+Mentions DeFi: + https://polymarket-upload.s3.us-east-2.amazonaws.com/market_products/Event+Mentions+Contract+DeFi.pdf + — agrees: hyphenated compounds count for the bare word. + + An earlier version added inverted lookarounds ``(? re.Pattern[s parts.append(sub) body = r"\s+".join(parts) flags = 0 if case_sensitive else re.IGNORECASE - # Hyphenated-compound guard: only for forms that do NOT themselves contain a - # hyphen (a hyphenated product code like ``GLP-1`` opts out so it still - # matches). ``(? list[_Form]: + """Validate ``(term, match_rule)`` and expand to the matchable surface forms. + + The SHARED fail-loud form-prep path for BOTH :func:`count_mentions` and + :func:`classify_mentions` (review F2): a typo'd ``match_rule`` must not + silently fall through to ``exact`` semantics (dropping the plural/possessive + forms -> settlement under-count), and a degenerate term (empty / a bare + synonym separator / lone punctuation) must not silently match nothing (a + market would settle "not mentioned" on a config bug). Raises + :class:`ValueError` for every such case; returns the expanded + :class:`_Form` list otherwise. + """ + if match_rule not in _VALID_MATCH_RULES: + raise ValueError( + f"unknown match_rule {match_rule!r}; expected one of {sorted(_VALID_MATCH_RULES)}" + ) + if not term or not term.strip(): + raise ValueError( + "mention matching requires a non-empty `term`; an empty term would " + "silently count 0 for every transcript (a settlement would settle " + "'not mentioned' on a config error)." + ) + forms = _match_forms_for_term(term, match_rule, asr_misrenders=asr_misrenders) + if not forms: + # A term that is truthy-but-degenerate (only a synonym separator, e.g. + # ``" / "``) explodes to zero surface forms and would silently count 0 for + # EVERY transcript — the same silent-settle-"not mentioned" hazard as an + # empty term. Fail loud instead. + raise ValueError( + f"term {term!r} expands to no surface forms (a bare synonym separator); " + "it would silently count 0 for every transcript. Provide a real target " + "term." + ) + if not any(re.search(r"\w", form.surface) for form in forms): + # A bare single-punctuation term (``"/"``, ``"."``, ``"-"``, ``"&"``) does + # NOT explode to zero forms — ``_explode_slash_forms`` only splits a + # whitespace-flanked slash, so ``"/"`` survives as its own one-element form. + # But every such form is word-character-free, and ``_form_to_pattern`` anchors + # each with ``(? str: + """Return the immediately space-adjacent alphabetic word (lowercased), or ``""``. + + ``backward=True`` reads the word ending just before ``text`` (a right-stripped + left context); otherwise the word starting just after ``text`` (a left-stripped + right context). Only a SINGLE space of separation counts as adjacency — a + sentence boundary (punctuation) breaks it. + """ + if backward: + if not text.endswith(" "): + return "" + tail = text.rstrip() + m = re.search(r"([A-Za-z]+)$", tail) + return m.group(1).lower() if m and tail.endswith(m.group(1)) else "" + if not text.startswith(" "): + return "" + head = text.lstrip() + m = re.match(r"([A-Za-z]+)", head) + return m.group(1).lower() if m else "" + + +def _classify_span(transcript: str, start: int, end: int, surface: str) -> str: + """Classify a WORD-BOUNDARY-anchored occurrence: standalone/open/hyphenated. + + ``surface`` is the matched span (``start``:``end`` in ``transcript``). A + match adjacent to ``hyphen+word`` on EITHER side is ``hyphenated`` + (``pre-tariff`` / ``tariff-based`` / ``New York-based``). A match whose + space-adjacent partner word is a curated open-compound partner + (:data:`_OPEN_COMPOUND_PARTNERS` — ``fire station``, ``supply chain``) is + ``open``. Otherwise the bare word (including a possessive ``tariff's`` or a + word in ordinary prose ``tariff here``) is ``standalone``. + + NOTE: both venues auto-count ``standalone`` AND ``open`` identically, so an + open-vs-standalone mistag never changes a settlement; the conservative default + to ``standalone`` is safe. + """ + prev_ch = transcript[start - 1] if start > 0 else "" + next_ch = transcript[end] if end < len(transcript) else "" + # Hyphenated: a hyphen immediately abuts the match on either side and the far + # side of that hyphen is a word char (``pre-tariff`` / ``tariff-based``). + if prev_ch == "-" and start - 2 >= 0 and transcript[start - 2].isalnum(): + return "hyphenated" + if next_ch == "-" and end + 1 < len(transcript) and transcript[end + 1].isalnum(): + return "hyphenated" + # Open compound: a curated space-adjacent partner word on either side. + right = _adjacent_word(transcript[end:], backward=False) + if right in _OPEN_COMPOUND_PARTNERS: + return "open" + left = _adjacent_word(transcript[:start], backward=True) + if left in _OPEN_COMPOUND_PARTNERS: + return "open" + return "standalone" + + +def _closed_or_affix(longer_word: str, term_lower: str, at: int) -> str: + """Classify a substring hit of ``term_lower`` fused inside a longer word component. + + ``longer_word`` is a hyphen-free scan unit: a whole unhyphenated token + (``wildfire``) or ONE hyphen-separated component of a hyphenated token + (the ``wildfire`` of ``wildfire-related`` — R3-F1). + + ``at`` is the offset of the term inside ``longer_word`` (both lowercased). + Returns ``"closed"`` when the term stays a DISTINCT component (``wildfire`` / + ``firefighter`` / ``killjoy`` / ``oversupply`` / ``prepayment``) or + ``"affix_derivation"`` when the TRAILING characters form a curated + grammatical suffix that alters the root (``joyful`` / ``running``). D-30 + decision 4: CURATED stdlib heuristic, no dictionary dep; ambiguity resolves + CONSERVATIVELY to ``"closed"`` (a candidate that surfaces to the human + reviewer) rather than a silent drop. + + Review F4: prefix residuals NEVER classify as ``affix_derivation`` — the + earlier prefix branch silently dropped genuine closed compounds + (``oversupply`` / ``prepayment`` / ``underdog``) from the candidate count, + letting a Polymarket straddle resolve ``resolved_no`` with no human review. + Only the suffix branch below may return ``affix_derivation``. + """ + before = longer_word[:at] + after = longer_word[at + len(term_lower) :] + # Pure suffix derivation: nothing before, a curated derivational suffix after + # (``joy`` + ``ful`` -> joyful; ``run`` + ``ning`` -> running, doubled). + if not before and after: + norm_after = after + # Tolerate a doubled final consonant of the root before ``-ing/-ed`` etc. + # (``run`` -> ``running`` has an extra leading ``n``). + if norm_after[:1] == term_lower[-1:] and len(norm_after) > 1: + norm_after = norm_after[1:] + if any(norm_after == suf or after == suf for suf in _DERIVATIONAL_SUFFIXES): + return "affix_derivation" + # Everything else where the term stays a distinct component -> closed + # candidate (``wildfire``: wild+fire; ``firefighter``: fire+fighter; + # ``killjoy``: kill+joy; ``oversupply``: over+supply — prefix residuals are + # ALWAYS closed, F4). Ambiguous cases land here CONSERVATIVELY. + return "closed" + + +#: A whole-word token for the closed-compound substring pass. Interior chars may +#: include the apostrophe variants (:data:`_APOSTROPHES`) and hyphens so a +#: possessive / hyphenated word is one token. Built off ``_APOSTROPHES`` so the +#: ambiguous-glyph noqa lives in ONE place. +_WORD_RE = re.compile(rf"[A-Za-z][A-Za-z{_APOSTROPHES}-]*[A-Za-z]|[A-Za-z]") + + +def classify_mentions( + transcript: str, + term: str, + *, + match_rule: str = "plural_possessive_ok_no_tense", + asr_misrenders: bool = False, +) -> list[dict[str, Any]]: + """Classify EACH occurrence of ``term`` in ``transcript`` by compound type. + + A SIBLING of :func:`count_mentions` (which is unchanged): returns one record + per occurrence, each ``{"surface", "start", "compound_type"}`` where + ``compound_type`` is one of ``standalone`` / ``open`` / ``hyphenated`` / + ``closed`` / ``affix_derivation`` (D-30 §2.1(3) cross-venue compound + divergence). It reuses the SAME form expansion + apostrophe + plural/ + possessive machinery as :func:`count_mentions` (``_match_forms_for_term`` + + ``_form_to_pattern``) — never bare exact equality — so a possessive + (``tariff's``) is a standalone occurrence, not a miss. + + The word-boundary pass finds ``standalone`` / ``open`` / ``hyphenated`` + occurrences exactly as ``count_mentions`` counts them (post-D-30 the hyphen + guard is gone, so ``pre-tariff`` is a real occurrence tagged ``hyphenated``). + An ADDITIONAL substring pass finds ``closed`` candidates — a surface form + FUSED inside a longer word component where the term stays a distinct part + (``wildfire``, ``killjoy``, and the ``wildfire`` component of + ``wildfire-related`` — R3-F1: hyphenated tokens are split and each component + scanned) — and separates true CLOSED compounds from ``affix_derivation`` + roots (``joyful``, incl. inside hyphenated tokens: ``joyful-sounding``) via + a curated stdlib suffix heuristic (:func:`_closed_or_affix`; no dictionary + dep — D-30 decision 4). The heuristic is CONSERVATIVE: an ambiguous case is + a ``closed`` candidate that surfaces to a human reviewer, NEVER a silent + drop. + + Overlap handling matches ``count_mentions``: longest-first, and a span is + classified once (the word-boundary occurrences win; a closed substring pass + skips any span already covered so ``pre-tariff`` is not double-counted as both + hyphenated and closed). + + Raises :class:`ValueError` for an unrecognized ``match_rule`` or a degenerate + ``term`` — IDENTICALLY to :func:`count_mentions` (review F2, shared + :func:`_validated_forms_for_term` path): silently returning ``[]`` would + settle "not mentioned" on a config bug. + """ + forms = _validated_forms_for_term(term, match_rule, asr_misrenders=asr_misrenders) + forms_sorted = sorted( + set(forms), key=lambda f: (len(f.surface), f.surface, f.case_sensitive), reverse=True + ) + + records: list[dict[str, Any]] = [] + covered: list[tuple[int, int]] = [] + + def _overlaps(s: int, e: int) -> bool: + return any(s < ce and cs < e for cs, ce in covered) + + # Pass 1: word-boundary occurrences (standalone / open / hyphenated) — the + # same matcher count_mentions uses. + for form in forms_sorted: + pattern = _form_to_pattern(form.surface, case_sensitive=form.case_sensitive) + for m in pattern.finditer(transcript): + s, e = m.start(), m.end() + if _overlaps(s, e): + continue + covered.append((s, e)) + records.append( + { + "surface": m.group(0), + "start": s, + "compound_type": _classify_span(transcript, s, e, m.group(0)), + } + ) + + # Pass 2: closed-compound candidates — a surface form appearing FUSED as a + # substring inside a longer word component (never matched by the + # word-boundary pass). The scan unit is the hyphen-separated COMPONENT + # (R3-F1): an unhyphenated token is one component; a hyphenated token + # (``wildfire-related``) is split and each component scanned — pass 1 only + # covers the term as a DISTINCT hyphen-separated element (``fire-related``), + # not fused inside a component (``wildfire``-related). Only single-token + # alphabetic forms are scanned (a multi-word phrase is not a closed-compound + # component in prose). The guard mirroring pass 1's over-count protection + # (review F3 -> R2-F2): + # * a CASE-SENSITIVE form — an ACRONYM (``AI`` / ``OCI``) or a Capitalized / + # mixed-case token (``Block`` / ``Chewy`` / ``iPhone``) — scans + # case-sensitively. Pass 1 refuses the verb ``block`` for the ticker + # ``Block`` and the lowercase ``oci`` for the acronym ``OCI``, so the + # substring pass must not bypass those guards case-blind: ``blockchain`` is + # NOT a candidate for ``Block``, ``social`` NOT for ``OCI``, ``said`` NOT + # for ``AI``. A CASE-PRESERVED substring IS a conservative closed CANDIDATE + # (``GenAI`` / ``OpenAI`` for ``AI``, ``Blockchain`` for ``Block``) — human + # review adjudicates; candidates never auto-count, so this is review + # surface, not a tally. Acronyms are NOT excluded entirely (round-1's + # overcorrection, R2-F2): full exclusion silently dropped genuine acronym + # compounds — ``GenAI`` for term ``AI`` produced ZERO rows and a Polymarket + # threshold-1 market resolved resolved_no with no review. + scanned: set[tuple[str, bool]] = set() + for form in forms_sorted: + surface = form.surface + if " " in surface or "-" in surface or not surface.isalpha(): + continue + key = (surface if form.case_sensitive else surface.lower(), form.case_sensitive) + if key in scanned: + continue + scanned.add(key) + low_surface = surface.lower() + for wm in _WORD_RE.finditer(transcript): + word = wm.group(0) + # A hyphenated token is NOT skipped (review R3-F1): pass 1 only + # matches the term as a DISTINCT hyphen-separated element + # (``fire-related``); it cannot match ``fire`` FUSED inside the + # component ``wildfire`` of ``wildfire-related`` — skipping the + # whole token silently dropped that closed candidate (auto=0, + # closed=0 -> resolved_no instead of disputed). Split the token + # into its hyphen-separated components and scan EACH exactly like + # an unhyphenated word; a component that exactly equals the surface + # (under the form's case rule) is pass 1's boundary-match territory + # and is skipped explicitly (the covered-span check would also + # catch it — both guards are exercised by tests). + comp_offset = 0 + for comp in word.split("-"): + comp_start_in_word = comp_offset + comp_offset += len(comp) + 1 # advance past the component + "-" + if not comp: + continue + if form.case_sensitive: + if comp == surface: + continue # exact element -> pass 1 (matched or refused) + idx = comp.find(surface) # case-preserved substring (F3/R2-F2) + else: + low_comp = comp.lower() + if low_comp == low_surface: + continue # exact element -> pass 1 already handled it + idx = low_comp.find(low_surface) + if idx < 0: + continue + # Absolute offsets account for the component's position inside + # the token so covered-span bookkeeping stays correct (R3-F1). + abs_start = wm.start() + comp_start_in_word + idx + abs_end = abs_start + len(low_surface) + if _overlaps(abs_start, abs_end): + continue + covered.append((abs_start, abs_end)) + records.append( + { + "surface": comp[idx : idx + len(low_surface)], + "start": abs_start, + # Classification runs on the COMPONENT, not the whole + # token (``joyful-sounding`` for ``joy`` -> the + # component ``joyful`` -> affix_derivation). + # _closed_or_affix works on the lowercase-normalized + # component; lowercasing preserves offsets so ``idx`` + # is valid for both. + "compound_type": _closed_or_affix(comp.lower(), low_surface, idx), + } + ) + + records.sort(key=lambda r: r["start"]) + return records + + # --------------------------------------------------------------------------- # Transcriber # --------------------------------------------------------------------------- @@ -782,6 +1180,7 @@ def transcribe( __all__ = [ "SttTranscriber", "TranscriptResult", + "classify_mentions", "count_mentions", "seed_initial_prompt", ] diff --git a/packages/weather/tests/catalog/test_earnings_adapter.py b/packages/weather/tests/catalog/test_earnings_adapter.py index d10ab11..c729e1d 100644 --- a/packages/weather/tests/catalog/test_earnings_adapter.py +++ b/packages/weather/tests/catalog/test_earnings_adapter.py @@ -87,6 +87,83 @@ def _fact_rows() -> list[dict[str, object]]: ] +def _closed_fact_row() -> dict[str, object]: + """A fact row whose only occurrence is a CLOSED-compound candidate (D-30).""" + return { + "ticker": "STZ", + "call_id": "STZ-2026Q1", + "term_canonical": "fire", + "matched_surface_form": "fire", + "mention_count": 1, + "speaker_role": "company_executive", + "role_source": "transcript_structural", + "segment": "qa", + "kalshi_counted": True, + "compound_type": "closed", + "event_time": "2026-07-01T12:10:00Z", + "transcript_available_at": "2026-07-01T14:30:00Z", + } + + +def test_compound_type_survives_from_rows_projection(): + """compound_type crosses the canonical read boundary intact (review R2-F1). + + _PASSTHROUGH_FIELDS used to strip compound_type at _project_row, so a + closed-only 'firefighter' row laundered into a standalone auto-count after + EarningsAdapter.from_rows — resolving resolved_yes on BOTH venues and + inverting Kalshi's closed-compound No. The closed row must stay closed + through the projection and still resolve Kalshi no / Polymarket disputed. + """ + from mostlyright.weather.earnings.fact_builder import resolve_status + + df = EarningsAdapter.from_rows( + [_closed_fact_row()], source="earnings.hosted", retrieved_at=_RETRIEVED_AT + ) + assert "compound_type" in df.columns + assert df["compound_type"].iloc[0] == "closed" + + rows = df.to_dict("records") + # Kalshi: closed compounds are Kalshi-No — never resolved_yes. + assert resolve_status(rows, venue="kalshi") == "resolved_no" + # Polymarket: the closed candidate straddles threshold 1 -> disputed + # (human review), NEVER a laundered resolved_yes or silent resolved_no. + assert resolve_status(rows, venue="polymarket", threshold_n=1) == "disputed" + + +def test_prefix_payload_without_compound_type_projects_cleanly(): + """A pre-fix payload lacking compound_type still projects (back-compat, R2-F1).""" + # Fact rows without the field: no compound_type column materializes. + df = EarningsAdapter.from_rows( + _fact_rows(), source="earnings.hosted", retrieved_at=_RETRIEVED_AT + ) + assert "compound_type" not in df.columns + # Transcript rows are unaffected too. + df2 = EarningsAdapter.from_rows( + _transcript_rows(), source="earnings.hosted", retrieved_at=_RETRIEVED_AT + ) + assert "compound_type" not in df2.columns + + +def test_mixed_prefix_and_new_rows_resolve_after_projection(): + """Mixed old/new rows through the projection still resolve (NaN-safe, R2-F1). + + pandas fills NaN for the pre-fix row's missing compound_type in a mixed + frame; the venue tallies must treat NaN as missing (default standalone) — + NOT raise, and NOT vanish the row from the tallies. + """ + from mostlyright.weather.earnings.fact_builder import polymarket_count, resolve_status + + legacy = dict(_fact_rows()[0]) # no compound_type; mention_count 3, exec, counted + df = EarningsAdapter.from_rows( + [legacy, _closed_fact_row()], source="earnings.hosted", retrieved_at=_RETRIEVED_AT + ) + rows = df.to_dict("records") + # Legacy row (NaN compound_type) counts as standalone: auto count = 3. + assert polymarket_count(rows) == 3 + # Kalshi: the legacy anchorable standalone row settles yes. + assert resolve_status(rows, venue="kalshi") == "resolved_yes" + + def test_supported_sources(): assert EarningsAdapter.SUPPORTED_SOURCES == ["earnings.live", "earnings.hosted"] diff --git a/packages/weather/tests/earnings/test_fact_builder_kalshi_filter.py b/packages/weather/tests/earnings/test_fact_builder_kalshi_filter.py index 43e4023..e5dca72 100644 --- a/packages/weather/tests/earnings/test_fact_builder_kalshi_filter.py +++ b/packages/weather/tests/earnings/test_fact_builder_kalshi_filter.py @@ -201,6 +201,248 @@ def test_resolve_status_rejects_unknown_venue() -> None: resolve_status([{"mention_count": 1, "kalshi_counted": True}], venue="ftx") +# --------------------------------------------------------------------------- +# D-30: compound_type row split + venue filters + fail-loud closed-candidate review +# --------------------------------------------------------------------------- +def _occ_ct( + turn_index: int, compound_type: str, *, count: int = 1, term: str = "Marketing" +) -> dict: + occ = _occ(turn_index, count=count, term=term) + occ["compound_type"] = compound_type + return occ + + +def test_build_fact_rows_splits_per_compound_type() -> None: + # Same term, three different compound_types -> three SEPARATE rows, each + # carrying its own compound_type (venue filters apply row-wise). + rows = build_fact_rows( + [ + _occ_ct(0, "standalone"), + _occ_ct(0, "hyphenated"), + _occ_ct(0, "closed"), + ], + [EXEC_TURN], + MARKET_TERMS, + ticker="ORCL", + call_id="C1", + ) + assert len(rows) == 3 + assert {r["compound_type"] for r in rows} == {"standalone", "hyphenated", "closed"} + # No row aggregates mixed compound_types. + for r in rows: + assert r["compound_type"] in {"standalone", "hyphenated", "closed"} + + +def test_build_fact_rows_defaults_compound_type_standalone() -> None: + # A pre-fix occurrence WITHOUT compound_type defaults to standalone. + rows = build_fact_rows([_occ(0)], [EXEC_TURN], MARKET_TERMS, ticker="ORCL", call_id="C1") + assert rows[0]["compound_type"] == "standalone" + + +def test_kalshi_filter_excludes_closed_compound() -> None: + from mostlyright.weather.earnings.fact_builder import kalshi_boolean_settles + + # A closed-compound mention by an exec is NOT a Kalshi Yes (Kalshi No on closed). + closed_only = build_fact_rows( + [_occ_ct(0, "closed")], [EXEC_TURN], MARKET_TERMS, ticker="ORCL", call_id="C1" + ) + assert kalshi_boolean_settles(closed_only) is False + # A standalone mention by the same exec IS a Kalshi Yes. + standalone = build_fact_rows( + [_occ_ct(0, "standalone")], [EXEC_TURN], MARKET_TERMS, ticker="ORCL", call_id="C1" + ) + assert kalshi_boolean_settles(standalone) is True + + +def test_polymarket_autocount_excludes_closed_and_affix() -> None: + from mostlyright.weather.earnings.fact_builder import polymarket_count + + rows = build_fact_rows( + [ + _occ_ct(0, "standalone", count=2), + _occ_ct(0, "hyphenated", count=1), + _occ_ct(0, "closed", count=5), + _occ_ct(0, "affix_derivation", count=4), + ], + [ANALYST_TURN], + MARKET_TERMS, + ticker="ORCL", + call_id="C1", + ) + # Only standalone + hyphenated auto-count (2 + 1); closed + affix excluded. + assert polymarket_count(rows) == 3 + + +def test_polymarket_fail_loud_closed_candidate_review() -> None: + # auto_count (3) < threshold (5), but auto + closed (3 + 3) >= 5 -> the + # closed candidates COULD flip the outcome -> needs-human-review, NOT + # resolved_no. + rows = build_fact_rows( + [ + _occ_ct(0, "standalone", count=3), + _occ_ct(0, "closed", count=3), + ], + [ANALYST_TURN], + MARKET_TERMS, + ticker="ORCL", + call_id="C1", + ) + status = resolve_status(rows, venue="polymarket", threshold_n=5) + assert status not in {"resolved_no", "resolved_yes"} + assert status in {"disputed", "needs_human_review"} + + +def test_polymarket_resolved_yes_ignores_closed_when_auto_meets_threshold() -> None: + rows = build_fact_rows( + [ + _occ_ct(0, "standalone", count=5), + _occ_ct(0, "closed", count=3), + ], + [ANALYST_TURN], + MARKET_TERMS, + ticker="ORCL", + call_id="C1", + ) + assert resolve_status(rows, venue="polymarket", threshold_n=5) == "resolved_yes" + + +def test_polymarket_resolved_no_when_candidates_cannot_change_outcome() -> None: + # auto (2) + closed (1) = 3 < threshold 5 -> candidates cannot flip -> No. + rows = build_fact_rows( + [ + _occ_ct(0, "standalone", count=2), + _occ_ct(0, "closed", count=1), + ], + [ANALYST_TURN], + MARKET_TERMS, + ticker="ORCL", + call_id="C1", + ) + assert resolve_status(rows, venue="polymarket", threshold_n=5) == "resolved_no" + + +def test_affix_derivation_counts_for_neither_venue() -> None: + from mostlyright.weather.earnings.fact_builder import ( + kalshi_boolean_settles, + polymarket_count, + ) + + rows = build_fact_rows( + [_occ_ct(0, "affix_derivation", count=9)], + [EXEC_TURN], + MARKET_TERMS, + ticker="ORCL", + call_id="C1", + ) + assert kalshi_boolean_settles(rows) is False + assert polymarket_count(rows) == 0 + + +def test_unknown_compound_type_fails_loud_in_every_tally() -> None: + """An out-of-enum compound_type raises ValueError everywhere (review F5). + + A typo'd value (e.g. 'Standalone') used to VANISH from all tallies — not + auto-counted, not a closed candidate — so a true count of 5 resolved + resolved_no at threshold 5 silently. Fail loud instead, consistent with the + count_mentions / apply_kalshi_filter ethos. + """ + from mostlyright.weather.earnings.fact_builder import ( + closed_candidate_count, + kalshi_boolean_settles, + polymarket_count, + polymarket_threshold_met, + ) + + bad = [ + { + "role_source": "roster_match", + "speaker_role": "company_executive", + "kalshi_counted": True, + "mention_count": 5, + "compound_type": "Standalone", # wrong case -> out of enum + } + ] + for fn in (kalshi_boolean_settles, polymarket_count, closed_candidate_count): + with pytest.raises(ValueError, match="compound_type"): + fn(bad) + with pytest.raises(ValueError, match="compound_type"): + polymarket_threshold_met(bad, 5) + with pytest.raises(ValueError, match="compound_type"): + resolve_status(bad, venue="polymarket", threshold_n=5) + with pytest.raises(ValueError, match="compound_type"): + resolve_status(bad, venue="kalshi") + + +def test_unknown_compound_type_fails_loud_at_row_build() -> None: + """build_fact_rows rejects an occurrence carrying an out-of-enum compound_type.""" + occ = _occ(0) + occ["compound_type"] = "hyphen" # not a COMPOUND_TYPE_VALUES member + with pytest.raises(ValueError, match="compound_type"): + build_fact_rows([occ], [EXEC_TURN], MARKET_TERMS, ticker="ORCL", call_id="C1") + + +def test_none_and_missing_compound_type_default_standalone_in_tallies() -> None: + """None / missing compound_type defaults to standalone (back-compat, F5).""" + from mostlyright.weather.earnings.fact_builder import polymarket_count + + rows = [ + {"kalshi_counted": True, "mention_count": 2}, # missing key + {"kalshi_counted": True, "mention_count": 3, "compound_type": None}, # explicit None + ] + assert polymarket_count(rows) == 5 + + +def test_non_datetime_spoken_at_fails_loud_at_row_build() -> None: + """build_fact_rows rejects a non-datetime spoken_at (review R2-F3). + + A float (engine-relative seconds) or naive datetime written to the + timestamp_utc spoken_at column is silently coerced by pyarrow to + microseconds-after-epoch (1970-01-01...) — corrupting the temporal audit + marker. Consistent with the tz-aware enforcement on the SSE projection path + (_assert_live_temporal_contract): fail loud at the write seam. + """ + from datetime import UTC, datetime + + occ = _occ(0) + occ["spoken_at"] = 12.5 # engine-relative float, NOT a wallclock + with pytest.raises(ValueError, match="spoken_at"): + build_fact_rows([occ], [EXEC_TURN], MARKET_TERMS, ticker="ORCL", call_id="C1") + + # A NAIVE datetime is rejected too (would mis-order against tz-aware as_of). + occ_naive = _occ(0) + occ_naive["spoken_at"] = datetime(2026, 7, 1, 13, 0) + with pytest.raises(ValueError, match="tz-aware"): + build_fact_rows([occ_naive], [EXEC_TURN], MARKET_TERMS, ticker="ORCL", call_id="C1") + + # A genuine tz-aware wallclock passes through unchanged. + wallclock = datetime(2026, 7, 1, 13, 0, tzinfo=UTC) + occ_ok = _occ(0) + occ_ok["spoken_at"] = wallclock + rows = build_fact_rows([occ_ok], [EXEC_TURN], MARKET_TERMS, ticker="ORCL", call_id="C1") + assert rows[0]["spoken_at"] == wallclock + + +def test_backcompat_row_without_compound_type_settles_as_before() -> None: + # A pre-fix row lacking compound_type behaves as standalone -> counts for both. + from mostlyright.weather.earnings.fact_builder import ( + kalshi_boolean_settles, + polymarket_count, + ) + + legacy = [ + { + "role_source": "roster_match", + "speaker_role": "company_executive", + "kalshi_counted": True, + "mention_count": 3, + # NO compound_type key + } + ] + assert kalshi_boolean_settles(legacy) is True + assert polymarket_count(legacy) == 3 + assert resolve_status(legacy, venue="polymarket", threshold_n=2) == "resolved_yes" + + # --------------------------------------------------------------------------- # Parquet ledger round-trip + NO audio column (D-27.9) # --------------------------------------------------------------------------- diff --git a/packages/weather/tests/earnings/test_ledger_kalshi_validation.py b/packages/weather/tests/earnings/test_ledger_kalshi_validation.py index a910626..c3a5e66 100644 --- a/packages/weather/tests/earnings/test_ledger_kalshi_validation.py +++ b/packages/weather/tests/earnings/test_ledger_kalshi_validation.py @@ -14,6 +14,7 @@ import pytest from mostlyright.core.exceptions import EarningsFactCorruptError, KalshiCountRuleViolation +from mostlyright.core.schemas.earnings_fact import COMPOUND_TYPE_VALUES from mostlyright.weather.earnings.ledger import FactLedger @@ -78,3 +79,32 @@ def test_analyst_not_counted_persists(self, tmp_path) -> None: ledger = FactLedger(root=tmp_path) good = _row(speaker_role="sell_side_analyst", kalshi_counted=False) assert ledger.append([good], ticker="ORCL", call_id="C1") == 1 + + +class TestLedgerRejectsOutOfEnumCompoundType: + """D-30 follow-up: ``compound_type`` is the venue-count axis — an + out-of-enum value silently vanishes from every venue tally downstream + (``fact_builder`` fail-louds at tally time, but ``/facts`` serves the + stored rows directly), so the durable write boundary re-validates it — + the same choke-point rationale as the ``kalshi_counted`` re-derivation + above.""" + + def test_out_of_enum_compound_type_is_rejected(self, tmp_path) -> None: + ledger = FactLedger(root=tmp_path) + bad = _row(compound_type="banana") + with pytest.raises(EarningsFactCorruptError): + ledger.append([bad], ticker="ORCL", call_id="C1") + # Nothing was persisted — the raise happens before the parquet write. + assert ledger.read("ORCL", "C1") == [] + + def test_absent_compound_type_persists(self, tmp_path) -> None: + # Pre-fix rows omit the column (nullable) — default-standalone semantics. + ledger = FactLedger(root=tmp_path) + assert ledger.append([_row()], ticker="ORCL", call_id="C1") == 1 + stored = ledger.read("ORCL", "C1") + assert stored and stored[0]["compound_type"] is None + + def test_each_canonical_compound_type_persists(self, tmp_path) -> None: + ledger = FactLedger(root=tmp_path) + rows = [_row(compound_type=ct) for ct in COMPOUND_TYPE_VALUES] + assert ledger.append(rows, ticker="ORCL", call_id="C1") == len(COMPOUND_TYPE_VALUES) diff --git a/packages/weather/tests/earnings/test_streaming_transcriber.py b/packages/weather/tests/earnings/test_streaming_transcriber.py index fd1e9c0..f56cd34 100644 --- a/packages/weather/tests/earnings/test_streaming_transcriber.py +++ b/packages/weather/tests/earnings/test_streaming_transcriber.py @@ -319,3 +319,187 @@ def test_segments_and_deltas_carry_no_audio_bytes() -> None: for f in dataclasses.fields(delta): value = getattr(delta, f.name) assert not isinstance(value, (bytes, bytearray)), "audio leaked onto a FactDelta" + + +# --------------------------------------------------------------------------- +# compound_type on live deltas (D-30, review F1): classify_mentions is the +# PRODUCTION counter — a closed compound must produce a candidate delta. +# --------------------------------------------------------------------------- +def _run_single_final(stub_texts: list[str], market_terms: list[dict]) -> StreamingTranscriber: + stub = _StubTranscriber(texts=stub_texts) + tx = StreamingTranscriber( + transcriber=stub, + initial_prompt_terms=[str(t["term_canonical"]) for t in market_terms], + market_terms=market_terms, + turn_provider=lambda _seg: _EXEC_TURN, + ) + frames = [ + (_speech_frame(3.0), 0.0), + (_speech_frame(1.0, loud=False), 3.0), + ] + _drain(tx, frames) + return tx + + +def test_closed_compound_produces_closed_delta() -> None: + """'firefighter' for term 'fire' produces a closed-candidate delta (F1). + + The as-wired pipeline used count_mentions, which never matches a closed + compound — so NO closed candidate existed anywhere in production and a + Polymarket closed-straddle silently resolved resolved_no (the exact defect + D-30 targets). classify_mentions is now the production counter. + """ + tx = _run_single_final( + ["the firefighter arrived", "the firefighter arrived"], + [{"term_canonical": "fire"}], + ) + assert len(tx.fact_deltas) == 1, "a closed compound must produce a candidate delta" + delta = tx.fact_deltas[0] + assert delta.compound_type == "closed" + assert delta.mention_count == 1 + assert delta.term_canonical == "fire" + + +def test_deltas_split_per_compound_type() -> None: + """Mixed compound types of one term emit one delta PER (term, compound_type).""" + tx = _run_single_final( + ["a tariff and a pre-tariff move", "a tariff and a pre-tariff move"], + [{"term_canonical": "tariff"}], + ) + by_type = {d.compound_type: d for d in tx.fact_deltas} + assert set(by_type) == {"standalone", "hyphenated"} + assert by_type["standalone"].mention_count == 1 + assert by_type["hyphenated"].mention_count == 1 + # A delta never aggregates across compound types. + assert all(d.term_canonical == "tariff" for d in tx.fact_deltas) + + +def test_delta_compound_type_defaults_standalone() -> None: + """FactDelta.compound_type is additive with default 'standalone' (back-compat). + + Existing SSE consumers / persisted deltas constructed without the field are + unaffected; an ordinary standalone mention carries 'standalone'. + """ + # Construction without the kwarg still works (additive field). + legacy = FactDelta( + term_canonical="revenue", + matched_surface_form="revenue", + mention_count=1, + speaker_role="company_executive", + role_source="roster_match", + speaker_name="Safra Catz", + kalshi_counted=True, + is_final=True, + spoken_at=0.0, + stream_seq=0, + ) + assert legacy.compound_type == "standalone" + # And the engine emits 'standalone' for an ordinary mention. + tx = _run_single_final(["revenue grew", "revenue grew"], [{"term_canonical": "revenue"}]) + assert tx.fact_deltas[0].compound_type == "standalone" + + +def test_to_stt_count_maps_engine_seconds_to_offset_not_spoken_at() -> None: + """to_stt_count emits offset_seconds, never the engine-relative float + spoken_at (review R2-F3). + + FactDelta.spoken_at is a float of seconds into the stream (e.g. 12.5). + Copying it verbatim into the occurrence made build_fact_rows write it to the + schema's timestamp_utc spoken_at column, and pyarrow silently coerced the + float to microseconds-after-epoch — persisting 1970-01-01 00:00:00.000012+00 + as the temporal audit marker. The engine-relative value belongs in + offset_seconds; spoken_at (nullable) is emitted ONLY for a genuine tz-aware + wallclock, which the streaming engine does not have. + """ + delta = FactDelta( + term_canonical="fire", + matched_surface_form="fire", + mention_count=1, + speaker_role="company_executive", + role_source="roster_match", + speaker_name="Safra Catz", + kalshi_counted=True, + is_final=True, + spoken_at=12.5, + stream_seq=3, + compound_type="closed", + ) + occ = delta.to_stt_count(turn_index=0) + assert "spoken_at" not in occ + assert occ["offset_seconds"] == 12 + assert isinstance(occ["offset_seconds"], int) + + +def test_seam_built_row_ledger_round_trip_has_no_epoch_timestamps(tmp_path) -> None: + """A seam-built fact row persists with NULL spoken_at, not 1970 (R2-F3).""" + from mostlyright.weather.earnings.fact_builder import build_fact_rows + from mostlyright.weather.earnings.ledger import FactLedger + + delta = FactDelta( + term_canonical="fire", + matched_surface_form="fire", + mention_count=1, + speaker_role="company_executive", + role_source="roster_match", + speaker_name="Safra Catz", + kalshi_counted=True, + is_final=True, + spoken_at=42.0, + stream_seq=1, + compound_type="closed", + ) + rows = build_fact_rows( + [delta.to_stt_count(turn_index=0)], + [_EXEC_TURN], + [{"term_canonical": "fire"}], + ticker="ORCL", + call_id="C1", + ) + assert rows[0]["spoken_at"] is None + assert rows[0]["offset_seconds"] == 42 + + ledger = FactLedger(root=tmp_path) + ledger.append(rows, ticker="ORCL", call_id="C1") + back = ledger.read("ORCL", "C1") + assert back[0]["spoken_at"] is None, "engine float must not persist as epoch-1970" + assert back[0]["offset_seconds"] == 42 + + +def test_end_to_end_closed_candidate_reaches_disputed_resolution() -> None: + """Streaming closed compound -> delta -> fact rows -> disputed, Kalshi no (F1). + + The full production path: a transcript containing ONLY a closed compound of + the term ('firefighter' for 'fire'), Polymarket threshold 1 -> the closed + candidate straddles (auto 0 + closed 1 >= 1) -> resolve_status returns the + human-review status, NEVER a silent resolved_no; Kalshi still resolves no + (closed compounds are Kalshi-No). + """ + from mostlyright.weather.earnings.fact_builder import build_fact_rows, resolve_status + + market_terms = [ + { + "term_canonical": "fire", + "counting_mode": "threshold_ge_n", + "threshold_n": 1, + "window_scope": "full_call", + } + ] + tx = _run_single_final( + ["the firefighter arrived", "the firefighter arrived"], + market_terms, + ) + assert len(tx.fact_deltas) == 1 + delta = tx.fact_deltas[0] + assert delta.compound_type == "closed" + + # compound_type survives into the stt_counts records build_fact_rows consumes. + occ = delta.to_stt_count(turn_index=0) + assert occ["compound_type"] == "closed" + rows = build_fact_rows([occ], [_EXEC_TURN], market_terms, ticker="ORCL", call_id="C1") + assert rows[0]["compound_type"] == "closed" + + # Fail-loud: the closed candidate could flip the Polymarket outcome -> + # disputed (human review), NOT resolved_no. + assert resolve_status(rows, venue="polymarket", threshold_n=1) == "disputed" + # Kalshi resolves no on a closed-only mention (correct: Kalshi-No on closed). + assert resolve_status(rows, venue="kalshi") == "resolved_no" diff --git a/packages/weather/tests/earnings/test_stt.py b/packages/weather/tests/earnings/test_stt.py index 2d080ad..749a52b 100644 --- a/packages/weather/tests/earnings/test_stt.py +++ b/packages/weather/tests/earnings/test_stt.py @@ -100,38 +100,63 @@ def test_canonical_and_plural_acronym_still_count_after_spelled_removal() -> Non assert count_mentions("we ship APIs", "API") == (1, ["APIs"]) -def test_hyphenated_compound_is_not_the_bare_word() -> None: - """``pre-tariff`` / ``anti-tariff`` are NOT the bare word ``tariff`` (P2, LOCKED). - - ``-`` is a non-word char, so the plain ``(? None: + """A hyphenated compound COUNTS for the bare term on BOTH venues (D-30, LOCKED). + + Both venues' primary rule PDFs say hyphenated compounds count as the bare + word — the earlier ``(? appears separated from other words by a space or hyphen or in a + compound form with a space or hyphen, it counts." + * Polymarket Event+Mentions DeFi: + https://polymarket-upload.s3.us-east-2.amazonaws.com/market_products/Event+Mentions+Contract+DeFi.pdf + — agrees: hyphenated compounds count for the bare word. + + ``-`` is a non-word char, so the plain ``(? None: - """A hyphenated product code (``GLP-1``) OPTS OUT of the compound guard (P2). + """A hyphenated product code (``GLP-1``) still matches verbatim (D-30). - The hyphenated-compound rule applies only to forms whose surface carries NO - hyphen; a slash-exploded product code contains its own hyphen, so ``GLP-1`` - still matches verbatim (its internal ``-`` is never at the match boundary). + With the inverted compound guard removed, every form uses the plain + ``(? None: assert {f.lower() for f in forms} == {"glp-1", "ozempic"} +# --------------------------------------------------------------------------- +# classify_mentions — per-occurrence compound_type tagging (D-30, Task 2) +# --------------------------------------------------------------------------- +def test_classify_standalone() -> None: + from mostlyright.weather.earnings.stt import classify_mentions + + recs = classify_mentions("a tariff here", "tariff") + assert len(recs) == 1 + assert recs[0]["compound_type"] == "standalone" + assert recs[0]["surface"].lower() == "tariff" + assert "start" in recs[0] + + +def test_classify_open_compound() -> None: + from mostlyright.weather.earnings.stt import classify_mentions + + recs = classify_mentions("the fire station burned", "fire") + assert len(recs) == 1 + # Adjacent word separated by a space -> open compound. + assert recs[0]["compound_type"] == "open" + + +def test_classify_hyphenated() -> None: + from mostlyright.weather.earnings.stt import classify_mentions + + recs = classify_mentions("a pre-tariff move", "tariff") + assert len(recs) == 1 + assert recs[0]["compound_type"] == "hyphenated" + + +def test_classify_closed_candidate() -> None: + from mostlyright.weather.earnings.stt import classify_mentions + + recs = classify_mentions("a wildfire spread", "fire") + assert len(recs) == 1 + assert recs[0]["compound_type"] == "closed" + + recs2 = classify_mentions("a killjoy remark", "joy") + assert len(recs2) == 1 + assert recs2[0]["compound_type"] == "closed" + + +def test_classify_affix_derivation() -> None: + from mostlyright.weather.earnings.stt import classify_mentions + + recs = classify_mentions("a joyful day", "joy") + assert len(recs) == 1 + # Derivational suffix -ful alters the root -> affix_derivation (counts for + # NEITHER venue). Do NOT silently drop — tag it so it is auditable. + assert recs[0]["compound_type"] == "affix_derivation" + + +def test_classify_reuses_form_machinery_not_bare_equality() -> None: + """classify_mentions must reuse the plural/possessive/apostrophe machinery. + + A possessive (``tariff's``) is a standalone occurrence, not a miss — the + classifier shares the same form expansion as count_mentions, never bare exact + equality. + """ + from mostlyright.weather.earnings.stt import classify_mentions + + recs = classify_mentions("the tariff's impact", "tariff") + assert len(recs) == 1 + assert recs[0]["compound_type"] == "standalone" + + +def test_classify_ambiguous_prefers_closed_never_drops() -> None: + """An ambiguous closed-vs-affix case resolves conservatively to closed. + + ``firefighter`` keeps ``fire`` as a distinct component -> closed candidate + (surfaces to a human), never silently dropped. Review F6: this MUST assert + ``== "closed"`` exactly — the earlier set-membership assertion + (``in {"closed", "affix_derivation"}``) was tautological: it stayed green if + firefighter regressed to affix_derivation, the precise silent drop this test + exists to prevent (firefighter is a named closed-compound example in + 27-RESEARCH-MARKETS-RULES-STT §2.1(3) / the Polymarket PDF). + """ + from mostlyright.weather.earnings.stt import classify_mentions + + recs = classify_mentions("a firefighter arrived", "fire") + assert len(recs) == 1 + assert recs[0]["compound_type"] == "closed" + # It is TAGGED, not dropped. + assert recs[0]["surface"] + + +def test_classify_prefix_residual_is_closed_never_affix() -> None: + """A prefix-attached substring is ALWAYS a closed candidate (review F4). + + 'oversupply' for 'supply', 'prepayment' for 'payment', 'underdog' for 'dog' + are genuine closed compounds a human must adjudicate. The old curated + _DERIVATIONAL_PREFIXES branch classified them affix_derivation -> excluded + from closed_candidate_count -> a Polymarket straddle resolved resolved_no + SILENTLY (the exact silent drop the locked design forbids). Prefix residuals + never classify as affix_derivation; only the grammatical-SUFFIX branch + (joyful, running, marketing) may. + """ + from mostlyright.weather.earnings.stt import classify_mentions + + for transcript, term in ( + ("an oversupply problem", "supply"), + ("a prepayment clause", "payment"), + ("the underdog wins", "dog"), + ("a rerun aired", "run"), + ): + recs = classify_mentions(transcript, term) + assert len(recs) == 1, f"{term!r} in {transcript!r}: expected one candidate" + assert recs[0]["compound_type"] == "closed", ( + f"{term!r} in {transcript!r}: prefix residual must be a closed " + f"candidate (human review), got {recs[0]['compound_type']!r}" + ) + + +def test_classify_pass2_scans_acronyms_case_sensitively() -> None: + """Acronym forms scan pass 2 CASE-SENSITIVELY (review F3 -> R2-F2). + + Round 1 scanned acronym forms LOWERCASED — ``OCI`` tagged a candidate inside + ``social`` (s-OCI-al). Round 1's fix EXCLUDED acronyms from pass 2 entirely, + which overcorrected: a genuine acronym compound (``GenAI`` / ``OpenAI`` for + ``AI``) was SILENTLY DROPPED — zero rows, so a Polymarket threshold-1 market + resolved resolved_no instead of surfacing a closed-candidate review. The + correct rule (R2-F2): scan acronyms case-sensitively, exactly like the + case-sensitive-form path — the case-preserved substring inside ``GenAI`` / + ``OpenAI`` is a closed candidate (review-only, never auto-counts), while + lowercase ``social`` / ``said`` never match. + """ + from mostlyright.weather.earnings.stt import classify_mentions + + # A case-preserved acronym inside a longer word -> closed candidate. + recs = classify_mentions("our GenAI strategy", "AI") + assert len(recs) == 1 + assert recs[0]["compound_type"] == "closed" + assert recs[0]["surface"] == "AI" + + recs2 = classify_mentions("the OpenAI partnership", "AI") + assert len(recs2) == 1 + assert recs2[0]["compound_type"] == "closed" + + # Lowercase homograph text never matches the case-sensitive acronym scan. + assert classify_mentions("a social gathering", "OCI") == [] + assert classify_mentions("a social gathering", "OCI", match_rule="exact") == [] + assert classify_mentions("the airline said", "AI") == [] + + # The real solid acronym still classifies via pass 1. + recs3 = classify_mentions("our OCI revenue", "OCI") + assert any(r["surface"] == "OCI" for r in recs3) + + +def test_classify_pass2_honors_case_sensitive_forms() -> None: + """A case-sensitive form scans case-sensitively in pass 2 (review F3). + + Pass 1 correctly refuses the verb ``block`` for the ticker ``Block`` + (case-sensitive over-count guard); the substring pass must not bypass that + guard by scanning lowercased: ``blockchain`` is NOT a closed-compound + candidate for ``Block``. + + DOCUMENTED BEHAVIOR for the capitalized ``Blockchain``: it DOES contain the + case-sensitive surface ``Block`` as a leading component, so it surfaces as a + conservative ``closed`` CANDIDATE (human review adjudicates whether the + compound references the company) — candidates never auto-count, so this is + review-only surface area, never a settlement over-count. + """ + from mostlyright.weather.earnings.stt import classify_mentions + + # Lowercase-blind path is closed: blockchain is not a candidate for Block. + assert classify_mentions("the blockchain market", "Block") == [] + # And the bare lowercase verb still produces nothing (pass 1 guard intact). + assert classify_mentions("we block the deal", "Block") == [] + # Capitalized Blockchain: case-sensitive substring hit -> conservative + # closed candidate for human review (never auto-counted). + recs = classify_mentions("the Blockchain market", "Block") + assert len(recs) == 1 + assert recs[0]["compound_type"] == "closed" + assert recs[0]["surface"] == "Block" + + +def test_classify_fused_compound_inside_hyphenated_token() -> None: + """A fused closed compound INSIDE a hyphenated token is a candidate (R3-F1). + + Pass 2 used to skip any token containing "-" ("hyphenated handled in pass + 1") — but pass 1 only matches the term as a distinct hyphen-separated + ELEMENT (``fire-related``); it cannot match ``fire`` FUSED inside the + component ``wildfire`` of ``wildfire-related``. "wildfire-related costs" + for term "fire" at Polymarket threshold 1 emitted NOTHING -> auto=0, + closed=0 -> resolved_no instead of disputed: the exact silent under-count + class this fix targets. Each hyphen-separated component is now scanned + exactly like an unhyphenated word. + """ + from mostlyright.weather.earnings.stt import classify_mentions + + recs = classify_mentions("wildfire-related costs", "fire") + assert len(recs) == 1 + assert recs[0]["compound_type"] == "closed" + assert recs[0]["surface"] == "fire" + + recs2 = classify_mentions("firefighter-led effort", "fire") + assert len(recs2) == 1 + assert recs2[0]["compound_type"] == "closed" + + +def test_classify_fused_acronym_inside_hyphenated_token() -> None: + """The case-preserved acronym path applies inside hyphenated tokens (R3-F1).""" + from mostlyright.weather.earnings.stt import classify_mentions + + recs = classify_mentions("OpenAI-based products", "AI") + assert len(recs) == 1 + assert recs[0]["compound_type"] == "closed" + assert recs[0]["surface"] == "AI" + + +def test_classify_hyphen_element_matches_stay_pass1_no_duplicate() -> None: + """A distinct hyphen-separated ELEMENT is pass 1's territory — never + duplicated by the component scan (R3-F1). + """ + from mostlyright.weather.earnings.stt import classify_mentions + + # pre-tariff: pass 1 matches "tariff" at a hyphen boundary -> hyphenated; + # the component scan must not emit a second record for the same span. + recs = classify_mentions("pre-tariff pricing", "tariff") + assert len(recs) == 1 + assert recs[0]["compound_type"] == "hyphenated" + + # Multi-hyphen chain: "art" is a distinct element of state-of-the-art. + recs2 = classify_mentions("a state-of-the-art plant", "art") + assert len(recs2) == 1 + assert recs2[0]["compound_type"] == "hyphenated" + + +def test_classify_affix_component_inside_hyphenated_token() -> None: + """A suffix derivation inside a hyphenated component is affix_derivation (R3-F1). + + Classification runs on the COMPONENT (``joyful``), not the whole token — + tagged for audit, counted for NEITHER venue. + """ + from mostlyright.weather.earnings.stt import classify_mentions + + recs = classify_mentions("joyful-sounding remarks", "joy") + assert len(recs) == 1 + assert recs[0]["compound_type"] == "affix_derivation" + + +def test_classify_count_conservation_across_hyphen_and_fused_forms() -> None: + """Mixed surface classes tally without overlap or double-count (R3-F1). + + "fire" + "fire-related" + "wildfire" + "wildfire-related" for term "fire": + 2 auto-countable occurrences (standalone + hyphenated) + 2 closed + candidates, each span classified exactly once. + """ + from mostlyright.weather.earnings.stt import classify_mentions + + recs = classify_mentions( + "a fire began, fire-related costs rose, a wildfire spread, " + "and wildfire-related losses mounted", + "fire", + ) + types = sorted(r["compound_type"] for r in recs) + assert types == ["closed", "closed", "hyphenated", "standalone"] + # No two records overlap (each span classified exactly once). + import itertools + + spans = sorted((r["start"], r["start"] + len(r["surface"])) for r in recs) + for (s1, e1), (s2, e2) in itertools.pairwise(spans): + assert e1 <= s2, f"overlapping spans: {(s1, e1)} vs {(s2, e2)}" + + +def test_classify_rejects_unknown_match_rule_like_count_mentions() -> None: + """classify_mentions fails loud on a bogus match_rule (review F2). + + A typo'd rule must NOT silently fall through to exact semantics (dropping + the plural/possessive forms -> under-count) — identical to count_mentions. + """ + from mostlyright.weather.earnings.stt import classify_mentions + + with pytest.raises(ValueError, match="match_rule"): + classify_mentions("a tariff here", "tariff", match_rule="bogus") + + +def test_classify_rejects_empty_and_degenerate_terms() -> None: + """classify_mentions fails loud on empty / separator-only / punctuation terms (F2). + + Silently returning [] would settle "not mentioned" on a config bug — the + exact hazard count_mentions' guards close. Both functions share the same + validated form-prep path. + """ + from mostlyright.weather.earnings.stt import classify_mentions + + with pytest.raises(ValueError, match="non-empty"): + classify_mentions("some text", "") + with pytest.raises(ValueError, match="non-empty"): + classify_mentions("some text", " ") + # A bare synonym separator explodes to zero forms. + with pytest.raises(ValueError, match="no surface forms"): + classify_mentions("some text", " / ") + # A lone punctuation char survives as a form but can never match. + with pytest.raises(ValueError, match="non-word"): + classify_mentions("some text", "/") + + def test_asr_misrender_off_by_default_does_not_overcount_ocr() -> None: """Default ``asr_misrenders=False``: OCR (a legit standalone term) is NOT OCI. diff --git a/schemas/EXPORT_MANIFEST.json b/schemas/EXPORT_MANIFEST.json index 46f45e3..1d8b7ee 100644 --- a/schemas/EXPORT_MANIFEST.json +++ b/schemas/EXPORT_MANIFEST.json @@ -15,8 +15,8 @@ { "gated": false, "path": "json/schema.earnings_fact.v1.json", - "sha256": "fc11a1ae645ffa54523b0d3ccbfb5b8673e5e7c4d577c89836d88bc3a13d222b", - "size_bytes": 5821 + "sha256": "01b1087ff374610ac36b1233426509edd2f1df56864622d0b8174ad95f45f53e", + "size_bytes": 6391 }, { "gated": false, diff --git a/schemas/json/schema.earnings_fact.v1.json b/schemas/json/schema.earnings_fact.v1.json index 4b3e7d1..606ea0c 100644 --- a/schemas/json/schema.earnings_fact.v1.json +++ b/schemas/json/schema.earnings_fact.v1.json @@ -14,6 +14,21 @@ "description": "provider event id for the call", "type": "string" }, + "compound_type": { + "description": "D-30 \u00a72.1(3) per-occurrence compound axis \u2014 SEPARATE from term_match_rule (do NOT overload MATCH_RULE_VALUES). standalone/open/hyphenated auto-count on both venues; closed is Polymarket candidate-only + Kalshi-No; affix_derivation counts for neither. Nullable: pre-fix rows omit it (default 'standalone')", + "enum": [ + "affix_derivation", + "closed", + "hyphenated", + "open", + "standalone", + null + ], + "type": [ + "null", + "string" + ] + }, "confidence": { "description": "0..1 STT/attribution confidence (\u00a72.1(6)) \u2014 flags borderline hits", "type": "number" diff --git a/services/earnings/routes/stream.py b/services/earnings/routes/stream.py index c8ce789..3916331 100644 --- a/services/earnings/routes/stream.py +++ b/services/earnings/routes/stream.py @@ -76,8 +76,19 @@ def _segment_payload(seg: Segment) -> dict[str, object]: def _fact_payload(delta: FactDelta) -> dict[str, object]: - """A schema.earnings_fact.v1-shaped row — derived facts only (no audio).""" + """A schema.earnings_fact.v1-shaped row — derived facts only (no audio). + + TEMPORAL MAPPING (D-30 wire seam, mirrors ``FactDelta.to_stt_count``): + ``delta.spoken_at`` is an ENGINE-RELATIVE float (seconds into the stream, + e.g. ``12.5``) — NEVER a wallclock. The schema's ``spoken_at`` column is a + tz-aware timestamp, so emitting the raw float invites the pyarrow/pandas + 1970-epoch coercion R2-F3 closed on the batch path. It crosses the wire as + ``offset_seconds`` (the schema's engine-relative int audit field) and + ``spoken_at`` is OMITTED — the column is nullable, and the SDK consumer + fail-louds on any non-tz-aware value. + """ payload = asdict(delta) + payload["offset_seconds"] = int(payload.pop("spoken_at")) # Defensive: never leak a non-serialisable / audio-shaped field. return {k: v for k, v in payload.items() if "audio" not in k.lower()} diff --git a/services/earnings/tests/test_stream_sse.py b/services/earnings/tests/test_stream_sse.py index 00d5694..cbc465c 100644 --- a/services/earnings/tests/test_stream_sse.py +++ b/services/earnings/tests/test_stream_sse.py @@ -34,6 +34,7 @@ from mostlyright.weather.earnings.streaming_transcriber import FactDelta, Segment from services.earnings.app import assert_no_audio_surface, create_app +from services.earnings.routes.stream import _fact_payload from services.earnings.sse import ( STREAM_EVENT_NAMES, format_sse, @@ -121,6 +122,22 @@ def test_framing_data_is_single_line_json() -> None: assert parsed == payload +def test_fact_payload_maps_engine_seconds_to_offset_seconds() -> None: + """D-30 wire seam: ``FactDelta.spoken_at`` is an ENGINE-RELATIVE float + (seconds into the stream), never a wallclock. The schema's ``spoken_at`` + column is a tz-aware timestamp — emitting the raw float invites the + 1970-epoch coercion that ``FactDelta.to_stt_count`` (R2-F3) exists to + prevent. The SSE payload must carry it as ``offset_seconds`` (the schema's + engine-relative audit field) and OMIT ``spoken_at`` entirely.""" + payload = _fact_payload(_fact_delta("Marketing", 7)) + assert payload["offset_seconds"] == 7 + assert "spoken_at" not in payload + # The rest of the delta still crosses the wire, compound axis included. + assert payload["compound_type"] == "standalone" + assert payload["stream_seq"] == 7 + assert payload["kalshi_counted"] is True + + def test_framing_heartbeat_is_ignorable_comment() -> None: hb = heartbeat_comment() assert hb == b": ping\n\n" diff --git a/tests/test_earnings_stream_consumer.py b/tests/test_earnings_stream_consumer.py index 0e35dd4..a8f68fb 100644 --- a/tests/test_earnings_stream_consumer.py +++ b/tests/test_earnings_stream_consumer.py @@ -23,10 +23,12 @@ from typing import TypeVar import httpx +import pytest from mostlyright.weather.catalog._earnings_stream import consume_sse from mostlyright.weather.catalog.earnings import ( EARNINGS_LIVE_STREAM_SOURCE, EarningsAdapter, + LiveStreamError, ) _T = TypeVar("_T") @@ -137,6 +139,37 @@ async def go() -> list: assert end["event"] == "end_of_call" +def test_stream_fact_delta_compound_type_passes_through(): + """compound_type survives the live SSE projection (review R2-F1). + + _PASSTHROUGH_FIELDS used to strip it, laundering a closed candidate into a + standalone auto-count for every live consumer. Projected only-when-present: + the plain FACT_FRAME (no compound_type) still yields a row without the key. + """ + closed_frame = ( + "event: fact_delta\n" + 'data: {"ticker": "GIS", "call_id": "GIS-2026Q4", "term_canonical": "fire", ' + '"mention_count": 1, "kalshi_counted": true, "is_final": true, ' + '"compound_type": "closed", ' + '"resolution_status": "provisional", "spoken_at": "2026-07-01T13:00:05Z", ' + '"published_at": "2026-07-01T13:00:08Z"}\n' + "id: 2\n\n" + ) + body = _sse_body(closed_frame, FACT_FRAME, END_FRAME) + adapter = EarningsAdapter() + + async def go() -> list: + async with _mock_client(body) as client: + return await _collect( + adapter.stream("GIS", "GIS-2026Q4", base_url="https://feed/stream", client=client) + ) + + closed, plain, _end = _run(go()) + assert closed["compound_type"] == "closed" + # Pre-fix payload without the field: projected only-when-present. + assert "compound_type" not in plain + + def test_stream_knowledge_time_is_publish_not_spoken_at(): """knowledge_time must be the publish wallclock (>= spoken_at), NOT spoken_at.""" body = _sse_body(FACT_FRAME, END_FRAME) @@ -153,6 +186,56 @@ async def go() -> list: assert fact["knowledge_time"] != fact["spoken_at"] +def test_stream_rejects_engine_relative_float_spoken_at(): + """A fact_delta leaking ``FactDelta.spoken_at`` (an ENGINE-RELATIVE float + like ``12.5``, D-30 wire seam) must fail loud — never coerce into a + 1970-epoch wallclock row. Such a frame carries no ``published_at``/ + ``knowledge_time``, so the single-sided temporal guard is all that stands + between the float and a persisted row with an invalid temporal shape.""" + bad_fact = ( + "event: fact_delta\n" + 'data: {"ticker": "GIS", "call_id": "GIS-2026Q4", "term_canonical": "revenue", ' + '"mention_count": 3, "kalshi_counted": true, "is_final": true, ' + '"resolution_status": "provisional", "spoken_at": 12.5}\n' + "id: 2\n\n" + ) + body = _sse_body(bad_fact, END_FRAME) + adapter = EarningsAdapter() + + async def go() -> list: + async with _mock_client(body) as client: + return await _collect( + adapter.stream("GIS", "GIS-2026Q4", base_url="https://feed/stream", client=client) + ) + + with pytest.raises(LiveStreamError): + _run(go()) + + +def test_stream_rejects_naive_single_sided_spoken_at(): + """A tz-naive ``spoken_at`` with no ``knowledge_time`` must also fail loud — + the 27-11 wire contract is tz-aware UTC on every present temporal field, + not just when both happen to arrive together.""" + naive_fact = ( + "event: fact_delta\n" + 'data: {"ticker": "GIS", "call_id": "GIS-2026Q4", "term_canonical": "revenue", ' + '"mention_count": 1, "kalshi_counted": true, "is_final": true, ' + '"resolution_status": "provisional", "spoken_at": "2026-07-01T13:00:05"}\n' + "id: 2\n\n" + ) + body = _sse_body(naive_fact, END_FRAME) + adapter = EarningsAdapter() + + async def go() -> list: + async with _mock_client(body) as client: + return await _collect( + adapter.stream("GIS", "GIS-2026Q4", base_url="https://feed/stream", client=client) + ) + + with pytest.raises(LiveStreamError): + _run(go()) + + def test_stream_passes_token_and_params(): captured: dict[str, str] = {}