diff --git a/alembic/versions/b7c1d9e3f204_add_showcase_workspace_run_config.py b/alembic/versions/b7c1d9e3f204_add_showcase_workspace_run_config.py new file mode 100644 index 00000000..60927c3e --- /dev/null +++ b/alembic/versions/b7c1d9e3f204_add_showcase_workspace_run_config.py @@ -0,0 +1,41 @@ +"""add showcase_workspace run_config column + +Revision ID: b7c1d9e3f204 +Revises: d45cf40dfe47 +Create Date: 2026-06-13 12:00:00.000000 + +E4 of the showcase-completion initiative (umbrella #406, epic #410). Adds a +single nullable JSONB ``run_config`` column to ``showcase_workspace`` -- a +REPLAY-INPUT column in the same class as ``seed`` / ``scenario`` / ``reset`` / +``skip_seed`` (NOT an E1 story slot; see docs/_base/DOMAIN_MODEL.md D1). It +records the start-frame model set + backtest config a ``preservation="keep"`` +run was launched with, so Load/Replay can reproduce it verbatim. NULL when the +run used default config. No index (the read path is by ``workspace_id``; the +column is a display/replay payload). Forward-only. +""" + +from collections.abc import Sequence + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "b7c1d9e3f204" +down_revision: str | None = "d45cf40dfe47" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Add the nullable ``run_config`` JSONB column.""" + op.add_column( + "showcase_workspace", + sa.Column("run_config", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + ) + + +def downgrade() -> None: + """Drop the ``run_config`` column.""" + op.drop_column("showcase_workspace", "run_config") diff --git a/app/features/demo/models.py b/app/features/demo/models.py index 4a50eb4a..4897f621 100644 --- a/app/features/demo/models.py +++ b/app/features/demo/models.py @@ -63,6 +63,7 @@ class ShowcaseWorkspace(TimestampMixin, Base): date_end: Seeded data window end; NULL when unknown. created_objects: Soft-reference ids of everything the run created (JSONB). result_summary: Winner / WAPE / wall-clock display payload (JSONB). + run_config: Replay-input run config -- model set + backtest knobs (E4 #410); NULL on defaults. archived: Operator curation flag -- archived rows still list in E1. pinned: Operator curation flag -- no behavioral semantics in E1. notes: Free-text operator annotation (capped at the Pydantic boundary). @@ -102,6 +103,14 @@ class ShowcaseWorkspace(TimestampMixin, Base): ) # winner_model_type / winner_wape / wall_clock_s -- display payload. result_summary: Mapped[dict[str, Any] | None] = mapped_column(JSONB, nullable=True) + # E4 (#410) -- replay-input run config (NOT an E1 story slot; see + # DOMAIN_MODEL.md D1). Shape: {"train_model_types": [...], "backtest": {...}} + # via model_dump(mode="json"); NULL when the run used default config. + # Written by create_workspace at insert time (a replay input known before + # step 1, like seed/scenario); consumed by Load/Replay. config_schema_version + # is deliberately NOT bumped -- it versions the STORY-SLOT schema; run_config + # presence is NULL-detectable and carries its own documented shape. 
@dataclass(frozen=True)
class ResolvedRunConfig:
    """Effective run configuration for one pipeline run (E4, #410).

    Every field defaults to the matching legacy constant, so a bare
    ``ResolvedRunConfig()`` describes an untouched start frame.

    ``customized`` is True when the request supplied ``train_model_types``
    and/or ``backtest``. step_backtest reads it to choose between the legacy
    path and the unified per-model loop (D4), and pipeline_complete reads it
    to decide whether to echo ``run_config``.
    """

    # Models to train / backtest, in the order the request listed them.
    model_types: tuple[str, ...] = DEMO_MODEL_TYPES
    # Split knobs forwarded into POST /backtesting/run.
    horizon: int = DEMO_HORIZON
    strategy: str = "expanding"
    n_splits: int = DEMO_BACKTEST_SPLITS
    min_train_size: int = DEMO_MIN_TRAIN_SIZE
    gap: int = 0
    # Winner-ranking metric (lower is better).
    metric: str = "wape"
    # False only for the all-default (legacy) configuration.
    customized: bool = False


def _resolve_run_config(req: DemoRunRequest) -> ResolvedRunConfig:
    """Overlay the request's optional run config on the legacy defaults.

    Args:
        req: The validated start frame; ``train_model_types`` and ``backtest``
            are each optional.

    Returns:
        The all-default ``ResolvedRunConfig`` (``customized=False``) when both
        fields are None. Otherwise a ``customized=True`` config in which
        whichever half the request omitted keeps its legacy value.
    """
    selection = req.train_model_types
    split = req.backtest
    if selection is None and split is None:
        return ResolvedRunConfig()

    # At least one half was supplied: start from the selection (or its legacy
    # fallback) and layer the backtest knobs on top only when they were sent.
    overrides: dict[str, Any] = {
        "model_types": DEMO_MODEL_TYPES if selection is None else tuple(selection),
        "customized": True,
    }
    if split is not None:
        overrides.update(
            horizon=split.horizon,
            strategy=split.strategy,
            n_splits=split.n_splits,
            min_train_size=split.min_train_size,
            gap=split.gap,
            metric=split.metric,
        )
    return ResolvedRunConfig(**overrides)
+ if model_type in KNOWN_MODEL_TYPES: + return {"model_type": model_type} raise ValueError(f"Unsupported demo model_type: {model_type}") @@ -452,18 +513,21 @@ def _is_embedding_auth_error(exc: _StepError) -> bool: def _select_winner( backtest_results: dict[str, dict[str, float]], + metric: str = "wape", ) -> tuple[str, float] | None: - """Pick the ``(model_type, WAPE)`` with the lowest aggregated WAPE. + """Pick the ``(model_type, metric_value)`` with the lowest configured metric. - Skips models whose WAPE is missing or NaN (port of run_demo.py:338-356). + ``metric`` is one of wape / mae / rmse (E4 #410, D5 -- all lower-is-better); + defaults to "wape" so every existing call site is unchanged. Skips models + whose metric value is missing or NaN (port of run_demo.py:338-356). """ best: tuple[str, float] | None = None for model_type, metrics in backtest_results.items(): - wape = metrics.get("wape") - if wape is None or math.isnan(wape): + value = metrics.get(metric) + if value is None or math.isnan(value): continue - if best is None or wape < best[1]: - best = (model_type, wape) + if best is None or value < best[1]: + best = (model_type, value) return best @@ -754,13 +818,38 @@ async def step_features(ctx: DemoContext, client: _Client) -> StepResult: async def step_train(ctx: DemoContext, client: _Client) -> StepResult: - """Train naive / seasonal_naive / moving_average in parallel.""" + """Train the configured model set in parallel (legacy trio by default). + + E4 (#410) -- the selection comes from ``ctx.run_config.model_types`` and the + horizon-tail reservation from ``ctx.run_config.horizon`` (both legacy + constants on an untouched frame). A disabled opt-in model (lightgbm / + xgboost / random_forest behind a False ``forecast_enable_*`` flag) fails the + step FAST with a detail naming the flag (D6) -- the settings read lives here, + never in the Pydantic schema (the documented ".env-bleed" class). 
+ """ if ctx.date_start is None or ctx.date_end is None: return ("fail", "no date range on ctx", {}) + # D6 -- fail fast on a disabled opt-in model so the operator gets a clear, + # actionable message instead of a deeper 400 (route gate) or factory error. + settings = get_settings() + flag_by_model = { + "lightgbm": settings.forecast_enable_lightgbm, + "xgboost": settings.forecast_enable_xgboost, + "random_forest": settings.forecast_enable_random_forest, + } + disabled = [m for m in ctx.run_config.model_types if flag_by_model.get(m) is False] + if disabled: + return ( + "fail", + f"model(s) {disabled} requested but the matching forecast_enable_* flag " + "is off — enable the flag (and install the extra) or deselect the model", + {"requested_models": list(ctx.run_config.model_types), "disabled_models": disabled}, + ) + # Leave a horizon-sized tail unused by training so the backtest has room. train_start = ctx.date_start - train_end = ctx.date_end - timedelta(days=DEMO_HORIZON) + train_end = ctx.date_end - timedelta(days=ctx.run_config.horizon) async def _train(model_type: str) -> tuple[str, dict[str, Any]]: train_body = await client.request( @@ -778,7 +867,7 @@ async def _train(model_type: str) -> tuple[str, dict[str, Any]]: return model_type, train_body results: list[tuple[str, dict[str, Any]]] = list( - await asyncio.gather(*(_train(m) for m in DEMO_MODEL_TYPES)) + await asyncio.gather(*(_train(m) for m in ctx.run_config.model_types)) ) for model_type, train_body in results: ctx.train_results[model_type] = train_body @@ -786,7 +875,10 @@ async def _train(model_type: str) -> tuple[str, dict[str, Any]]: return ( "pass", f"trained {len(ctx.train_results)} models in parallel: {trained}", - {"trained": list(ctx.train_results.keys())}, + { + "trained": list(ctx.train_results.keys()), + "requested_models": list(ctx.run_config.model_types), + }, ) @@ -815,110 +907,154 @@ def _coerce_bucketed_metrics( return out or None +def _backtest_body( + ctx: DemoContext, + 
def _backtest_body(
    ctx: DemoContext,
    model_type: str,
    start_date: date,
    end_date: date,
    *,
    include_baselines: bool,
) -> dict[str, Any]:
    """Assemble the JSON body for one ``POST /backtesting/run`` call (E4 #410).

    The split knobs are read from ``ctx.run_config``. On an untouched frame
    those are the legacy constants, so the not-customized path sends the same
    body as before.

    Args:
        ctx: Pipeline context supplying the store/product grain and run config.
        model_type: Model to backtest as the main model.
        start_date: First day of the backtest window.
        end_date: Last day of the backtest window.
        include_baselines: Whether the service should also return baselines.
    """
    resolved = ctx.run_config
    split_config = {
        "strategy": resolved.strategy,
        "n_splits": resolved.n_splits,
        "min_train_size": resolved.min_train_size,
        "gap": resolved.gap,
        "horizon": resolved.horizon,
    }
    config = {
        "split_config": split_config,
        "model_config_main": _model_config_payload(model_type),
        "include_baselines": include_baselines,
        "store_fold_details": False,
    }
    return {
        "store_id": ctx.store_id,
        "product_id": ctx.product_id,
        "start_date": start_date.isoformat(),
        "end_date": end_date.isoformat(),
        "config": config,
    }


async def step_backtest(ctx: DemoContext, client: _Client) -> StepResult:
    """Backtest the relevant models and pick the winner by the configured metric.

    Three request shapes, chosen from the scenario and ``ctx.run_config``:

    * Default config on SHOWCASE_RICH (PRP-38): one call with the feature-aware
      ``SHOWCASE_V2_MODEL_TYPE`` as main model and ``include_baselines=True``.
      Baselines are read from ``baseline_results`` and per-horizon-bucket
      metrics from ``main_model_results.bucketed_aggregated_metrics``.
    * Default config on any other scenario: the legacy loop, one call per
      ``DEMO_MODEL_TYPES`` entry with ``include_baselines=False``.
    * Customized config (E4 #410, D4): one call per selected model with
      ``include_baselines=False``. On SHOWCASE_RICH the V2 model is appended
      when absent so the V2 story keeps a backtest entry, and the V2 call
      supplies the bucketed metrics.

    Returns:
        ``("fail", ...)`` when the context has no date range or no model
        produced a usable metric value; otherwise ``("pass", detail, payload)``.
        Note that ``ctx.winner_wape`` / ``payload["winner_wape"]`` carry the
        value of the configured metric, named by ``payload["metric"]``.
    """
    if ctx.date_start is None or ctx.date_end is None:
        return ("fail", "no date range on ctx", {})
    window_start, window_end = ctx.date_start, ctx.date_end
    resolved = ctx.run_config
    rich = ctx.scenario is ScenarioPreset.SHOWCASE_RICH

    async def _run(
        model_type: str, *, include_baselines: bool
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        # One backtest call. Returns the raw response plus its
        # ``main_model_results`` section, normalised to {} when that section
        # is not a dict so callers can ``.get`` without re-checking.
        response = await client.request(
            f"backtest[{model_type}]",
            "POST",
            "/backtesting/run",
            json_body=_backtest_body(
                ctx, model_type, window_start, window_end, include_baselines=include_baselines
            ),
        )
        section = response.get("main_model_results", {})
        return response, (section if isinstance(section, dict) else {})

    if rich and not resolved.customized:
        response, main = await _run(SHOWCASE_V2_MODEL_TYPE, include_baselines=True)
        ctx.backtest_results[SHOWCASE_V2_MODEL_TYPE] = _coerce_metric_dict(
            main.get("aggregated_metrics")
        )
        # baseline_results is list[ModelBacktestResult]; skip malformed entries.
        baselines = response.get("baseline_results") or []
        if isinstance(baselines, list):
            for entry in baselines:
                if isinstance(entry, dict) and isinstance(entry.get("model_type"), str):
                    ctx.backtest_results[entry["model_type"]] = _coerce_metric_dict(
                        entry.get("aggregated_metrics")
                    )
        ctx.bucketed_aggregated_metrics = _coerce_bucketed_metrics(
            main.get("bucketed_aggregated_metrics")
        )
    else:
        if resolved.customized:
            # E4 (#410, D4) -- the operator's selection, plus the V2 model on
            # SHOWCASE_RICH when it was not selected.
            queue = list(resolved.model_types)
            if rich and SHOWCASE_V2_MODEL_TYPE not in queue:
                queue.append(SHOWCASE_V2_MODEL_TYPE)
        else:
            # DEMO_MINIMAL / SPARSE / others: the legacy baseline loop.
            queue = list(DEMO_MODEL_TYPES)
        for model_type in queue:
            _, main = await _run(model_type, include_baselines=False)
            ctx.backtest_results[model_type] = _coerce_metric_dict(
                main.get("aggregated_metrics")
            )
            # Only customized runs take bucketed metrics from a per-model call.
            if resolved.customized and model_type == SHOWCASE_V2_MODEL_TYPE:
                ctx.bucketed_aggregated_metrics = _coerce_bucketed_metrics(
                    main.get("bucketed_aggregated_metrics")
                )

    ranked = _select_winner(ctx.backtest_results, resolved.metric)
    if ranked is None:
        return ("fail", f"no model produced a usable {resolved.metric} (all NaN?)", {})
    ctx.winner_model_type, ctx.winner_wape = ranked

    payload: dict[str, Any] = {
        "per_model": dict(ctx.backtest_results),
        "winner": ctx.winner_model_type,
        "winner_wape": ctx.winner_wape,
        "metric": resolved.metric,
    }
    if ctx.bucketed_aggregated_metrics is not None:
        payload["bucketed_aggregated_metrics"] = ctx.bucketed_aggregated_metrics
    detail = (
        f"{len(ctx.backtest_results)} models, winner={ctx.winner_model_type} "
        f"{resolved.metric}={ctx.winner_wape:.4f}"
    )
    return ("pass", detail, payload)
class DemoBacktestConfig(BaseModel):
    """Operator-tunable backtest settings for the showcase pipeline (E4, #410).

    The pipeline forwards the split fields verbatim into
    ``POST /backtesting/run``, so their bounds mirror
    ``app/features/backtesting/schemas.py:SplitConfig``. Two things differ on
    purpose: ``n_splits`` defaults to 3 (the demo default; SplitConfig uses 5),
    and ``metric`` is added to choose how the winner is ranked (D5: WAPE, MAE or
    RMSE, all lower-is-better; smape/bias are deliberately excluded because
    issue #410 names exactly these three).

    All fields are JSON-native, so ``strict=True`` needs no per-field override.
    """

    model_config = ConfigDict(strict=True)

    horizon: int = Field(description="Forecast horizon per fold.", default=14, ge=1, le=90)
    strategy: Literal["expanding", "sliding"] = Field(
        description="Expanding grows the training window; sliding keeps it fixed.",
        default="expanding",
    )
    n_splits: int = Field(description="Number of CV folds.", default=3, ge=2, le=20)
    min_train_size: int = Field(description="Minimum training samples.", default=30, ge=7)
    gap: int = Field(
        description="Gap days between train end and test.", default=0, ge=0, le=30
    )
    metric: Literal["wape", "mae", "rmse"] = Field(
        description="Winner-ranking metric (lower is better).", default="wape"
    )

    @model_validator(mode="after")
    def _gap_lt_horizon(self) -> DemoBacktestConfig:
        """Reject ``gap >= horizon`` here rather than as a 422 from the backtest call.

        Mirrors SplitConfig's horizon > gap invariant.

        Raises:
            ValueError: When ``horizon`` is not strictly greater than ``gap``.
        """
        if self.horizon > self.gap:
            return self
        raise ValueError(f"horizon ({self.horizon}) must be greater than gap ({self.gap})")
+ train_model_types: list[str] | None = Field( + default=None, + min_length=1, + max_length=10, + description="Models the pipeline trains/backtests; None = the legacy baseline trio.", + ) + backtest: DemoBacktestConfig | None = Field( + default=None, + description="Backtest split + ranking-metric config; None = the legacy demo split.", + ) + + @field_validator("train_model_types") + @classmethod + def _known_unique_models(cls, v: list[str] | None) -> list[str] | None: + """Allow-list + de-dup the model selection against KNOWN_MODEL_TYPES.""" + if v is None: + return v + unknown = [m for m in v if m not in KNOWN_MODEL_TYPES] + if unknown: + raise ValueError( + f"Unknown model type(s): {unknown!r}. Valid: {sorted(KNOWN_MODEL_TYPES)}" + ) + if len(set(v)) != len(v): + raise ValueError("train_model_types contains duplicates") + return v @model_validator(mode="after") def _workspace_name_requires_keep(self) -> DemoRunRequest: @@ -342,6 +408,14 @@ class WorkspaceListItem(BaseModel): user_scope: dict[str, Any] | None = Field( default=None, description="Story slot (E3 #409): operator-selected focus." ) + # E4 (#410) -- replay-input echo (NOT a story slot; a dedicated nullable + # JSONB column, see DOMAIN_MODEL.md D1). None on default-config / pre-E4 + # rows. On the LIST item because the frontend Replay reads list rows and + # rebuilds the start frame's train_model_types + backtest from it. 
+ run_config: dict[str, Any] | None = Field( + default=None, + description="Replay-input run config (model set + backtest); None on defaults.", + ) class WorkspaceDetailResponse(WorkspaceListItem): diff --git a/app/features/demo/tests/test_models.py b/app/features/demo/tests/test_models.py index 28791caa..ee048764 100644 --- a/app/features/demo/tests/test_models.py +++ b/app/features/demo/tests/test_models.py @@ -191,3 +191,41 @@ async def test_showcase_workspace_replayed_from_recorded(db_session: AsyncSessio loaded = await get_workspace(db_session, row.workspace_id) assert loaded is not None assert loaded.replayed_from_workspace_id == dangling_source + + +# ============================================================================= +# E4 (#410) -- run_config replay-input column +# ============================================================================= + + +async def test_showcase_workspace_run_config_roundtrip(db_session: AsyncSession) -> None: + """run_config round-trips through JSONB intact.""" + run_config = { + "train_model_types": ["naive", "regression"], + "backtest": { + "horizon": 21, + "strategy": "expanding", + "n_splits": 4, + "min_train_size": 30, + "gap": 0, + "metric": "rmse", + }, + } + row = _make_row(run_config=run_config) + db_session.add(row) + await db_session.commit() + + loaded = await get_workspace(db_session, row.workspace_id) + assert loaded is not None + assert loaded.run_config == run_config + + +async def test_showcase_workspace_run_config_null_default(db_session: AsyncSession) -> None: + """run_config stays NULL on a default-config insert.""" + row = _make_row() + db_session.add(row) + await db_session.commit() + + loaded = await get_workspace(db_session, row.workspace_id) + assert loaded is not None + assert loaded.run_config is None diff --git a/app/features/demo/tests/test_pipeline.py b/app/features/demo/tests/test_pipeline.py index 1fc4c1b4..7862d5a2 100644 --- a/app/features/demo/tests/test_pipeline.py +++ 
b/app/features/demo/tests/test_pipeline.py @@ -16,7 +16,7 @@ from fastapi import FastAPI from app.features.demo import pipeline -from app.features.demo.schemas import DemoRunRequest, UserScope +from app.features.demo.schemas import DemoBacktestConfig, DemoRunRequest, UserScope from app.shared.seeder.config import ScenarioPreset from app.shared.seeder.overrides import SeederOverrides @@ -378,6 +378,9 @@ def _fake_settings( *, rag_embedding_provider: str = "openai", openai_api_key: str = "sk-test", + forecast_enable_lightgbm: bool = False, + forecast_enable_xgboost: bool = False, + forecast_enable_random_forest: bool = False, ) -> SimpleNamespace: """Fake settings: usable registry root, no agent LLM key (agent skips). @@ -385,6 +388,11 @@ def _fake_settings( PRP-40 knowledge phase runs to completion in test fixtures; the knowledge-skip tests override via ``rag_embedding_provider="openai"`` + ``openai_api_key=""`` (or "ollama" with an unreachable canned probe). + + E4 (#410) -- the ``forecast_enable_*`` flags default False (matching + app/core/config.py), so the legacy demo trio (all always-on) still trains; + step_train's disabled-model fail-fast path is exercised by overriding a flag + AND selecting that model. 
""" return SimpleNamespace( registry_artifact_root=registry_root, @@ -393,6 +401,9 @@ def _fake_settings( openai_api_key=openai_api_key, google_api_key="", rag_embedding_provider=rag_embedding_provider, + forecast_enable_lightgbm=forecast_enable_lightgbm, + forecast_enable_xgboost=forecast_enable_xgboost, + forecast_enable_random_forest=forecast_enable_random_forest, ) @@ -416,6 +427,247 @@ def test_select_winner_none_when_no_usable_wape(): assert pipeline._select_winner({"naive": {"wape": float("nan")}}) is None +# ============================================================================= +# E4 (#410) -- run-config resolution, selection, split, metric, echo +# ============================================================================= + + +def _ctx_for_step( + scenario: ScenarioPreset = ScenarioPreset.DEMO_MINIMAL, + run_config: pipeline.ResolvedRunConfig | None = None, +) -> pipeline.DemoContext: + """A DemoContext positioned at the modeling phase (grain + window set).""" + ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False, scenario=scenario) + ctx.store_id = 7 + ctx.product_id = 3 + ctx.date_start = date(2024, 1, 1) + ctx.date_end = date(2024, 12, 31) + if run_config is not None: + ctx.run_config = run_config + return ctx + + +def test_resolve_run_config_defaults_and_custom(): + """E4 (#410) -- None/None -> legacy; partial configs fill the other half.""" + legacy = pipeline._resolve_run_config(DemoRunRequest()) + assert legacy.customized is False + assert legacy.model_types == pipeline.DEMO_MODEL_TYPES + assert legacy.horizon == pipeline.DEMO_HORIZON + assert legacy.n_splits == pipeline.DEMO_BACKTEST_SPLITS + assert legacy.min_train_size == pipeline.DEMO_MIN_TRAIN_SIZE + assert legacy.gap == 0 + assert legacy.metric == "wape" + + sel_only = pipeline._resolve_run_config( + DemoRunRequest(train_model_types=["naive", "seasonal_average"]) + ) + assert sel_only.customized is True + assert sel_only.model_types == ("naive", "seasonal_average") + 
assert sel_only.horizon == pipeline.DEMO_HORIZON # backtest defaults stay legacy + assert sel_only.metric == "wape" + + bt_only = pipeline._resolve_run_config( + DemoRunRequest(backtest=DemoBacktestConfig(horizon=21, n_splits=4, metric="rmse")) + ) + assert bt_only.customized is True + assert bt_only.model_types == pipeline.DEMO_MODEL_TYPES # selection stays legacy + assert bt_only.horizon == 21 + assert bt_only.n_splits == 4 + assert bt_only.metric == "rmse" + + +def test_model_config_payload_minimal_fallback_for_all_known_types(): + """E4 (#410) -- every KNOWN type resolves; explicit branches keep params.""" + from app.shared.model_taxonomy import KNOWN_MODEL_TYPES + + for mt in KNOWN_MODEL_TYPES: + assert pipeline._model_config_payload(mt)["model_type"] == mt + assert pipeline._model_config_payload("seasonal_naive") == { + "model_type": "seasonal_naive", + "season_length": 7, + } + assert pipeline._model_config_payload("moving_average") == { + "model_type": "moving_average", + "window_size": 7, + } + with pytest.raises(ValueError, match="Unsupported demo model_type"): + pipeline._model_config_payload("not_a_model") + + +def test_select_winner_honors_metric(): + """E4 (#410, D5) -- the metric param drives selection; NaN/missing skip.""" + results = { + "naive": {"wape": 0.30, "mae": 5.0, "rmse": 9.0}, + "seasonal_naive": {"wape": 0.12, "mae": 6.0, "rmse": 7.0}, + } + assert pipeline._select_winner(results, "wape") == ("seasonal_naive", 0.12) + assert pipeline._select_winner(results, "mae") == ("naive", 5.0) + assert pipeline._select_winner(results, "rmse") == ("seasonal_naive", 7.0) + sparse = {"a": {"wape": 0.2}, "b": {"mae": 4.0}} + assert pipeline._select_winner(sparse, "mae") == ("b", 4.0) + nan = {"a": {"rmse": float("nan")}, "b": {"rmse": 3.0}} + assert pipeline._select_winner(nan, "rmse") == ("b", 3.0) + + +async def test_step_train_trains_selected_models(monkeypatch, tmp_path): + """E4 (#410) -- step_train trains exactly the configured selection.""" + 
monkeypatch.setattr(pipeline, "get_settings", lambda: _fake_settings(str(tmp_path / "reg"))) + rc = pipeline._resolve_run_config( + DemoRunRequest(train_model_types=["naive", "seasonal_average"]) + ) + ctx = _ctx_for_step(run_config=rc) + rec = _RecordingClient( + None, + responses={("POST", "/forecasting/train"): {"model_path": "demo/x-model_abc.joblib"}}, + ) + status, _detail, data = await pipeline.step_train(ctx, _as_client(rec)) + assert status == "pass" + assert set(ctx.train_results) == {"naive", "seasonal_average"} + assert data["requested_models"] == ["naive", "seasonal_average"] + posted = [ + b["config"]["model_type"] + for (_m, p, b) in rec.calls + if p == "/forecasting/train" and b is not None + ] + assert sorted(posted) == ["naive", "seasonal_average"] + + +async def test_step_train_fails_fast_on_disabled_flag(monkeypatch, tmp_path): + """E4 (#410, D6) -- a disabled opt-in model fails before any train POST.""" + monkeypatch.setattr( + pipeline, + "get_settings", + lambda: _fake_settings(str(tmp_path / "reg"), forecast_enable_lightgbm=False), + ) + rc = pipeline._resolve_run_config(DemoRunRequest(train_model_types=["naive", "lightgbm"])) + ctx = _ctx_for_step(run_config=rc) + rec = _RecordingClient(None, responses={("POST", "/forecasting/train"): {"model_path": "x"}}) + status, detail, data = await pipeline.step_train(ctx, _as_client(rec)) + assert status == "fail" + assert "forecast_enable" in detail + assert "lightgbm" in detail + assert data["disabled_models"] == ["lightgbm"] + assert rec.calls == [] # fail-fast: no train requests issued + + +async def test_step_backtest_sends_configured_split_config(): + """E4 (#410) -- the configured split + metric ride into POST /backtesting/run.""" + rc = pipeline._resolve_run_config( + DemoRunRequest( + train_model_types=["naive", "seasonal_average"], + backtest=DemoBacktestConfig( + horizon=21, strategy="sliding", n_splits=4, min_train_size=40, gap=2, metric="rmse" + ), + ) + ) + ctx = 
_ctx_for_step(run_config=rc) + rec = _RecordingClient( + None, + responses={ + ("POST", "/backtesting/run"): { + "main_model_results": {"aggregated_metrics": {"wape": 0.3, "mae": 5.0, "rmse": 9.0}} + } + }, + ) + status, detail, data = await pipeline.step_backtest(ctx, _as_client(rec)) + assert status == "pass" + assert data["metric"] == "rmse" + bodies = [b for (_m, p, b) in rec.calls if p == "/backtesting/run" and b is not None] + assert len(bodies) == 2 # exactly the selected models, no separate baselines call + for body in bodies: + assert body["config"]["split_config"] == { + "strategy": "sliding", + "n_splits": 4, + "min_train_size": 40, + "gap": 2, + "horizon": 21, + } + assert body["config"]["include_baselines"] is False + assert detail.startswith("2 models") and "rmse=" in detail + + +async def test_step_backtest_custom_selection_appends_prophet_like_on_showcase_rich(): + """E4 (#410, D4) -- prophet_like is appended on showcase_rich custom runs.""" + rc = pipeline._resolve_run_config( + DemoRunRequest(train_model_types=["naive", "seasonal_average"]) + ) + ctx = _ctx_for_step(scenario=ScenarioPreset.SHOWCASE_RICH, run_config=rc) + rec = _RecordingClient( + None, + responses={ + ("POST", "/backtesting/run"): { + "main_model_results": { + "aggregated_metrics": {"wape": 0.3}, + "bucketed_aggregated_metrics": {"h_1_7": {"wape": 0.25}}, + } + } + }, + ) + status, _detail, _data = await pipeline.step_backtest(ctx, _as_client(rec)) + assert status == "pass" + posted = [ + b["config"]["model_config_main"]["model_type"] + for (_m, p, b) in rec.calls + if p == "/backtesting/run" and b is not None + ] + assert posted == ["naive", "seasonal_average", "prophet_like"] + # bucketed metrics captured from the prophet_like (V2) call. 
+ assert ctx.bucketed_aggregated_metrics == {"h_1_7": {"wape": 0.25}} + + +async def test_step_backtest_legacy_path_unchanged_when_not_customized(): + """E4 (#410, D4) -- a non-customized run keeps the legacy 3-baseline loop.""" + ctx = _ctx_for_step() # demo_minimal, default (not customized) run_config + rec = _RecordingClient( + None, + responses={ + ("POST", "/backtesting/run"): { + "main_model_results": {"aggregated_metrics": {"wape": 0.3}} + } + }, + ) + status, detail, data = await pipeline.step_backtest(ctx, _as_client(rec)) + assert status == "pass" + bodies = [b for (_m, p, b) in rec.calls if p == "/backtesting/run" and b is not None] + posted = [b["config"]["model_config_main"]["model_type"] for b in bodies] + assert posted == list(pipeline.DEMO_MODEL_TYPES) + for body in bodies: + assert body["config"]["split_config"] == { + "strategy": "expanding", + "n_splits": pipeline.DEMO_BACKTEST_SPLITS, + "min_train_size": pipeline.DEMO_MIN_TRAIN_SIZE, + "gap": 0, + "horizon": pipeline.DEMO_HORIZON, + } + assert body["config"]["include_baselines"] is False + assert data["metric"] == "wape" + assert "wape=" in detail + + +async def test_pipeline_complete_echoes_run_config(monkeypatch, tmp_path): + """E4 (#410) -- pipeline_complete echoes run_config on custom runs, None on legacy.""" + artifact = tmp_path / "naive-model.joblib" + artifact.write_bytes(b"fake joblib artifact bytes") + registry_root = tmp_path / "registry" + monkeypatch.setattr(pipeline, "get_settings", lambda: _fake_settings(str(registry_root))) + wapes = {"naive": 0.30, "seasonal_average": 0.15} + monkeypatch.setattr(pipeline, "_Client", _build_fake_client(str(artifact), wapes)) + + req = DemoRunRequest( + train_model_types=["naive", "seasonal_average"], + backtest=DemoBacktestConfig(horizon=14, n_splits=3, metric="rmse"), + ) + events = [e async for e in pipeline.run_pipeline(app=_FAKE_APP, req=req)] + final = events[-1] + assert final.event_type == "pipeline_complete" + assert 
final.data["run_config"] is not None + assert final.data["run_config"]["train_model_types"] == ["naive", "seasonal_average"] + assert final.data["run_config"]["backtest"]["metric"] == "rmse" + + legacy = [e async for e in pipeline.run_pipeline(app=_FAKE_APP, req=DemoRunRequest())] + assert legacy[-1].data["run_config"] is None + + # ============================================================================= # run_pipeline -- full green run # ============================================================================= diff --git a/app/features/demo/tests/test_schemas.py b/app/features/demo/tests/test_schemas.py index 8019d219..d7ba3573 100644 --- a/app/features/demo/tests/test_schemas.py +++ b/app/features/demo/tests/test_schemas.py @@ -7,6 +7,7 @@ from pydantic import ValidationError from app.features.demo.schemas import ( + DemoBacktestConfig, DemoRunRequest, DemoRunResult, StepEvent, @@ -80,6 +81,9 @@ def test_demo_run_request_legacy_frame_still_validates(): assert req.seed == 7 assert req.preservation == "ephemeral" assert req.workspace_name is None + # E4 (#410) -- the run-config fields default None on a legacy frame. 
+ assert req.train_model_types is None + assert req.backtest is None def test_demo_run_request_workspace_name_requires_keep(): @@ -233,6 +237,110 @@ def test_user_scope_rejects_extra_keys_and_bad_ids(): UserScope.model_validate({"store_id": "1", "product_id": 1}) +# ============================================================================= +# E4 (#410) -- train_model_types + backtest (run-config phase controls) +# ============================================================================= + + +def test_demo_run_request_run_config_defaults_none(): + """E4 (#410) -- both run-config fields default None (legacy behaviour).""" + req = DemoRunRequest() + assert req.train_model_types is None + assert req.backtest is None + + +def test_demo_run_request_accepts_model_selection_json_path(): + """E4 (#410) -- the JSON wire form accepts a selection + nested backtest + dict (validate_python on a parsed dict, the path FastAPI uses).""" + req = DemoRunRequest.model_validate( + { + "train_model_types": ["naive", "seasonal_average"], + "backtest": {"horizon": 21, "n_splits": 4, "metric": "rmse"}, + } + ) + assert req.train_model_types == ["naive", "seasonal_average"] + assert req.backtest is not None + assert req.backtest.horizon == 21 + assert req.backtest.n_splits == 4 + assert req.backtest.metric == "rmse" + # Unset nested knobs fall back to their defaults. 
+ assert req.backtest.strategy == "expanding" + assert req.backtest.min_train_size == 30 + assert req.backtest.gap == 0 + + +def test_demo_run_request_rejects_unknown_model_type(): + """E4 (#410) -- a model_type outside KNOWN_MODEL_TYPES is rejected.""" + with pytest.raises(ValidationError): + DemoRunRequest.model_validate({"train_model_types": ["naive", "bogus_model"]}) + + +def test_demo_run_request_rejects_duplicate_model_types(): + """E4 (#410) -- duplicate model types are rejected.""" + with pytest.raises(ValidationError): + DemoRunRequest.model_validate({"train_model_types": ["naive", "naive"]}) + + +def test_demo_run_request_rejects_empty_and_oversized_selection(): + """E4 (#410) -- selection size is bounded 1..10.""" + with pytest.raises(ValidationError): + DemoRunRequest.model_validate({"train_model_types": []}) + # 11 distinct known models -> over the cap of 10. + eleven = [ + "naive", + "seasonal_naive", + "moving_average", + "weighted_moving_average", + "seasonal_average", + "trend_regression_baseline", + "regression", + "prophet_like", + "lightgbm", + "xgboost", + "random_forest", + ] + with pytest.raises(ValidationError): + DemoRunRequest.model_validate({"train_model_types": eleven}) + + +def test_demo_backtest_config_defaults_and_bounds(): + """E4 (#410) -- DemoBacktestConfig defaults + bound/invariant enforcement.""" + cfg = DemoBacktestConfig() + assert cfg.horizon == 14 + assert cfg.strategy == "expanding" + assert cfg.n_splits == 3 # demo default, NOT SplitConfig's 5 + assert cfg.min_train_size == 30 + assert cfg.gap == 0 + assert cfg.metric == "wape" + # n_splits floor is 2. + with pytest.raises(ValidationError): + DemoBacktestConfig.model_validate({"n_splits": 1}) + # gap >= horizon is rejected (mirrors SplitConfig). + with pytest.raises(ValidationError): + DemoBacktestConfig.model_validate({"horizon": 5, "gap": 5}) + # Unknown metric rejected (closed Literal). 
+ with pytest.raises(ValidationError): + DemoBacktestConfig.model_validate({"metric": "smape"}) + + +def test_workspace_list_item_run_config_round_trip(): + """E4 (#410) -- run_config rides on the list item, default None.""" + bare = WorkspaceListItem.model_validate(_orm_like_workspace_row()) + assert bare.run_config is None + slotted = WorkspaceListItem.model_validate( + _orm_like_workspace_row( + run_config={ + "train_model_types": ["naive", "regression"], + "backtest": {"horizon": 21, "metric": "rmse"}, + } + ) + ) + assert slotted.run_config == { + "train_model_types": ["naive", "regression"], + "backtest": {"horizon": 21, "metric": "rmse"}, + } + + # ============================================================================= # E1 (#407) -- WorkspaceUpdateRequest (PATCH body) # ============================================================================= diff --git a/app/features/demo/tests/test_workspace.py b/app/features/demo/tests/test_workspace.py index fcef7115..b0597981 100644 --- a/app/features/demo/tests/test_workspace.py +++ b/app/features/demo/tests/test_workspace.py @@ -107,6 +107,41 @@ async def test_create_workspace_without_e3_fields_persists_nulls( assert row.user_scope is None +async def test_create_workspace_records_run_config(db_session: AsyncSession) -> None: + """E4 (#410) -- a custom run-config keep-run persists run_config verbatim.""" + workspace_id = await workspace.create_workspace( + _keep_request( + train_model_types=["naive", "seasonal_average"], + backtest={"horizon": 21, "n_splits": 4, "metric": "rmse"}, + ) + ) + assert workspace_id is not None + + row = await workspace.get_workspace(db_session, workspace_id) + assert row is not None + assert row.run_config == { + "train_model_types": ["naive", "seasonal_average"], + "backtest": { + "horizon": 21, + "strategy": "expanding", + "n_splits": 4, + "min_train_size": 30, + "gap": 0, + "metric": "rmse", + }, + } + + +async def test_create_workspace_run_config_null_on_defaults(db_session: 
AsyncSession) -> None: + """E4 (#410) -- a default-config keep-run leaves run_config NULL.""" + workspace_id = await workspace.create_workspace(_keep_request()) + assert workspace_id is not None + + row = await workspace.get_workspace(db_session, workspace_id) + assert row is not None + assert row.run_config is None + + async def test_finalize_workspace_completed(db_session: AsyncSession) -> None: """finalize(failed=False) settles to completed with collected ids.""" workspace_id = await workspace.create_workspace(_keep_request()) diff --git a/app/features/demo/workspace.py b/app/features/demo/workspace.py index ca3002df..1b3ba4aa 100644 --- a/app/features/demo/workspace.py +++ b/app/features/demo/workspace.py @@ -87,6 +87,22 @@ def _apply_filters[SelectT: Select[Any]]( return stmt +def _run_config_payload(req: DemoRunRequest) -> dict[str, Any] | None: + """Build the ``run_config`` JSONB payload for a kept run (E4, #410). + + Returns ``None`` when the run used default config (BOTH fields absent) so + the column stays NULL and Load/Replay can NULL-detect "defaults". Otherwise + a dict with BOTH keys (an unset portion is ``None``; ``backtest`` is dumped via + ``model_dump(mode="json")``) so a verbatim Replay re-submits it unchanged. + """ + if req.train_model_types is None and req.backtest is None: + return None + return { + "train_model_types": req.train_model_types, + "backtest": req.backtest.model_dump(mode="json") if req.backtest is not None else None, + } + + async def create_workspace(req: DemoRunRequest) -> str | None: """Insert a ``running`` workspace row for a ``preservation="keep"`` run. @@ -127,6 +143,9 @@ async def create_workspace(req: DemoRunRequest) -> str | None: if req.user_scope is not None else None ), + # E4 (#409 sibling, #410): replay-input run config -- model + # set + backtest knobs, recorded verbatim (NULL on defaults).
+ run_config=_run_config_payload(req), ) ) await db.commit() diff --git a/app/features/model_selection/schemas.py b/app/features/model_selection/schemas.py index f494882d..2ebb3482 100644 --- a/app/features/model_selection/schemas.py +++ b/app/features/model_selection/schemas.py @@ -426,6 +426,11 @@ class CandidateModelInfo(BaseModel): default_params: dict[str, Any] supports_auto_predict: bool # False for feature-aware models (predict() rejects them) description: str + # E4 (#410) — runtime forecast_enable_* overlay; SERVICE-set (the pure + # capabilities.build_model_catalog leaves the default True). False exactly + # when the matching forecast_enable_{lightgbm,xgboost,random_forest} flag is + # off; the showcase model picker hides disabled opt-ins. + enabled: bool = True class ModelCatalogResponse(BaseModel): diff --git a/app/features/model_selection/service.py b/app/features/model_selection/service.py index 10220540..3f02eb3f 100644 --- a/app/features/model_selection/service.py +++ b/app/features/model_selection/service.py @@ -111,12 +111,28 @@ class ModelSelectionService: # ------------------------------------------------------------------------- def get_model_catalog(self) -> ModelCatalogResponse: - """Return the backend-owned candidate-model catalog (static, no I/O). + """Return the backend-owned candidate-model catalog with the enabled overlay. - Thin pass-through to the pure :func:`capabilities.build_model_catalog`; - kept on the service for symmetry with ``get_availability`` / ``run``. + Thin orchestration over the pure :func:`capabilities.build_model_catalog` + (which stays I/O-free): the service overlays the runtime + ``forecast_enable_*`` flags onto each item's ``enabled`` field (E4 #410, + D3) so the showcase model picker can hide disabled opt-in models. Every + always-on model stays ``enabled=True``. 
""" - return build_model_catalog() + base = build_model_catalog() + settings = get_settings() + flag_by_model = { + "lightgbm": settings.forecast_enable_lightgbm, + "xgboost": settings.forecast_enable_xgboost, + "random_forest": settings.forecast_enable_random_forest, + } + return ModelCatalogResponse( + models=[ + model.model_copy(update={"enabled": flag_by_model.get(model.model_type, True)}) + for model in base.models + ], + default_candidate_model_types=base.default_candidate_model_types, + ) # ------------------------------------------------------------------------- # Availability diff --git a/app/features/model_selection/tests/test_capabilities.py b/app/features/model_selection/tests/test_capabilities.py index 3ff73804..667beb84 100644 --- a/app/features/model_selection/tests/test_capabilities.py +++ b/app/features/model_selection/tests/test_capabilities.py @@ -34,6 +34,16 @@ def test_catalog_families_are_valid_literals() -> None: assert model.family in {"baseline", "tree", "additive"} +def test_capabilities_stays_pure_default_enabled_true() -> None: + """E4 (#410, D3) -- the pure catalog leaves enabled=True (no settings read). + + The forecast_enable_* overlay is the SERVICE's job; build_model_catalog + stays I/O-free, so every item carries the schema default. 
+ """ + for model in build_model_catalog().models: + assert model.enabled is True + + def test_requires_extra_flags_lightgbm_xgboost_only() -> None: """Only the opt-in extras (lightgbm/xgboost) carry requires_extra=True.""" catalog = build_model_catalog() diff --git a/app/features/model_selection/tests/test_service.py b/app/features/model_selection/tests/test_service.py index 67f60a60..ab13d6a3 100644 --- a/app/features/model_selection/tests/test_service.py +++ b/app/features/model_selection/tests/test_service.py @@ -57,6 +57,51 @@ def _patch_availability(monkeypatch: pytest.MonkeyPatch, status: str) -> None: ) +# ----------------------------------------------------------------------------- +# E4 (#410) -- catalog enabled overlay +# ----------------------------------------------------------------------------- + +_OPT_IN_MODELS = {"lightgbm", "xgboost", "random_forest"} + + +def _patch_catalog_settings( + monkeypatch: pytest.MonkeyPatch, + *, + lightgbm: bool = False, + xgboost: bool = False, + random_forest: bool = False, +) -> None: + """Patch the service's get_settings with the three forecast_enable_* flags.""" + settings = SimpleNamespace( + forecast_enable_lightgbm=lightgbm, + forecast_enable_xgboost=xgboost, + forecast_enable_random_forest=random_forest, + ) + monkeypatch.setattr("app.features.model_selection.service.get_settings", lambda: settings) + + +def test_catalog_enabled_false_when_flags_off(monkeypatch: pytest.MonkeyPatch) -> None: + """E4 (#410, D3) -- with all flags off the three opt-ins are disabled, + every always-on model stays enabled.""" + _patch_catalog_settings(monkeypatch) # all default False + catalog = ModelSelectionService().get_model_catalog() + by_type = {m.model_type: m.enabled for m in catalog.models} + for opt_in in _OPT_IN_MODELS: + assert by_type[opt_in] is False + for model_type, enabled in by_type.items(): + if model_type not in _OPT_IN_MODELS: + assert enabled is True + + +def test_catalog_enabled_true_when_flag_on(monkeypatch: 
pytest.MonkeyPatch) -> None: + """E4 (#410, D3) -- enabling a flag flips exactly that model to enabled.""" + _patch_catalog_settings(monkeypatch, lightgbm=True) + by_type = {m.model_type: m.enabled for m in ModelSelectionService().get_model_catalog().models} + assert by_type["lightgbm"] is True + assert by_type["xgboost"] is False + assert by_type["random_forest"] is False + + # ----------------------------------------------------------------------------- # Flattening # ----------------------------------------------------------------------------- diff --git a/app/shared/model_taxonomy.py b/app/shared/model_taxonomy.py index a42f10e1..8d3f21b7 100644 --- a/app/shared/model_taxonomy.py +++ b/app/shared/model_taxonomy.py @@ -56,6 +56,15 @@ class ModelFamily(str, Enum): } +# E4 (#410) — public cross-slice request-validation allow-list. The demo +# slice (and any other slice that must validate a model_type without importing +# a sibling feature slice) checks membership against this frozenset instead of +# reaching into forecasting/model_selection. Derived from the canonical map +# above so it can never drift (drift-locked by +# ``app/shared/tests/test_model_taxonomy.py``). +KNOWN_MODEL_TYPES: frozenset[str] = frozenset(_MODEL_FAMILY_MAP) + + def model_family_for(model_type: str) -> ModelFamily: """Return the :class:`ModelFamily` for a given ``model_type`` string. 
diff --git a/app/shared/tests/test_model_taxonomy.py b/app/shared/tests/test_model_taxonomy.py index bf241d0f..76c2b5e3 100644 --- a/app/shared/tests/test_model_taxonomy.py +++ b/app/shared/tests/test_model_taxonomy.py @@ -16,7 +16,12 @@ import pytest -from app.shared.model_taxonomy import ModelFamily, model_family_for +from app.shared.model_taxonomy import ( + _MODEL_FAMILY_MAP, + KNOWN_MODEL_TYPES, + ModelFamily, + model_family_for, +) # --------------------------------------------------------------------------- # model_family_for — canonical mapping (mirrors the legacy suite in @@ -50,6 +55,21 @@ def test_model_family_for_unknown_returns_baseline() -> None: assert model_family_for("future_arima_v9") == ModelFamily.BASELINE +# --------------------------------------------------------------------------- +# KNOWN_MODEL_TYPES — cross-slice request-validation allow-list (E4 #410). +# --------------------------------------------------------------------------- + + +def test_known_model_types_matches_family_map() -> None: + """Drift-lock: the public allow-list IS the canonical map's key set.""" + assert KNOWN_MODEL_TYPES == frozenset(_MODEL_FAMILY_MAP) + + +def test_known_model_types_contains_demo_trio() -> None: + """The legacy demo trio must always validate (byte-compat criterion).""" + assert {"naive", "seasonal_naive", "moving_average"} <= KNOWN_MODEL_TYPES + + # --------------------------------------------------------------------------- # Back-compat re-exports — OBJECT IDENTITY across the legacy paths (#268). 
# Enum members are str-valued, so == would pass even across distinct class diff --git a/docs/_base/API_CONTRACTS.md b/docs/_base/API_CONTRACTS.md index e9c2ff7a..2922c077 100644 --- a/docs/_base/API_CONTRACTS.md +++ b/docs/_base/API_CONTRACTS.md @@ -58,9 +58,9 @@ All endpoints serve JSON; error responses use `application/problem+json` (RFC 78 | agents | WS | `/agents/stream` | Token-by-token streaming + tool-call events | | seeder | (see `app/features/seeder/routes.py`) | `/seeder/*` | Trigger scenarios, status, customization. **E3 (#409)** — `POST /seeder/generate` accepts an additive Optional `overrides` object (`SeederOverrides`, `app/shared/seeder/overrides.py`) with 7 allow-listed knobs: `stores` (1-100), `products` (1-500), `window_days` (75-365; recomputes `start_date` from `end_date`), `sparsity` (0-0.9), `promotion_intensity` (0-0.5), `stockout_intensity` (0-0.5), `noise_sigma` (0-0.5). `extra=forbid` → an unknown knob is a `422`; applied LAST in `_build_config_from_params` so it wins over the scalar `stores`/`products`/`sparsity` params; absent = byte-identical legacy behavior | | seeder | POST | `/seeder/phase2-enrichment` | PRP-38 — run Phase 2 generators (lifecycle, replenishment, exogenous, returns) against the existing seeded data. `422 application/problem+json` on an empty database. | -| demo | POST | `/demo/run` | Run the end-to-end demo pipeline in-process; returns a `DemoRunResult`. `409 application/problem+json` if a run is already active. **PRP-38** — body accepts an Optional `scenario: 'demo_minimal' \| 'showcase_rich' \| 'sparse'` field; default `'demo_minimal'` (back-compat). **E1 (#390)** — body accepts additive Optional `preservation: 'ephemeral' \| 'keep'` (default `'ephemeral'`, today's no-row behavior) and `workspace_name: str \| null` (pattern `^[a-z0-9][a-z0-9\-_]*$`, ≤100 chars); `workspace_name` without `preservation='keep'` → `422 application/problem+json`. 
`preservation='keep'` records the run as a `showcase_workspace` row; `DemoRunResult` gains an additive Optional `workspace_id: str \| null`. **E2 (#391)** — `scenario` accepts all 8 `ScenarioPreset` values (`retail_standard` / `holiday_rush` / `high_variance` / `stockout_heavy` / `new_launches` / `sparse` / `demo_minimal` / `showcase_rich`); only `showcase_rich` changes the step table (24 rows), every other preset runs the legacy 11-row flow. **E1 (#407)** — body accepts additive Optional `replayed_from_workspace_id: str \| null` (`^[0-9a-f]{32}$`); requires `preservation='keep'` (else `422 application/problem+json`); recorded verbatim on the new `showcase_workspace` row as a SOFT reference (no existence check — dangles are designed). **E3 (#409)** — body accepts additive Optional `seed_overrides` (the same `SeederOverrides` object as `POST /seeder/generate`; requires `skip_seed=false` else `422`; `window_days` rejected on the calendar-pinned `holiday_rush` preset; `{}` normalizes to `null`) and `user_scope` (`{store_id: int>=1, product_id: int>=1}`, `extra=forbid` — the focus pair the pipeline models instead of the auto-discovered first pair; validated by the status step, WARN + fallback to discovery on a dangling pair). Both persist into the kept workspace row's story slots and replay verbatim. | -| demo | WS | `/demo/stream` | Stream one `StepEvent` per pipeline step for the live Showcase page | -| demo | GET | `/demo/workspaces` | **E4 (#393)** — list saved showcase workspaces, newest first (`limit` 1-100 default 20 / `offset`); `200` + empty list on an empty table. **E1 (#407)** — list items additively carry `archived`, `pinned`, `tags`, `replayed_from_workspace_id`. 
**E2 (#408)** — additive query params: `q` (name ILIKE search, min 2 chars), repeated `tags` (JSONB containment — all listed tags must match), `include_archived` (default `false` — archived rows are now HIDDEN by default), allow-listed `sort_by` (`created_at`/`name`/`seed`/`status`; unknown → default `created_at desc`, no 422) + `sort_order` (`asc`/`desc`); pinned rows always order first; `total` respects the active filters. **E3 (#409)** — list items additively carry the `seed_overrides` / `user_scope` story slots (`null` on runs without them) — deliberately on the LIST item, because the frontend Replay builds its verbatim start frame from list rows | +| demo | POST | `/demo/run` | Run the end-to-end demo pipeline in-process; returns a `DemoRunResult`. `409 application/problem+json` if a run is already active. **PRP-38** — body accepts an Optional `scenario: 'demo_minimal' \| 'showcase_rich' \| 'sparse'` field; default `'demo_minimal'` (back-compat). **E1 (#390)** — body accepts additive Optional `preservation: 'ephemeral' \| 'keep'` (default `'ephemeral'`, today's no-row behavior) and `workspace_name: str \| null` (pattern `^[a-z0-9][a-z0-9\-_]*$`, ≤100 chars); `workspace_name` without `preservation='keep'` → `422 application/problem+json`. `preservation='keep'` records the run as a `showcase_workspace` row; `DemoRunResult` gains an additive Optional `workspace_id: str \| null`. **E2 (#391)** — `scenario` accepts all 8 `ScenarioPreset` values (`retail_standard` / `holiday_rush` / `high_variance` / `stockout_heavy` / `new_launches` / `sparse` / `demo_minimal` / `showcase_rich`); only `showcase_rich` changes the step table (24 rows), every other preset runs the legacy 11-row flow. 
**E1 (#407)** — body accepts additive Optional `replayed_from_workspace_id: str \| null` (`^[0-9a-f]{32}$`); requires `preservation='keep'` (else `422 application/problem+json`); recorded verbatim on the new `showcase_workspace` row as a SOFT reference (no existence check — dangles are designed). **E3 (#409)** — body accepts additive Optional `seed_overrides` (the same `SeederOverrides` object as `POST /seeder/generate`; requires `skip_seed=false` else `422`; `window_days` rejected on the calendar-pinned `holiday_rush` preset; `{}` normalizes to `null`) and `user_scope` (`{store_id: int>=1, product_id: int>=1}`, `extra=forbid` — the focus pair the pipeline models instead of the auto-discovered first pair; validated by the status step, WARN + fallback to discovery on a dangling pair). Both persist into the kept workspace row's story slots and replay verbatim. **E4 (#410)** — body accepts additive Optional `train_model_types: list[str] \| null` (1-10 items, allow-listed against the 11 `KNOWN_MODEL_TYPES` in `app/shared/model_taxonomy.py`; unknown/duplicate → `422`) and `backtest: DemoBacktestConfig \| null` (`{horizon 1-90 def 14, strategy expanding\|sliding, n_splits 2-20 def 3, min_train_size ≥7 def 30, gap 0-30 def 0, metric wape\|mae\|rmse}`; `gap ≥ horizon` → `422`). Both `null`/absent → byte-identical legacy behavior (the baseline trio + default split). A selected opt-in model whose `forecast_enable_*` flag is off fails the `train` step (NOT validation — D6) with a detail naming the flag. On `preservation='keep'` runs the config is recorded verbatim in the `showcase_workspace.run_config` column and replayed verbatim; `pipeline_complete.data.run_config` echoes it (`null` on default-config runs). | +| demo | WS | `/demo/stream` | Stream one `StepEvent` per pipeline step for the live Showcase page.
**E4 (#410)** — the start frame additively accepts `train_model_types` + `backtest` (same shapes/validation as `POST /demo/run`); a bad selection (unknown/duplicate model, `gap ≥ horizon`) is one `error` event then close. The frontend sends both keys only when the operator changed them (dirty-only rule), so an untouched run streams a byte-identical legacy frame. | +| demo | GET | `/demo/workspaces` | **E4 (#393)** — list saved showcase workspaces, newest first (`limit` 1-100 default 20 / `offset`); `200` + empty list on an empty table. **E1 (#407)** — list items additively carry `archived`, `pinned`, `tags`, `replayed_from_workspace_id`. **E2 (#408)** — additive query params: `q` (name ILIKE search, min 2 chars), repeated `tags` (JSONB containment — all listed tags must match), `include_archived` (default `false` — archived rows are now HIDDEN by default), allow-listed `sort_by` (`created_at`/`name`/`seed`/`status`; unknown → default `created_at desc`, no 422) + `sort_order` (`asc`/`desc`); pinned rows always order first; `total` respects the active filters. **E3 (#409)** — list items additively carry the `seed_overrides` / `user_scope` story slots (`null` on runs without them) — deliberately on the LIST item, because the frontend Replay builds its verbatim start frame from list rows. **E4 (#410)** — list items additively carry `run_config` (`{train_model_types, backtest}` or `null` on default-config rows) — also on the LIST item so Replay rebuilds the run config from list rows | | demo | GET | `/demo/workspaces/{workspace_id}` | **E4 (#393)** — full workspace row incl. `created_objects` soft references + grain/window columns; `404 application/problem+json` when missing. 
**E1 (#407)** — response additively carries the list-item lifecycle fields plus `notes`, `config_schema_version`, and the six story slots (`seed_overrides` / `user_scope` / `approval_events` / `rag_events` / `job_ids` / `phase_summaries` — `null` until their writer epic lands; schemas in `docs/_base/DOMAIN_MODEL.md`). **E3 (#409)** — `seed_overrides` and `user_scope` are now WRITTEN (recorded at create time from the start frame) and surfaced on the LIST item as well (Detail inherits) | | demo | GET | `/demo/workspaces/{workspace_id}/health` | **E2 (#408)** — probe the workspace's soft references in-process (model runs, scenario plans, alias, batch, agent session, `job_ids` slot) via `httpx.ASGITransport`; per-reference `status` ∈ `alive` (2xx) / `dead` (404 — deleted after the run) / `unknown` (anything else — never a 500), plus `alive`/`dead`/`unknown` counts and `partial_run` (true when the row's status ≠ `completed`); non-probeable keys (`v2_model_path`, `scenario_artifact_key`, `train_model_types`) are skipped; `404 application/problem+json` when the workspace is missing | | demo | PATCH | `/demo/workspaces/{workspace_id}` | **E1 (#407)** — partial lifecycle update (`name` / `notes` / `tags` / `archived` / `pinned`; `exclude_unset` semantics — only provided fields change; explicit `null` clears `name`/`notes`; explicit `null` on `archived`/`pinned`/`tags` → `422` (send `[]` to clear tags); `status` NOT patchable — the pipeline owns it); returns the updated `WorkspaceDetailResponse`; empty body = `200` no-op; `404 application/problem+json` when missing; `422` on unknown keys / bad name pattern / >20 tags | diff --git a/docs/_base/DOMAIN_MODEL.md b/docs/_base/DOMAIN_MODEL.md index a7493219..150a3a41 100644 --- a/docs/_base/DOMAIN_MODEL.md +++ b/docs/_base/DOMAIN_MODEL.md @@ -58,7 +58,7 @@ ### `showcase_workspace` (Demo) - **Root:** `ShowcaseWorkspace(workspace_id: str, status: str)` — one row = one preserved (`preservation="keep"`) showcase run. 
Ephemeral runs (the default) write no row; a `workspace_name` merely labels a keep-run row (names are non-unique). - **Status state machine:** `running` → `completed` | `failed` (CHECK-constrained; the finalize hook settles the row even on mid-run failure). -- **Stored metadata:** replay config (`seed`, `scenario`, `reset`, `skip_seed`), showcase grain + window (`store_id`, `product_id`, `date_start`, `date_end` — NULL on early failure), lifecycle (`status`, `created_at`/`updated_at`), and the JSONB payloads below. E1 (#407) adds operator-curation columns `archived` / `pinned` (booleans, default false, PATCH-mutable, orthogonal to `status` — the pipeline owns the run lifecycle), `notes` (free text, 2000-char cap at the Pydantic boundary), `tags` (a queryable JSONB string array — its own GIN-indexed column, exact `scenario_plan.tags` pattern, ≤20 items at the PATCH boundary), `config_schema_version` (int, default 1 — versions the workspace config + story-slot schema as a whole; any epic that changes a documented slot shape bumps the ORM default and documents the delta here), and the provenance column `replayed_from_workspace_id` (String(32), btree-indexed SOFT reference — see Invariants). +- **Stored metadata:** replay config (`seed`, `scenario`, `reset`, `skip_seed`), showcase grain + window (`store_id`, `product_id`, `date_start`, `date_end` — NULL on early failure), lifecycle (`status`, `created_at`/`updated_at`), and the JSONB payloads below. 
E1 (#407) adds operator-curation columns `archived` / `pinned` (booleans, default false, PATCH-mutable, orthogonal to `status` — the pipeline owns the run lifecycle), `notes` (free text, 2000-char cap at the Pydantic boundary), `tags` (a queryable JSONB string array — its own GIN-indexed column, exact `scenario_plan.tags` pattern, ≤20 items at the PATCH boundary), `config_schema_version` (int, default 1 — versions the workspace config + story-slot schema as a whole; any epic that changes a documented slot shape bumps the ORM default and documents the delta here), and the provenance column `replayed_from_workspace_id` (String(32), btree-indexed SOFT reference — see Invariants). E4 (#410) adds the replay-input column `run_config` (nullable JSONB, `{"train_model_types": [...], "backtest": {...}}` or NULL on default-config runs) — a REPLAY INPUT in the same class as `seed`/`scenario`/`reset`/`skip_seed`, **NOT a story slot** (D1): it records the start-frame model set + backtest config a kept run was launched with, written by `create_workspace` at insert time and consumed by Load/Replay. `config_schema_version` is deliberately NOT bumped by E4 — it versions the STORY-SLOT schema; `run_config` presence is NULL-detectable and carries its own documented shape. - **JSONB fields:** `created_objects` (sparse soft-reference keys — `winning_run_id`, `v2_run_id`, `v2_model_path`, `alias`, `agent_session_id`, `batch_id`, `scenario_plan_ids`, `scenario_artifact_key`, `train_model_types`, `stale_alias_run_id`) and `result_summary` (winner / WAPE / wall-clock display payload). - **JSONB story slots (E1 #407 — authoritative per-slot schema):** six dedicated nullable JSONB columns; `NULL` = "slot never written" (distinct from empty). E1 ships the columns only — each slot has an assigned writer epic: - `seed_overrides` (**WRITTEN since E3 #409**) — SPARSE dict: only operator-set knobs appear, `{}` is never stored (`None` instead). 
Allow-listed keys (the `SeederOverrides` schema, `app/shared/seeder/overrides.py`): `stores` int 1-100, `products` int 1-500, `window_days` int 75-365, `sparsity` float 0-0.9, `promotion_intensity` float 0-0.5, `stockout_intensity` float 0-0.5, `noise_sigma` float 0-0.5. Persisted via `model_dump(mode="json", exclude_none=True)` at create time; replay re-submits it verbatim. Records the REQUESTED config — the data the run actually seeded follows from it deterministically. @@ -69,7 +69,7 @@ - `phase_summaries` (later parallel epic) — list[dict], one per phase: `{"phase_name": str, "status": "pass"|"fail"|"warn"|"skip", "steps": int, "duration_ms": float}`. - **Relationship to demo pipeline runs:** one workspace row per kept pipeline run — `create_workspace` inserts it as `running` before the first step; `finalize_workspace` settles it with the run's collected ids. NOT a seeder `scenario`: a preset is a reusable data-generation recipe; a workspace is the record of ONE concrete run (which preset it used, with what seed, and what it produced). - **Invariants:** - - The config columns (`seed`, `scenario`, `reset`, `skip_seed`) — plus, since E3 #409, the `seed_overrides`/`user_scope` story slots — are sufficient for a verbatim Replay through the normal run path; replay never mutates the original row; it creates a NEW row. + - The config columns (`seed`, `scenario`, `reset`, `skip_seed`) — plus, since E3 #409, the `seed_overrides`/`user_scope` story slots, and since E4 #410 the `run_config` replay-input column — are sufficient for a verbatim Replay through the normal run path; replay never mutates the original row; it creates a NEW row. - `name` is deliberately NON-unique; `workspace_id` (UUID hex) is the unique handle. - `created_objects` carries SOFT references only — **no ForeignKeys by design**. 
The workspace row is an audit record, not an ownership root: the referenced runs/plans/aliases are independently operator-deletable, and a workspace must never block (or cascade) their deletion. - Deletion is METADATA-ONLY, symmetric with the no-FK design: `DELETE /demo/workspaces/{id}` removes the `showcase_workspace` row and nothing else — the soft-referenced model runs, scenario plans, aliases, jobs, agent sessions, and artifacts survive, and a workspace whose references already dangle still deletes cleanly. diff --git a/docs/_base/RUNBOOKS.md b/docs/_base/RUNBOOKS.md index f7aa35a5..495e476d 100644 --- a/docs/_base/RUNBOOKS.md +++ b/docs/_base/RUNBOOKS.md @@ -144,6 +144,8 @@ uv run python scripts/run_demo.py --seed 42 --quiet 2>&1 | tee demo.log - **422 `window_days cannot override the calendar-pinned holiday_rush window`** — expected; the preset's holiday spikes are fixed 2024 dates and a shifted window would silently drop all of them (the UI disables the window control on `holiday_rush`). Fix: pick a today-anchored preset or drop `window_days`. - **`status` step shows ⚠️ `user_scope (store=X, product=Y) not found — fell back to discovered pair`** — expected after a reset/reseed re-issued entity ids (Postgres sequences never reset). The run continues on the discovered pair; the workspace row's `user_scope` slot keeps the REQUESTED pair while the `store_id`/`product_id` columns record the EFFECTIVE grain (divergence is visible by design). Fix: re-pick the pair from the live dropdowns after the run. - **`backtest` step ❌ NaN WAPE after high `stockout_intensity` / `sparsity` overrides** — documented expected outcome, same semantics as the `sparse` preset (incident 28); the panel shows a caveat badge at risky values. Not graceful-skipped by design — a skip would mask real regressions on healthy configs. Fix: lower the knob or accept the documented fail. +30. 
**`train` step ❌ "forecast_enable_* flag is off" after selecting an opt-in model (E4 #410)** — the "Run configuration (advanced)" model picker only surfaces opt-in models (`lightgbm` / `xgboost` / `random_forest`) when the matching `forecast_enable_*` flag is on, so this normally cannot happen from the UI. A direct `POST /demo/run` / WS start frame naming a disabled opt-in passes Pydantic validation (the schema checks only the static `KNOWN_MODEL_TYPES` allow-list, NOT settings — D6, to avoid the `.env`-bleed class) and fails fast at the `train` step with a detail naming the flag. Cause: the flag defaults False (`app/core/config.py:118-120`), or the extra is not installed even with the flag on. Fix: set `forecast_enable_<model>=true` in `.env` (and `uv sync --all-extras` for lightgbm/xgboost), or deselect the model. The catalog's `requires_extra` badge hints at the install need. +31. **`backtest` step ❌ NaN / too-few-folds after an aggressive custom split (E4 #410)** — the "Advanced split settings" form lets the operator push `horizon` / `n_splits` / `min_train_size` / `gap` past what the seeded window can fit; `min_train_size + n_splits×(horizon+gap)` greater than the scenario window (92d for demo_minimal/sparse/holiday_rush, 180d otherwise) cannot produce valid folds. The backend does NOT clamp — it fails honestly (same policy as the `sparse` preset, incident 28; a silent clamp would mask real regressions on healthy configs). The form shows a non-blocking amber split-fit warning ahead of time. Fix: reduce horizon / splits / min train (or pick a 180-day preset), then re-run. > ⚠️ **RAG embedding-dim mismatch can orphan chunks (R4).** PRP-40 indexes a curated 5-file subset; if the operator switches the embedding provider mid-showcase, indexed chunks orphan (pgvector assumes one fixed dimension per column). PRP-40 does NOT ship a `clear_rag` UI toggle — that's a future PRP. Stick to one provider for the showcase run. 
@@ -162,7 +164,7 @@ uv run python scripts/run_demo.py --seed 42 --quiet 2>&1 | tee demo.log **Notes:** keep-runs are recorded by warn-and-continue hooks — a DB hiccup during `create_workspace` yields a green pipeline with `workspace_id: null` and no row (check uvicorn logs for `demo.workspace_create_failed`). Ephemeral runs write no workspace rows and stay in the localStorage Run-history strip; kept runs appear ONLY in the server-backed panel. On `showcase_rich` keep-runs, the planning-phase scenario plans carry the `workspace:` tag (E3 #392) — retrieve them via `GET /scenarios?tags=workspace: