igerber · igerber · Jun 23, 2026 · Jun 23, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- **`generate_synthetic_control_data()` data generator + a capstone `SyntheticControl` tutorial.** New public generator (`diff_diff/prep_dgp.py`, exported from `diff_diff`) builds a **single-treated-unit** factor-model panel for synthetic-control demos and tests: one treated unit whose latent factor loadings and baseline are an exact convex combination of a few donors (so the noiseless trajectory lies in the donor convex hull and a synthetic control reproduces it closely — the observed fit is approximate under added noise), persistent AR(1) factors, predictor covariates that each proxy a distinct factor, a common calendar time effect, and a known `"ramp"` or `"constant"` treatment effect emitted as `true_effect`. Tutorial **`docs/tutorials/25_synthetic_control_policy.ipynb`** walks the whole `SyntheticControl` surface end-to-end on a policy-evaluation story (one state adopts a clean-energy standard), structured around **two inference philosophies**: cross-unit permutation (`in_space_placebo` + Firpo–Possebom `confidence_set`, with `leave_one_out` / `in_time_placebo` robustness) versus over-time conformal (CWZ `conformal_test` / `conformal_confidence_intervals` / `conformal_average_effect`), with the per-period conformal band as the climax. A `tests/test_t25_synthetic_control_policy_drift.py` drift guard re-derives every quoted number from the generator.
+
 ### Fixed
 - **`LinearRegression.get_se()` / `get_inference()` no longer return a `NaN` standard error from a tiny-negative variance artifact.** A high-leverage / degenerate coefficient (e.g. an absorbed-FE dummy near-collinear with the treatment, whose Bell-McCaffrey Satterthwaite DOF already hits the noise-floor guard) can have a CR2/HC variance of ~0 (≈1e-32) whose vcov diagonal lands just-below-zero under BLAS-dependent float rounding; `np.sqrt` of the negative then produced a `NaN` SE **nondeterministically** — passing single-threaded but failing under the parallel pure-Python full-suite run (`tests/test_methodology_wls_cr2.py::TestLinearRegressionFENanGuardEndToEnd::test_did_absorbed_fe_lr_inference_nan_for_guarded_coefs`). Both SE sites now clamp the vcov diagonal at 0, so the SE is finite (0 for a genuinely-zero variance), deterministic, and BLAS-independent. **No change for any positive variance** (the clamp is a no-op there); only the previously-`NaN` degenerate case is affected.
 

diff --git a/diff_diff/__init__.py b/diff_diff/__init__.py
@@ -133,6 +133,7 @@
     generate_staggered_data,
     generate_staggered_ddd_data,
     generate_survey_did_data,
+    generate_synthetic_control_data,
     make_post_indicator,
     make_treatment_indicator,
     rank_control_units,
@@ -426,6 +427,7 @@
     "generate_survey_did_data",
     "generate_continuous_did_data",
     "generate_reversible_did_data",
+    "generate_synthetic_control_data",
     "create_event_time",
     "aggregate_survey",
     "aggregate_to_cohorts",

diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt
@@ -1956,6 +1956,7 @@ from diff_diff import (
     generate_factor_data,
     generate_ddd_data,
     generate_continuous_did_data,
+    generate_synthetic_control_data,
 )
 
 # Basic 2x2 DiD data
@@ -1985,6 +1986,11 @@ data = generate_ddd_data(n_per_cell=100, treatment_effect=2.0, seed=42)
 # Continuous dose data
 data = generate_continuous_did_data(n_units=500, n_periods=4,
                                     att_function="linear", att_slope=2.0, seed=42)
+
+# Single-treated-unit panel for synthetic control (noiseless treated path in the
+# donor convex hull; ramping or constant effect). Use with SyntheticControl.
+data = generate_synthetic_control_data(n_donors=20, n_pre=60, n_post=5,
+                                       effect_type="ramp", seed=0)
 ```
 
 ## Real-World Datasets

diff --git a/diff_diff/prep.py b/diff_diff/prep.py
@@ -28,6 +28,7 @@
     generate_staggered_data,
     generate_staggered_ddd_data,
     generate_survey_did_data,
+    generate_synthetic_control_data,
 )
 from diff_diff.survey import (
     ResolvedSurveyDesign,

diff --git a/diff_diff/prep_dgp.py b/diff_diff/prep_dgp.py
@@ -466,6 +466,223 @@ def generate_factor_data(
     return pd.DataFrame(records)
 
 
+def generate_synthetic_control_data(
+    n_donors: int = 20,
+    n_pre: int = 24,
+    n_post: int = 6,
+    n_factors: int = 3,
+    n_predictors: int = 3,
+    treatment_effect: float = 5.0,
+    effect_type: str = "ramp",
+    effect_growth: float = 1.0,
+    factor_strength: float = 1.0,
+    factor_persistence: float = 0.9,
+    n_convex_donors: int = 4,
+    baseline: float = 50.0,
+    unit_baseline_sd: float = 5.0,
+    time_trend: float = 0.5,
+    predictor_noise_sd: float = 0.5,
+    noise_sd: float = 1.0,
+    seed: Optional[int] = None,
+) -> pd.DataFrame:
+    """
+    Generate a single-treated-unit panel for synthetic control (ADH) demos.
+
+    Creates a factor-model panel with ONE treated unit (``unit == 0``) and
+    ``n_donors`` never-treated donors, designed so the treated unit's underlying
+    (noiseless) trajectory lies in the span of the donor trajectories — so a synthetic
+    control can reproduce it closely. The DGP:
+
+        Y_it = b_i + beta_t + factor_strength * (Lambda_i . F_t) + tau_it + eps_it
+
+    where ``F_t`` is a persistent (AR(1)) latent factor path, ``Lambda_i`` are
+    nonnegative factor loadings, ``b_i`` is a unit baseline, ``beta_t`` is a common
+    calendar time effect, ``tau_it`` is the treatment effect (treated unit, post
+    periods only), and ``eps_it ~ N(0, noise_sd)``.
+
+    The treated unit's loadings and baseline are an exact convex combination of
+    ``n_convex_donors`` donors (Dirichlet weights), so in the **noiseless** limit the
+    treated unit's outcome path is itself that convex combination of donor paths — it
+    lies in the donor convex hull and a synthetic control reproduces it exactly. With
+    nonzero ``noise_sd`` / ``predictor_noise_sd`` the observed data is only
+    *approximately* in-hull, so a fitted synthetic control achieves a small (not
+    exactly zero) pre-period RMSPE. Note that the generating weights are NOT identified
+    by the estimator: many donor combinations reproduce the treated pre-period path
+    equally well, so a fitted synthetic control recovers the counterfactual outcome path
+    (and ATT), not this specific weight vector.
+
+    Unlike :func:`generate_factor_data` (which *shifts* treated loadings to create
+    confounding for TROP), this generator keeps the treated unit reproducible from
+    donors, which is what classic synthetic control requires.
+
+    Parameters
+    ----------
+    n_donors : int, default=20
+        Number of never-treated donor units. The in-space placebo p-value floor is
+        ``1 / (n_donors + 1)``; larger pools give finer permutation p-values.
+    n_pre : int, default=24
+        Number of pre-treatment periods.
+    n_post : int, default=6
+        Number of post-treatment periods.
+    n_factors : int, default=3
+        Number of latent factors driving the common time structure.
+    n_predictors : int, default=3
+        Number of time-invariant predictor covariates (each a noisy linear map of
+        the unit's loading vector). Pass these as ``predictors=`` to
+        :class:`~diff_diff.SyntheticControl` to exercise the V-search; otherwise
+        the fit defaults to pre-period outcomes and these columns are unused.
+    treatment_effect : float, default=5.0
+        Base treatment effect on the treated unit in post periods. Under
+        ``effect_type="ramp"`` this is the first post-period effect.
+    effect_type : str, default="ramp"
+        ``"ramp"`` (effect grows by ``effect_growth`` each post period) or
+        ``"constant"`` (effect fixed at ``treatment_effect``).
+    effect_growth : float, default=1.0
+        Per-post-period increment under ``effect_type="ramp"`` (ignored otherwise).
+    factor_strength : float, default=1.0
+        Scaling on the interactive (loading . factor) component.
+    factor_persistence : float, default=0.9
+        AR(1) coefficient on the latent factors in ``[0, 1]``: ``0`` gives i.i.d.
+        factors, ``1`` a random walk. Serial dependence is what makes the
+        moving-block conformal scheme meaningful.
+    n_convex_donors : int, default=4
+        Number of donors whose convex combination defines the treated unit (must be
+        in ``[1, n_donors]``).
+    baseline : float, default=50.0
+        Mean unit baseline (e.g. a per-capita outcome level).
+    unit_baseline_sd : float, default=5.0
+        Standard deviation of donor baselines.
+    time_trend : float, default=0.5
+        Slope of the common calendar time effect ``beta_t = time_trend * t`` (a
+        common trend is matched by any convex combination, so it preserves in-hull
+        existence). Set to ``0.0`` for no trend.
+    predictor_noise_sd : float, default=0.5
+        Noise added to each predictor covariate.
+    noise_sd : float, default=1.0
+        Standard deviation of idiosyncratic outcome noise.
+    seed : int, optional
+        Random seed for reproducibility.
+
+    Returns
+    -------
+    pd.DataFrame
+        Long-format balanced panel with columns:
+
+        - ``unit``: unit identifier (``0`` is the treated unit; ``1..n_donors`` donors)
+        - ``period``: time period (``0..n_pre-1`` pre, ``n_pre..`` post)
+        - ``outcome``: outcome variable
+        - ``treatment``: absorbing 0/1 indicator (1 only for the treated unit in
+          post periods) — :meth:`SyntheticControl.fit` infers the treated unit and
+          post periods from this column
+        - ``treat``: unit-level ever-treated flag
+        - ``true_effect``: the true treatment effect for this observation
+        - ``x1 .. x{n_predictors}``: predictor covariates
+
+    Examples
+    --------
+    Generate a ramped-effect panel and fit a synthetic control:
+
+    >>> from diff_diff import generate_synthetic_control_data, SyntheticControl
+    >>> data = generate_synthetic_control_data(seed=42)
+    >>> res = SyntheticControl(seed=42).fit(
+    ...     data, outcome="outcome", treatment="treatment",
+    ...     unit="unit", time="period", predictors=["x1", "x2", "x3"])
+    >>> round(res.att, 1) > 0  # post-period gap recovers the injected effect
+    True
+    """
+    if n_donors < 2:
+        raise ValueError(f"n_donors ({n_donors}) must be at least 2")
+    if n_pre < 1:
+        raise ValueError(f"n_pre ({n_pre}) must be at least 1")
+    if n_post < 1:
+        raise ValueError(f"n_post ({n_post}) must be at least 1")
+    if n_factors < 1:
+        raise ValueError(f"n_factors ({n_factors}) must be at least 1")
+    if n_predictors < 0:
+        raise ValueError(f"n_predictors ({n_predictors}) must be non-negative")
+    if n_convex_donors < 1 or n_convex_donors > n_donors:
+        raise ValueError(f"n_convex_donors ({n_convex_donors}) must be in [1, n_donors={n_donors}]")
+    if effect_type not in ("ramp", "constant"):
+        raise ValueError(f"effect_type ({effect_type!r}) must be 'ramp' or 'constant'")
+    if not (0.0 <= factor_persistence <= 1.0):
+        raise ValueError(f"factor_persistence ({factor_persistence}) must be in [0, 1]")
+
+    rng = np.random.default_rng(seed)
+
+    n_periods = n_pre + n_post
+    n_units = n_donors + 1  # unit 0 is treated, 1..n_donors are donors
+
+    # Persistent latent factors F: (n_periods, n_factors). AR(1): rho=1 -> random
+    # walk, rho=0 -> i.i.d. The synthetic control removes this common structure, so
+    # the post-fit residuals stay stationary regardless of persistence.
+    F = np.empty((n_periods, n_factors))
+    F[0] = rng.normal(0, 1, n_factors)
+    for t in range(1, n_periods):
+        F[t] = factor_persistence * F[t - 1] + rng.normal(0, 1, n_factors)
+
+    # Donor loadings (nonnegative so a convex combination is well-defined) and
+    # baselines.
+    lambda_donor = np.abs(rng.normal(0, 1, (n_factors, n_donors)))
+    b_donor = rng.normal(baseline, unit_baseline_sd, n_donors)
+
+    # Treated unit = exact convex combination of n_convex_donors donors (in-hull).
+    convex_idx = rng.choice(n_donors, size=n_convex_donors, replace=False)
+    w_star = np.zeros(n_donors)
+    w_star[convex_idx] = rng.dirichlet(np.ones(n_convex_donors))
+    lambda_treated = lambda_donor @ w_star
+    b_treated = float(b_donor @ w_star)
+
+    # Stack: column 0 = treated, columns 1.. = donors.
+    loadings = np.column_stack([lambda_treated, lambda_donor])  # (n_factors, n_units)
+    baselines = np.concatenate([[b_treated], b_donor])  # (n_units,)
+
+    # Time-invariant predictors: each PRIMARILY proxies one distinct latent factor
+    # (diagonal-dominant map) plus a small cross-factor mix and noise. Distinct
+    # proxies give each predictor complementary signal, so the V-search spreads
+    # weight across them instead of leaning on a single redundant column.
+    pred_map = 0.25 * rng.normal(0, 1, (n_predictors, n_factors))
+    for h in range(n_predictors):
+        pred_map[h, h % n_factors] += 1.0
+    predictors = pred_map @ loadings + rng.normal(
+        0, predictor_noise_sd, (n_predictors, n_units)
+    )  # (n_predictors, n_units)
+
+    # Common calendar time effect (matched by any convex combination since sum w=1).
+    beta = time_trend * np.arange(n_periods)
+
+    records = []
+    for i in range(n_units):
+        is_treated_unit = i == 0
+        for t in range(n_periods):
+            post = t >= n_pre
+            y = baselines[i] + beta[t] + factor_strength * (loadings[:, i] @ F[t])
+
+            effect = 0.0
+            if is_treated_unit and post:
+                k = t - n_pre  # post-period index, 0-based
+                if effect_type == "ramp":
+                    effect = treatment_effect + effect_growth * k
+                else:
+                    effect = treatment_effect
+                y += effect
+
+            y += rng.normal(0, noise_sd)
+
+            row = {
+                "unit": i,
+                "period": t,
+                "outcome": y,
+                "treatment": int(is_treated_unit and post),
+                "treat": int(is_treated_unit),
+                "true_effect": effect,
+            }
+            for h in range(n_predictors):
+                row[f"x{h + 1}"] = predictors[h, i]
+            records.append(row)
+
+    return pd.DataFrame(records)
+
+
 def generate_ddd_data(
     n_per_cell: int = 100,
     treatment_effect: float = 2.0,

diff --git a/docs/api/prep.rst b/docs/api/prep.rst
@@ -128,6 +128,38 @@ Example
        time="period", treatment="treatment",
    )
 
+generate_synthetic_control_data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Generate a **single-treated-unit** panel for synthetic control demos. A factor model
+places the treated unit's *noiseless* trajectory in the span of the donor trajectories
+(so a synthetic control reproduces it closely — the observed fit is approximate under
+the added transitory/predictor noise) and injects a known ramping or constant treatment
+effect. Use this with :class:`~diff_diff.SyntheticControl`.
+
+.. autofunction:: diff_diff.generate_synthetic_control_data
+
+Example
+^^^^^^^
+
+.. code-block:: python
+
+   from diff_diff import generate_synthetic_control_data, SyntheticControl
+
+   data = generate_synthetic_control_data(
+       n_donors=10,
+       n_pre=20,
+       n_post=4,
+       effect_type="ramp",
+       seed=0,
+   )
+
+   res = SyntheticControl(n_starts=1, seed=0).fit(
+       data, outcome="outcome", treatment="treatment",
+       unit="unit", time="period", predictors=["x1", "x2", "x3"],
+   )
+   print(round(res.att, 2), round(res.pre_rmspe, 2))
+
 Indicator Creation
 ------------------
 

diff --git a/docs/doc-deps.yaml b/docs/doc-deps.yaml
@@ -643,6 +643,8 @@ sources:
       - path: diff_diff/guides/llms.txt
         section: "Estimators"
         type: user_guide
+      - path: docs/tutorials/25_synthetic_control_policy.ipynb
+        type: tutorial
 
   diff_diff/synthetic_control_results.py:
     drift_risk: low
@@ -659,6 +661,8 @@ sources:
       - path: diff_diff/guides/llms-full.txt
         section: "SyntheticControlResults"
         type: user_guide
+      - path: docs/tutorials/25_synthetic_control_policy.ipynb
+        type: tutorial
 
   diff_diff/conformal.py:
     drift_risk: medium
@@ -670,6 +674,8 @@ sources:
         type: methodology
       - path: docs/api/synthetic_control.rst
         type: api_reference
+      - path: docs/tutorials/25_synthetic_control_policy.ipynb
+        type: tutorial
 
   # ── HonestDiD ───────��──────────────────────────────────────────────
 
@@ -917,6 +923,8 @@ sources:
         type: user_guide
       - path: docs/practitioner_decision_tree.rst
         type: user_guide
+      - path: docs/tutorials/25_synthetic_control_policy.ipynb
+        type: tutorial
 
   diff_diff/datasets.py:
     drift_risk: low

diff --git a/docs/index.rst b/docs/index.rst
@@ -109,6 +109,7 @@ Quick Links
    tutorials/15_efficient_did
    tutorials/16_survey_did
    tutorials/16_wooldridge_etwfe
+   tutorials/25_synthetic_control_policy
 
 .. toctree::
    :maxdepth: 1