diff --git a/diff_diff/visualization.py b/diff_diff/visualization.py
index 6584b51..3ba3a80 100644
--- a/diff_diff/visualization.py
+++ b/diff_diff/visualization.py
@@ -73,8 +73,10 @@ def plot_event_study(
     periods : list, optional
         List of periods to plot. If None, uses all periods from results.
     reference_period : any, optional
-        The reference period (normalized to effect=0). Will be shown as a
-        hollow marker. If None, tries to infer from results.
+        The reference period to highlight. When explicitly provided, effects
+        are normalized (the reference effect is subtracted) and the reference
+        SE is set to NaN. If None, tries to infer from results; an auto-inferred
+        reference gets hollow marker styling only (no normalization).
     pre_periods : list, optional
         List of pre-treatment periods. Used for shading.
     post_periods : list, optional
@@ -151,8 +153,9 @@ def plot_event_study(
        trends holds. Large pre-treatment effects suggest the assumption
        may be violated.
 
-    2. **Reference period**: Usually the last pre-treatment period (t=-1),
-       normalized to zero. This is the omitted category.
+    2. **Reference period**: Usually the last pre-treatment period (t=-1).
+       When explicitly specified via ``reference_period``, effects are normalized
+       to zero at this period. When auto-inferred, shown with hollow marker only.
 
     3. **Post-treatment periods**: The treatment effects of interest.
        These show how the outcome evolved after treatment.
@@ -170,10 +173,18 @@ def plot_event_study(
 
     from scipy import stats as scipy_stats
 
+    # Track if reference_period was explicitly provided by user
+    reference_period_explicit = reference_period is not None
+
     # Extract data from results if provided
     if results is not None:
-        effects, se, periods, pre_periods, post_periods, reference_period = \
-            _extract_plot_data(results, periods, pre_periods, post_periods, reference_period)
+        extracted = _extract_plot_data(
+            results, periods, pre_periods, post_periods, reference_period
+        )
+        effects, se, periods, pre_periods, post_periods, reference_period, reference_inferred = extracted
+        # If reference was inferred from results, it was NOT explicitly provided
+        if reference_inferred:
+            reference_period_explicit = False
     elif effects is None or se is None:
         raise ValueError(
             "Must provide either 'results' or both 'effects' and 'se'"
@@ -192,6 +203,19 @@ def plot_event_study(
     # Compute confidence intervals
     critical_value = scipy_stats.norm.ppf(1 - alpha / 2)
 
+    # Normalize effects to reference period ONLY if explicitly specified by user.
+    # Auto-inferred reference periods (from CallawaySantAnna) just get hollow marker styling,
+    # NO normalization. This prevents unintended normalization when the reference period
+    # isn't a true identifying constraint (e.g., CallawaySantAnna with base_period="varying").
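+    # Illustrative (assumed) numbers: with effects {-2: 0.1, -1: 0.2, 0: 0.5} and an
+    # explicit reference_period=-1, the plotted effects become {-2: -0.1, -1: 0.0, 0: 0.3}
+    # and se[-1] becomes NaN; with an auto-inferred reference, all values plot unchanged.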
+ if (reference_period is not None and reference_period in effects and + reference_period_explicit): + ref_effect = effects[reference_period] + if np.isfinite(ref_effect): + effects = {p: e - ref_effect for p, e in effects.items()} + # Set reference SE to NaN (it's now a constraint, not an estimate) + # This follows fixest convention where the omitted category has no SE/CI + se = {p: (np.nan if p == reference_period else s) for p, s in se.items()} + plot_data = [] for period in periods: effect = effects.get(period, np.nan) @@ -304,14 +328,17 @@ def _extract_plot_data( pre_periods: Optional[List[Any]], post_periods: Optional[List[Any]], reference_period: Optional[Any], -) -> Tuple[Dict, Dict, List, List, List, Any]: +) -> Tuple[Dict, Dict, List, List, List, Any, bool]: """ Extract plotting data from various result types. Returns ------- tuple - (effects, se, periods, pre_periods, post_periods, reference_period) + (effects, se, periods, pre_periods, post_periods, reference_period, reference_inferred) + + reference_inferred is True if reference_period was auto-detected from results + rather than explicitly provided by the user. """ # Handle DataFrame input if isinstance(results, pd.DataFrame): @@ -328,7 +355,8 @@ def _extract_plot_data( if periods is None: periods = list(results['period']) - return effects, se, periods, pre_periods, post_periods, reference_period + # DataFrame input: reference_period was already set by caller, never inferred here + return effects, se, periods, pre_periods, post_periods, reference_period, False # Handle MultiPeriodDiDResults if hasattr(results, 'period_effects'): @@ -348,7 +376,8 @@ def _extract_plot_data( if periods is None: periods = post_periods - return effects, se, periods, pre_periods, post_periods, reference_period + # MultiPeriodDiDResults: reference_period was already set by caller, never inferred here + return effects, se, periods, pre_periods, post_periods, reference_period, False # Handle CallawaySantAnnaResults (event study aggregation) if hasattr(results, 'event_study_effects') and results.event_study_effects is not None: @@ -362,8 +391,12 @@ def _extract_plot_data( if periods is None: periods = sorted(effects.keys()) + # Track if reference_period was explicitly provided vs auto-inferred + reference_inferred = False + # Reference period is typically -1 for event study if reference_period is None: + reference_inferred = True # We're about to infer it # Detect reference period from n_groups=0 marker (normalization constraint) # This handles anticipation > 0 where reference is at e = -1 - anticipation for period, effect_data in results.event_study_effects.items(): @@ -380,7 +413,7 @@ def _extract_plot_data( if post_periods is None: post_periods = [p for p in periods if p >= 0] - return effects, se, periods, pre_periods, post_periods, reference_period + return effects, se, periods, pre_periods, post_periods, reference_period, reference_inferred raise TypeError( f"Cannot extract plot data from {type(results).__name__}. 
" diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 6d6c1b6..015ba98 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -728,6 +728,52 @@ n = 2(t_{α/2} + t_{1-κ})² σ² / MDE² --- +# Visualization + +## Event Study Plotting (`plot_event_study`) + +**Reference Period Normalization** + +Normalization only occurs when `reference_period` is **explicitly specified** by the user: + +- **Explicit `reference_period=X`**: Normalizes effects (subtracts ref effect), sets ref SE to NaN + - Point estimates: `effect_normalized = effect - effect_ref` + - Reference period SE → NaN (it's now a constraint, not an estimate) + - Other periods' SEs unchanged (uncertainty relative to the constraint) + - CIs recomputed from normalized effects and original SEs + +- **Auto-inferred reference** (from CallawaySantAnna results): Hollow marker styling only, no normalization + - Original effects are plotted unchanged + - Reference period shown with hollow marker for visual indication + - All periods retain their original SEs and error bars + +This design prevents unintended normalization when the reference period isn't a true +identifying constraint (e.g., CallawaySantAnna with `base_period="varying"` where different +cohorts use different comparison periods). + +The explicit-only normalization follows the `fixest` (R) convention where the omitted/reference +category is an identifying constraint with no associated uncertainty. Auto-inferred references +follow the `did` (R) package convention which does not normalize and reports full inference. + +**Rationale**: When normalizing to a reference period, we're treating that period as an +identifying constraint (effect ≡ 0 by definition). The variance of a constant is zero, +but since it's a constraint rather than an estimated quantity, we report NaN rather than 0. +Auto-inferred references may not represent true identifying constraints, so normalization +should be a deliberate user choice. + +**Edge Cases:** +- If `reference_period` not in data: No normalization applied +- If reference effect is NaN: No normalization applied +- Reference period CI becomes (NaN, NaN) after normalization (explicit only) +- Reference period is plotted with hollow marker (both explicit and auto-inferred) +- Reference period error bars: removed for explicit, retained for auto-inferred + +**Reference implementation(s):** +- R: `fixest::coefplot()` with reference category shown at 0 with no CI +- R: `did::ggdid()` does not normalize; shows full inference for all periods + +--- + # Cross-Reference: Standard Errors Summary | Estimator | Default SE | Alternatives | diff --git a/docs/tutorials/02_staggered_did.ipynb b/docs/tutorials/02_staggered_did.ipynb index 62d913d..4657608 100644 --- a/docs/tutorials/02_staggered_did.ipynb +++ b/docs/tutorials/02_staggered_did.ipynb @@ -3,7 +3,31 @@ { "cell_type": "markdown", "metadata": {}, - "source": "# Staggered Difference-in-Differences\n\nThis notebook demonstrates how to handle **staggered treatment adoption** using modern DiD estimators. In staggered DiD settings:\n\n- Different units get treated at different times\n- Traditional TWFE can give biased estimates due to \"forbidden comparisons\"\n- Modern estimators compute group-time specific effects and aggregate them properly\n\nWe'll cover:\n1. Understanding staggered adoption\n2. The problem with TWFE (and Goodman-Bacon decomposition)\n3. The Callaway-Sant'Anna estimator\n4. Group-time effects ATT(g,t)\n5. 
Aggregating effects (simple, group, event-study)\n6. Bootstrap inference for valid standard errors\n7. Visualization\n8. Pre-treatment effects and parallel trends testing\n9. Different control group options\n10. Handling anticipation effects\n11. Adding covariates\n12. Comparing with MultiPeriodDiD\n13. Sun-Abraham interaction-weighted estimator\n14. Comparing CS and SA as a robustness check" + "source": [ + "# Staggered Difference-in-Differences\n", + "\n", + "This notebook demonstrates how to handle **staggered treatment adoption** using modern DiD estimators. In staggered DiD settings:\n", + "\n", + "- Different units get treated at different times\n", + "- Traditional TWFE can give biased estimates due to \"forbidden comparisons\"\n", + "- Modern estimators compute group-time specific effects and aggregate them properly\n", + "\n", + "We'll cover:\n", + "1. Understanding staggered adoption\n", + "2. The problem with TWFE (and Goodman-Bacon decomposition)\n", + "3. The Callaway-Sant'Anna estimator\n", + "4. Group-time effects ATT(g,t)\n", + "5. Aggregating effects (simple, group, event-study)\n", + "6. Bootstrap inference for valid standard errors\n", + "7. Visualization\n", + "8. Pre-treatment effects and parallel trends testing\n", + "9. Different control group options\n", + "10. Handling anticipation effects\n", + "11. Adding covariates\n", + "12. Comparing with MultiPeriodDiD\n", + "13. Sun-Abraham interaction-weighted estimator\n", + "14. Comparing CS and SA as a robustness check" + ] }, { "cell_type": "code", @@ -810,19 +834,126 @@ { "cell_type": "markdown", "metadata": {}, - "source": "## 14. Comparing CS and SA as a Robustness Check\n\nRunning both estimators provides a useful robustness check. When they agree, results are more credible.\n\n### Understanding Pre-Period Differences\n\nYou may notice that **post-treatment effects align closely** between CS and SA, but **pre-treatment effects can differ in magnitude and significance**. This is expected methodological behavior, not a bug.\n\n**Why the difference?**\n\n1. **Callaway-Sant'Anna with `base_period=\"varying\"` (default)**:\n - Pre-treatment effects use **consecutive period comparisons** (period t vs period t-1)\n - Each pre-period coefficient represents a one-period change\n - These smaller incremental changes often yield lower t-statistics\n\n2. **Sun-Abraham**:\n - Uses a **fixed reference period** (e=-1 when anticipation=0, or e=-1-anticipation otherwise)\n - All coefficients are deviations from this single reference\n - Pre-period coefficients show cumulative difference from the reference\n\n**To make CS pre-periods more comparable to SA**, use `base_period=\"universal\"`:\n\n```python\ncs_universal = CallawaySantAnna(base_period=\"universal\")\n```\n\nThis makes CS compare all periods to g-1 (like SA), producing more similar pre-treatment estimates." + "source": [ + "## 14. Comparing CS and SA as a Robustness Check\n", + "\n", + "Running both estimators provides a useful robustness check. When they agree, results are more credible.\n", + "\n", + "### Understanding Pre-Period Differences\n", + "\n", + "You may notice that **post-treatment effects align closely** between CS and SA, but **pre-treatment effects can differ in magnitude and significance**. This is expected methodological behavior, not a bug.\n", + "\n", + "**Why the difference?**\n", + "\n", + "1. 
**Callaway-Sant'Anna with `base_period=\"varying\"` (default)**:\n", + " - Pre-treatment effects use **consecutive period comparisons** (period t vs period t-1)\n", + " - Each pre-period coefficient represents a one-period change\n", + " - These smaller incremental changes often yield lower t-statistics\n", + "\n", + "2. **Sun-Abraham**:\n", + " - Uses a **fixed reference period** (e=-1 when anticipation=0, or e=-1-anticipation otherwise)\n", + " - All coefficients are deviations from this single reference\n", + " - Pre-period coefficients show cumulative difference from the reference\n", + "\n", + "**To make CS pre-periods more comparable to SA**, use `base_period=\"universal\"`:\n", + "\n", + "```python\n", + "cs_universal = CallawaySantAnna(base_period=\"universal\")\n", + "```\n", + "\n", + "This makes CS compare all periods to g-1 (like SA), producing more similar pre-treatment estimates." + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Compare overall ATT from both estimators\nprint(\"Robustness Check: CS vs SA\")\nprint(\"=\" * 60)\nprint(f\"{'Estimator':<30} {'Overall ATT':>12} {'SE':>10}\")\nprint(\"-\" * 60)\nprint(f\"{'Callaway-Sant\\\\'Anna (varying)':<30} {results_cs.overall_att:>12.4f} {results_cs.overall_se:>10.4f}\")\nprint(f\"{'Sun-Abraham':<30} {results_sa.overall_att:>12.4f} {results_sa.overall_se:>10.4f}\")\n\n# Also fit CS with universal base period for comparison\ncs_universal = CallawaySantAnna(control_group=\"never_treated\", base_period=\"universal\")\nresults_cs_univ = cs_universal.fit(\n df, outcome=\"outcome\", unit=\"unit\",\n time=\"period\", first_treat=\"first_treat\",\n aggregate=\"event_study\"\n)\n\n# Compare event study effects\nprint(\"\\n\\nEvent Study Comparison:\")\nprint(\"Note: Pre-periods differ due to base period methodology (see explanation above)\")\nprint(f\"{'Rel. 
Time':>10} {'CS (vary)':>12} {'CS (univ)':>12} {'SA':>10} {'Note':>20}\")\nprint(\"-\" * 70)\n\nfor rel_time in sorted(results_sa.event_study_effects.keys()):\n sa_eff = results_sa.event_study_effects[rel_time]['effect']\n cs_vary = results_cs.event_study_effects.get(rel_time, {}).get('effect', np.nan)\n cs_univ = results_cs_univ.event_study_effects.get(rel_time, {}).get('effect', np.nan)\n \n note = \"pre (differs)\" if rel_time < 0 else \"post (matches)\"\n print(f\"{rel_time:>10} {cs_vary:>12.4f} {cs_univ:>12.4f} {sa_eff:>10.4f} {note:>20}\")\n\nprint(\"\\nPost-treatment effects should be similar across all methods\")\nprint(\"Pre-treatment differences are expected due to base period methodology\")" + "source": [ + "# Compare overall ATT from both estimators\n", + "cs_label = \"Callaway-Sant'Anna (varying)\"\n", + "print(\"Robustness Check: CS vs SA\")\n", + "print(\"=\" * 60)\n", + "print(f\"{'Estimator':<30} {'Overall ATT':>12} {'SE':>10}\")\n", + "print(\"-\" * 60)\n", + "print(f\"{cs_label:<30} {results_cs.overall_att:>12.4f} {results_cs.overall_se:>10.4f}\")\n", + "print(f\"{'Sun-Abraham':<30} {results_sa.overall_att:>12.4f} {results_sa.overall_se:>10.4f}\")\n", + "\n", + "# Also fit CS with universal base period for comparison\n", + "cs_universal = CallawaySantAnna(control_group=\"never_treated\", base_period=\"universal\")\n", + "results_cs_univ = cs_universal.fit(\n", + " df, outcome=\"outcome\", unit=\"unit\",\n", + " time=\"period\", first_treat=\"first_treat\",\n", + " aggregate=\"event_study\"\n", + ")\n", + "\n", + "# Compare event study effects\n", + "print(\"\\n\\nEvent Study Comparison:\")\n", + "print(\"Note: Pre-periods differ due to base period methodology (see explanation above)\")\n", + "print(f\"{'Rel. Time':>10} {'CS (vary)':>12} {'CS (univ)':>12} {'SA':>10} {'Note':>20}\")\n", + "print(\"-\" * 70)\n", + "\n", + "for rel_time in sorted(results_sa.event_study_effects.keys()):\n", + " sa_eff = results_sa.event_study_effects[rel_time]['effect']\n", + " cs_vary = results_cs.event_study_effects.get(rel_time, {}).get('effect', np.nan)\n", + " cs_univ = results_cs_univ.event_study_effects.get(rel_time, {}).get('effect', np.nan)\n", + " \n", + " note = \"pre (differs)\" if rel_time < 0 else \"post (matches)\"\n", + " print(f\"{rel_time:>10} {cs_vary:>12.4f} {cs_univ:>12.4f} {sa_eff:>10.4f} {note:>20}\")\n", + "\n", + "print(\"\\nPost-treatment effects should be similar across all methods\")\n", + "print(\"Pre-treatment differences are expected due to base period methodology\")" + ] }, { "cell_type": "markdown", "metadata": {}, - "source": "## Summary\n\nKey takeaways:\n\n1. **TWFE can be biased** with staggered adoption and heterogeneous effects\n2. **Goodman-Bacon decomposition** reveals *why* TWFE fails by showing:\n - The implicit 2x2 comparisons and their weights\n - How much weight falls on \"forbidden comparisons\" (already-treated as controls)\n3. **Callaway-Sant'Anna** properly handles staggered adoption by:\n - Computing group-time specific effects ATT(g,t)\n - Only using valid comparison groups\n - Properly aggregating effects\n4. **Sun-Abraham** provides an alternative approach using:\n - Interaction-weighted regression with cohort x relative-time indicators\n - Different weighting scheme than CS\n - More efficient under homogeneous effects\n5. **Run both CS and SA** as a robustness check—when they agree, results are more credible\n6. 
**Aggregation options**:\n   - `\"simple\"`: Overall ATT\n   - `\"group\"`: ATT by cohort\n   - `\"event\"`: ATT by event time (for event-study plots)\n7. **Bootstrap inference** provides valid standard errors and confidence intervals:\n   - Use `n_bootstrap` parameter to enable multiplier bootstrap\n   - Choose weight type: `'rademacher'`, `'mammen'`, or `'webb'`\n   - Bootstrap results include SEs, CIs, and p-values for all aggregations\n8. **Pre-treatment effects** provide parallel trends diagnostics:\n   - Use `base_period=\"varying\"` for consecutive period comparisons\n   - Pre-treatment ATT(g,t) should be near zero\n   - 95% CIs including zero is consistent with parallel trends\n   - See Tutorial 07 for pre-trends power analysis (Roth 2022)\n9. **Control group choices** affect efficiency and assumptions:\n   - `\"never_treated\"`: Stronger parallel trends assumption\n   - `\"not_yet_treated\"`: Weaker assumption, uses more data\n10. **CS vs SA pre-period differences are expected**:\n    - Post-treatment effects should be similar (robustness check)\n    - Pre-treatment effects differ due to base period methodology\n    - CS (varying): consecutive comparisons → one-period changes\n    - SA: fixed reference (e=-1-anticipation) → cumulative deviations\n    - Use `base_period=\"universal\"` in CS for comparable pre-periods\n\nFor more details, see:\n- Callaway, B., & Sant'Anna, P. H. (2021). Difference-in-differences with multiple time periods. *Journal of Econometrics*.\n- Sun, L., & Abraham, S. (2021). Estimating dynamic treatment effects in event studies with heterogeneous treatment effects. *Journal of Econometrics*.\n- Goodman-Bacon, A. (2021). Difference-in-differences with variation in treatment timing. *Journal of Econometrics*."
+   "source": [
+    "## Summary\n",
+    "\n",
+    "Key takeaways:\n",
+    "\n",
+    "1. **TWFE can be biased** with staggered adoption and heterogeneous effects\n",
+    "2. **Goodman-Bacon decomposition** reveals *why* TWFE fails by showing:\n",
+    "   - The implicit 2x2 comparisons and their weights\n",
+    "   - How much weight falls on \"forbidden comparisons\" (already-treated as controls)\n",
+    "3. **Callaway-Sant'Anna** properly handles staggered adoption by:\n",
+    "   - Computing group-time specific effects ATT(g,t)\n",
+    "   - Only using valid comparison groups\n",
+    "   - Properly aggregating effects\n",
+    "4. **Sun-Abraham** provides an alternative approach using:\n",
+    "   - Interaction-weighted regression with cohort x relative-time indicators\n",
+    "   - Different weighting scheme than CS\n",
+    "   - More efficient under homogeneous effects\n",
+    "5. **Run both CS and SA** as a robustness check—when they agree, results are more credible\n",
+    "6. **Aggregation options**:\n",
+    "   - `\"simple\"`: Overall ATT\n",
+    "   - `\"group\"`: ATT by cohort\n",
+    "   - `\"event\"`: ATT by event time (for event-study plots)\n",
+    "7. **Bootstrap inference** provides valid standard errors and confidence intervals:\n",
+    "   - Use `n_bootstrap` parameter to enable multiplier bootstrap\n",
+    "   - Choose weight type: `'rademacher'`, `'mammen'`, or `'webb'`\n",
+    "   - Bootstrap results include SEs, CIs, and p-values for all aggregations\n",
+    "8. **Pre-treatment effects** provide parallel trends diagnostics:\n",
+    "   - Use `base_period=\"varying\"` for consecutive period comparisons\n",
+    "   - Pre-treatment ATT(g,t) should be near zero\n",
+    "   - 95% CIs including zero are consistent with parallel trends\n",
+    "   - See Tutorial 07 for pre-trends power analysis (Roth 2022)\n",
+    "9. 
**Control group choices** affect efficiency and assumptions:\n", + " - `\"never_treated\"`: Stronger parallel trends assumption\n", + " - `\"not_yet_treated\"`: Weaker assumption, uses more data\n", + "10. **CS vs SA pre-period differences are expected**:\n", + " - Post-treatment effects should be similar (robustness check)\n", + " - Pre-treatment effects differ due to base period methodology\n", + " - CS (varying): consecutive comparisons → one-period changes\n", + " - SA: fixed reference (e=-1-anticipation) → cumulative deviations\n", + " - Use `base_period=\"universal\"` in CS for comparable pre-periods\n", + "\n", + "For more details, see:\n", + "- Callaway, B., & Sant'Anna, P. H. (2021). Difference-in-differences with multiple time periods. *Journal of Econometrics*.\n", + "- Sun, L., & Abraham, S. (2021). Estimating dynamic treatment effects in event studies with heterogeneous treatment effects. *Journal of Econometrics*.\n", + "- Goodman-Bacon, A. (2021). Difference-in-differences with variation in treatment timing. *Journal of Econometrics*." + ] } ], "metadata": { @@ -832,4 +963,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/tests/test_visualization.py b/tests/test_visualization.py index 4f78254..4a3ae3b 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -318,6 +318,262 @@ def test_plot_cs_with_anticipation(self): plt.close() + def test_plot_event_study_reference_period_normalization(self): + """Test that reference_period normalizes effects and sets reference SE to NaN. + + When reference_period is specified: + 1. The effect at that period is subtracted from all effects (ref period = 0) + 2. The SE at the reference period is set to NaN (it's a constraint, not an estimate) + 3. Other periods retain their original SEs and have error bars + + This follows the fixest (R) convention where the omitted/reference category + has no associated uncertainty (it's an identifying constraint). 
+ """ + pytest.importorskip("matplotlib") + import matplotlib.pyplot as plt + + # Create data where reference period (period=0) has effect=0.3 + df = pd.DataFrame({ + 'period': [-2, -1, 0, 1, 2], + 'effect': [0.1, 0.2, 0.3, 0.5, 0.6], # ref at 0 has effect 0.3 + 'se': [0.1, 0.1, 0.1, 0.1, 0.1] + }) + + ax = plot_event_study(df, reference_period=0, show=False) + + # Find plotted y-values by extracting data from Line2D objects + # The point estimates are plotted as individual markers + y_values = [] + for child in ax.get_children(): + # Line2D objects with single points are our markers + if hasattr(child, 'get_ydata'): + ydata = child.get_ydata() + if len(ydata) == 1: + y_values.append(float(ydata[0])) + + # After normalization: + # - Original effects: [0.1, 0.2, 0.3, 0.5, 0.6] + # - Reference effect: 0.3 + # - Normalized: [-0.2, -0.1, 0.0, 0.2, 0.3] + expected_normalized = [-0.2, -0.1, 0.0, 0.2, 0.3] + + # Check that reference period (0) is at y=0 + assert 0.0 in y_values or any(abs(y) < 0.01 for y in y_values), \ + f"Reference period should be at y=0, got y_values={y_values}" + + # Verify all expected normalized values are present + for expected in expected_normalized: + assert any(abs(y - expected) < 0.01 for y in y_values), \ + f"Expected normalized value {expected} not found in {y_values}" + + # Verify error bars: reference period (y=0) should have NO error bars + # while other periods should have error bars + # Error bars are drawn via ax.errorbar, which creates ErrorbarContainer or Line2D + # The error bar x-coordinates tell us which periods have error bars + + # Find the errorbar data (the line segments that form error bars) + errorbar_x_coords = set() + for child in ax.get_children(): + # ErrorbarContainer's children include LineCollection for the caps/stems + if hasattr(child, 'get_segments'): + segments = child.get_segments() + for seg in segments: + # Each segment is [[x1, y1], [x2, y2]] + if len(seg) >= 2: + # x-coordinate of error bar (both points have same x) + errorbar_x_coords.add(round(seg[0][0], 1)) + + # x-coordinates: period -2 -> x=0, -1 -> x=1, 0 -> x=2, 1 -> x=3, 2 -> x=4 + # The reference period (period=0) is at x=2 + reference_x = 2 # period 0 is at x-coordinate 2 + + # Reference period should NOT have error bars (x=2 should not be in errorbar_x_coords) + assert reference_x not in errorbar_x_coords, \ + f"Reference period should have no error bars but found error bar at x={reference_x}" + + # Other periods SHOULD have error bars + # At least some of x=0, x=1, x=3, x=4 should have error bars + non_ref_x_coords = {0, 1, 3, 4} + assert len(errorbar_x_coords & non_ref_x_coords) >= 2, \ + f"Non-reference periods should have error bars, found: {errorbar_x_coords}" + + plt.close() + + def test_plot_event_study_no_normalization_without_reference(self): + """Test that effects are NOT normalized when reference_period is None.""" + pytest.importorskip("matplotlib") + import matplotlib.pyplot as plt + + df = pd.DataFrame({ + 'period': [-1, 0, 1], + 'effect': [0.1, 0.3, 0.5], + 'se': [0.1, 0.1, 0.1] + }) + + ax = plot_event_study(df, reference_period=None, show=False) + + # Extract y-values + y_values = [] + for child in ax.get_children(): + if hasattr(child, 'get_ydata'): + ydata = child.get_ydata() + if len(ydata) == 1: + y_values.append(float(ydata[0])) + + # Without normalization, original values should be preserved + for expected in [0.1, 0.3, 0.5]: + assert any(abs(y - expected) < 0.01 for y in y_values), \ + f"Original value {expected} not found in {y_values}" + + 
plt.close() + + def test_plot_event_study_normalization_with_nan_reference(self): + """Test that normalization is skipped when reference effect is NaN.""" + pytest.importorskip("matplotlib") + import matplotlib.pyplot as plt + + df = pd.DataFrame({ + 'period': [-1, 0, 1], + 'effect': [0.1, np.nan, 0.5], # Reference period has NaN effect + 'se': [0.1, 0.1, 0.1] + }) + + # This should not raise and should skip normalization + ax = plot_event_study(df, reference_period=0, show=False) + + # Extract y-values (NaN effect is skipped in plotting) + y_values = [] + for child in ax.get_children(): + if hasattr(child, 'get_ydata'): + ydata = child.get_ydata() + if len(ydata) == 1: + y_values.append(float(ydata[0])) + + # Original non-NaN values should be preserved (not normalized) + for expected in [0.1, 0.5]: + assert any(abs(y - expected) < 0.01 for y in y_values), \ + f"Original value {expected} not found in {y_values}" + + plt.close() + + def test_plot_cs_results_no_auto_normalization(self, cs_results): + """Test that auto-inferred reference period does NOT normalize effects. + + When CallawaySantAnna results auto-infer reference_period=-1 (or from n_groups=0), + effects should NOT be normalized (just hollow marker styling). + Only explicit reference_period=X should trigger normalization. + """ + pytest.importorskip("matplotlib") + import matplotlib.pyplot as plt + + # Use fixture instead of re-fitting + results = cs_results + + # Get original effects from results (before any normalization) + original_effects = { + period: effect_data['effect'] + for period, effect_data in results.event_study_effects.items() + } + + # Plot WITHOUT explicitly specifying reference_period + # This should auto-infer reference but NOT normalize + ax = plot_event_study(results, show=False) + + # Extract plotted y-values + y_values = [] + for child in ax.get_children(): + if hasattr(child, 'get_ydata'): + ydata = child.get_ydata() + if len(ydata) == 1: + y_values.append(float(ydata[0])) + + # Verify that the original (non-normalized) effects are plotted + # Check that at least some non-zero effects are preserved + non_zero_originals = [e for e in original_effects.values() if abs(e) > 0.01] + assert len(non_zero_originals) > 0, "Should have non-zero original effects" + + # The key check: effects should NOT all be relative to some reference + # If normalized, reference would be at 0 and others shifted accordingly + # Since NOT normalized, we should see the original effect values + for period, orig_effect in original_effects.items(): + if np.isfinite(orig_effect): + # Check that original value is present (not normalized) + assert any(abs(y - orig_effect) < 0.05 for y in y_values), \ + f"Original effect {orig_effect:.3f} for period {period} " \ + f"should be plotted without normalization. Found y_values: {y_values}" + + plt.close() + + def test_plot_cs_results_explicit_reference_normalizes(self, cs_results): + """Test that explicit reference_period normalizes CallawaySantAnna results. + + When user explicitly passes reference_period=X to plot_event_study, + it should normalize effects (subtract ref effect) and set ref SE to NaN. 
+ """ + pytest.importorskip("matplotlib") + import matplotlib.pyplot as plt + + # Use fixture instead of re-fitting + results = cs_results + + # Get original effects from results + original_effects = { + period: effect_data['effect'] + for period, effect_data in results.event_study_effects.items() + } + + # Choose reference period (typically -1) + ref_period = -1 + ref_effect = original_effects.get(ref_period, 0.0) + + # Compute expected normalized effects + expected_normalized = { + period: effect - ref_effect + for period, effect in original_effects.items() + } + + # Plot WITH explicit reference_period - this SHOULD normalize + ax = plot_event_study(results, reference_period=ref_period, show=False) + + # Extract plotted y-values + y_values = [] + for child in ax.get_children(): + if hasattr(child, 'get_ydata'): + ydata = child.get_ydata() + if len(ydata) == 1: + y_values.append(float(ydata[0])) + + # The reference period should now be at y=0 (normalized) + assert any(abs(y) < 0.01 for y in y_values), \ + f"Reference period should be normalized to y=0, got y_values={y_values}" + + # Verify normalized values are present + for period, norm_effect in expected_normalized.items(): + if np.isfinite(norm_effect): + assert any(abs(y - norm_effect) < 0.05 for y in y_values), \ + f"Normalized effect {norm_effect:.3f} for period {period} " \ + f"not found in {y_values}" + + # Verify reference period has no error bars (SE was set to NaN) + # Find error bar x-coordinates + periods_in_plot = sorted(original_effects.keys()) + ref_x_idx = periods_in_plot.index(ref_period) if ref_period in periods_in_plot else None + + if ref_x_idx is not None: + errorbar_x_coords = set() + for child in ax.get_children(): + if hasattr(child, 'get_segments'): + segments = child.get_segments() + for seg in segments: + if len(seg) >= 2: + errorbar_x_coords.add(round(seg[0][0], 1)) + + # Reference period should NOT have error bars + assert ref_x_idx not in errorbar_x_coords, \ + f"Reference period at x={ref_x_idx} should have no error bars" + + plt.close() + class TestPlotEventStudyIntegration: """Integration tests for event study plotting."""
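For quick manual verification of the behavior exercised by these tests, here is a minimal usage sketch. It assumes `plot_event_study` is importable from `diff_diff.visualization` (the module patched above) and reuses the DataFrame columns (`period`, `effect`, `se`) and arguments (`reference_period`, `show`) that appear verbatim in the tests:

```python
import pandas as pd
import matplotlib.pyplot as plt

from diff_diff.visualization import plot_event_study  # import path assumed from repo layout

# Event-study estimates; the intended reference (period 0) has effect 0.3.
df = pd.DataFrame({
    'period': [-2, -1, 0, 1, 2],
    'effect': [0.1, 0.2, 0.3, 0.5, 0.6],
    'se': [0.1, 0.1, 0.1, 0.1, 0.1],
})

# Explicit reference: effects are re-centered so period 0 plots at zero
# ([-0.2, -0.1, 0.0, 0.2, 0.3]) and the reference period loses its error bars.
ax_explicit = plot_event_study(df, reference_period=0, show=False)

# No reference supplied: the original effects are plotted unchanged,
# with error bars on every period.
ax_raw = plot_event_study(df, reference_period=None, show=False)

plt.show()
```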