diff --git a/diff_diff/visualization.py b/diff_diff/visualization.py
index 6584b51..3ba3a80 100644
--- a/diff_diff/visualization.py
+++ b/diff_diff/visualization.py
@@ -73,8 +73,10 @@ def plot_event_study(
     periods : list, optional
         List of periods to plot. If None, uses all periods from results.
     reference_period : any, optional
-        The reference period (normalized to effect=0). Will be shown as a
-        hollow marker. If None, tries to infer from results.
+        The reference period to highlight. When explicitly provided, effects
+        are normalized (the reference effect is subtracted) and the reference
+        SE is set to NaN. If None, tries to infer from results; an auto-inferred
+        reference gets hollow marker styling only (no normalization).
     pre_periods : list, optional
         List of pre-treatment periods. Used for shading.
     post_periods : list, optional
@@ -151,8 +153,9 @@ def plot_event_study(
        trends holds. Large pre-treatment effects suggest the assumption
        may be violated.
 
-    2. **Reference period**: Usually the last pre-treatment period (t=-1),
-       normalized to zero. This is the omitted category.
+    2. **Reference period**: Usually the last pre-treatment period (t=-1).
+       When explicitly specified via ``reference_period``, effects are normalized
+       to zero at this period. When auto-inferred, shown with hollow marker only.
 
     3. **Post-treatment periods**: The treatment effects of interest.
        These show how the outcome evolved after treatment.
@@ -170,10 +173,18 @@ def plot_event_study(
 
     from scipy import stats as scipy_stats
 
+    # Track if reference_period was explicitly provided by user
+    reference_period_explicit = reference_period is not None
+
     # Extract data from results if provided
     if results is not None:
-        effects, se, periods, pre_periods, post_periods, reference_period = \
-            _extract_plot_data(results, periods, pre_periods, post_periods, reference_period)
+        extracted = _extract_plot_data(
+            results, periods, pre_periods, post_periods, reference_period
+        )
+        effects, se, periods, pre_periods, post_periods, reference_period, reference_inferred = extracted
+        # If reference was inferred from results, it was NOT explicitly provided
+        if reference_inferred:
+            reference_period_explicit = False
     elif effects is None or se is None:
         raise ValueError(
             "Must provide either 'results' or both 'effects' and 'se'"
@@ -192,6 +203,19 @@ def plot_event_study(
     # Compute confidence intervals
     critical_value = scipy_stats.norm.ppf(1 - alpha / 2)
 
+    # Normalize effects to reference period ONLY if explicitly specified by user.
+    # Auto-inferred reference periods (from CallawaySantAnna) just get hollow marker styling,
+    # NO normalization. This prevents unintended normalization when the reference period
+    # isn't a true identifying constraint (e.g., CallawaySantAnna with base_period="varying").
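+    # Illustrative (assumed) numbers: with effects {-2: 0.1, -1: 0.2, 0: 0.5} and an
+    # explicit reference_period=-1, the plotted effects become {-2: -0.1, -1: 0.0, 0: 0.3}
+    # and se[-1] becomes NaN; with an auto-inferred reference, all values plot unchanged.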
+ if (reference_period is not None and reference_period in effects and + reference_period_explicit): + ref_effect = effects[reference_period] + if np.isfinite(ref_effect): + effects = {p: e - ref_effect for p, e in effects.items()} + # Set reference SE to NaN (it's now a constraint, not an estimate) + # This follows fixest convention where the omitted category has no SE/CI + se = {p: (np.nan if p == reference_period else s) for p, s in se.items()} + plot_data = [] for period in periods: effect = effects.get(period, np.nan) @@ -304,14 +328,17 @@ def _extract_plot_data( pre_periods: Optional[List[Any]], post_periods: Optional[List[Any]], reference_period: Optional[Any], -) -> Tuple[Dict, Dict, List, List, List, Any]: +) -> Tuple[Dict, Dict, List, List, List, Any, bool]: """ Extract plotting data from various result types. Returns ------- tuple - (effects, se, periods, pre_periods, post_periods, reference_period) + (effects, se, periods, pre_periods, post_periods, reference_period, reference_inferred) + + reference_inferred is True if reference_period was auto-detected from results + rather than explicitly provided by the user. """ # Handle DataFrame input if isinstance(results, pd.DataFrame): @@ -328,7 +355,8 @@ def _extract_plot_data( if periods is None: periods = list(results['period']) - return effects, se, periods, pre_periods, post_periods, reference_period + # DataFrame input: reference_period was already set by caller, never inferred here + return effects, se, periods, pre_periods, post_periods, reference_period, False # Handle MultiPeriodDiDResults if hasattr(results, 'period_effects'): @@ -348,7 +376,8 @@ def _extract_plot_data( if periods is None: periods = post_periods - return effects, se, periods, pre_periods, post_periods, reference_period + # MultiPeriodDiDResults: reference_period was already set by caller, never inferred here + return effects, se, periods, pre_periods, post_periods, reference_period, False # Handle CallawaySantAnnaResults (event study aggregation) if hasattr(results, 'event_study_effects') and results.event_study_effects is not None: @@ -362,8 +391,12 @@ def _extract_plot_data( if periods is None: periods = sorted(effects.keys()) + # Track if reference_period was explicitly provided vs auto-inferred + reference_inferred = False + # Reference period is typically -1 for event study if reference_period is None: + reference_inferred = True # We're about to infer it # Detect reference period from n_groups=0 marker (normalization constraint) # This handles anticipation > 0 where reference is at e = -1 - anticipation for period, effect_data in results.event_study_effects.items(): @@ -380,7 +413,7 @@ def _extract_plot_data( if post_periods is None: post_periods = [p for p in periods if p >= 0] - return effects, se, periods, pre_periods, post_periods, reference_period + return effects, se, periods, pre_periods, post_periods, reference_period, reference_inferred raise TypeError( f"Cannot extract plot data from {type(results).__name__}. 
" diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 6d6c1b6..015ba98 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -728,6 +728,52 @@ n = 2(t_{α/2} + t_{1-κ})² σ² / MDE² --- +# Visualization + +## Event Study Plotting (`plot_event_study`) + +**Reference Period Normalization** + +Normalization only occurs when `reference_period` is **explicitly specified** by the user: + +- **Explicit `reference_period=X`**: Normalizes effects (subtracts ref effect), sets ref SE to NaN + - Point estimates: `effect_normalized = effect - effect_ref` + - Reference period SE → NaN (it's now a constraint, not an estimate) + - Other periods' SEs unchanged (uncertainty relative to the constraint) + - CIs recomputed from normalized effects and original SEs + +- **Auto-inferred reference** (from CallawaySantAnna results): Hollow marker styling only, no normalization + - Original effects are plotted unchanged + - Reference period shown with hollow marker for visual indication + - All periods retain their original SEs and error bars + +This design prevents unintended normalization when the reference period isn't a true +identifying constraint (e.g., CallawaySantAnna with `base_period="varying"` where different +cohorts use different comparison periods). + +The explicit-only normalization follows the `fixest` (R) convention where the omitted/reference +category is an identifying constraint with no associated uncertainty. Auto-inferred references +follow the `did` (R) package convention which does not normalize and reports full inference. + +**Rationale**: When normalizing to a reference period, we're treating that period as an +identifying constraint (effect ≡ 0 by definition). The variance of a constant is zero, +but since it's a constraint rather than an estimated quantity, we report NaN rather than 0. +Auto-inferred references may not represent true identifying constraints, so normalization +should be a deliberate user choice. + +**Edge Cases:** +- If `reference_period` not in data: No normalization applied +- If reference effect is NaN: No normalization applied +- Reference period CI becomes (NaN, NaN) after normalization (explicit only) +- Reference period is plotted with hollow marker (both explicit and auto-inferred) +- Reference period error bars: removed for explicit, retained for auto-inferred + +**Reference implementation(s):** +- R: `fixest::coefplot()` with reference category shown at 0 with no CI +- R: `did::ggdid()` does not normalize; shows full inference for all periods + +--- + # Cross-Reference: Standard Errors Summary | Estimator | Default SE | Alternatives | diff --git a/docs/tutorials/02_staggered_did.ipynb b/docs/tutorials/02_staggered_did.ipynb index 62d913d..4657608 100644 --- a/docs/tutorials/02_staggered_did.ipynb +++ b/docs/tutorials/02_staggered_did.ipynb @@ -3,7 +3,31 @@ { "cell_type": "markdown", "metadata": {}, - "source": "# Staggered Difference-in-Differences\n\nThis notebook demonstrates how to handle **staggered treatment adoption** using modern DiD estimators. In staggered DiD settings:\n\n- Different units get treated at different times\n- Traditional TWFE can give biased estimates due to \"forbidden comparisons\"\n- Modern estimators compute group-time specific effects and aggregate them properly\n\nWe'll cover:\n1. Understanding staggered adoption\n2. The problem with TWFE (and Goodman-Bacon decomposition)\n3. The Callaway-Sant'Anna estimator\n4. Group-time effects ATT(g,t)\n5. 
Aggregating effects (simple, group, event-study)\n6. Bootstrap inference for valid standard errors\n7. Visualization\n8. Pre-treatment effects and parallel trends testing\n9. Different control group options\n10. Handling anticipation effects\n11. Adding covariates\n12. Comparing with MultiPeriodDiD\n13. Sun-Abraham interaction-weighted estimator\n14. Comparing CS and SA as a robustness check" + "source": [ + "# Staggered Difference-in-Differences\n", + "\n", + "This notebook demonstrates how to handle **staggered treatment adoption** using modern DiD estimators. In staggered DiD settings:\n", + "\n", + "- Different units get treated at different times\n", + "- Traditional TWFE can give biased estimates due to \"forbidden comparisons\"\n", + "- Modern estimators compute group-time specific effects and aggregate them properly\n", + "\n", + "We'll cover:\n", + "1. Understanding staggered adoption\n", + "2. The problem with TWFE (and Goodman-Bacon decomposition)\n", + "3. The Callaway-Sant'Anna estimator\n", + "4. Group-time effects ATT(g,t)\n", + "5. Aggregating effects (simple, group, event-study)\n", + "6. Bootstrap inference for valid standard errors\n", + "7. Visualization\n", + "8. Pre-treatment effects and parallel trends testing\n", + "9. Different control group options\n", + "10. Handling anticipation effects\n", + "11. Adding covariates\n", + "12. Comparing with MultiPeriodDiD\n", + "13. Sun-Abraham interaction-weighted estimator\n", + "14. Comparing CS and SA as a robustness check" + ] }, { "cell_type": "code", @@ -810,19 +834,126 @@ { "cell_type": "markdown", "metadata": {}, - "source": "## 14. Comparing CS and SA as a Robustness Check\n\nRunning both estimators provides a useful robustness check. When they agree, results are more credible.\n\n### Understanding Pre-Period Differences\n\nYou may notice that **post-treatment effects align closely** between CS and SA, but **pre-treatment effects can differ in magnitude and significance**. This is expected methodological behavior, not a bug.\n\n**Why the difference?**\n\n1. **Callaway-Sant'Anna with `base_period=\"varying\"` (default)**:\n - Pre-treatment effects use **consecutive period comparisons** (period t vs period t-1)\n - Each pre-period coefficient represents a one-period change\n - These smaller incremental changes often yield lower t-statistics\n\n2. **Sun-Abraham**:\n - Uses a **fixed reference period** (e=-1 when anticipation=0, or e=-1-anticipation otherwise)\n - All coefficients are deviations from this single reference\n - Pre-period coefficients show cumulative difference from the reference\n\n**To make CS pre-periods more comparable to SA**, use `base_period=\"universal\"`:\n\n```python\ncs_universal = CallawaySantAnna(base_period=\"universal\")\n```\n\nThis makes CS compare all periods to g-1 (like SA), producing more similar pre-treatment estimates." + "source": [ + "## 14. Comparing CS and SA as a Robustness Check\n", + "\n", + "Running both estimators provides a useful robustness check. When they agree, results are more credible.\n", + "\n", + "### Understanding Pre-Period Differences\n", + "\n", + "You may notice that **post-treatment effects align closely** between CS and SA, but **pre-treatment effects can differ in magnitude and significance**. This is expected methodological behavior, not a bug.\n", + "\n", + "**Why the difference?**\n", + "\n", + "1. 
**Callaway-Sant'Anna with `base_period=\"varying\"` (default)**:\n", + " - Pre-treatment effects use **consecutive period comparisons** (period t vs period t-1)\n", + " - Each pre-period coefficient represents a one-period change\n", + " - These smaller incremental changes often yield lower t-statistics\n", + "\n", + "2. **Sun-Abraham**:\n", + " - Uses a **fixed reference period** (e=-1 when anticipation=0, or e=-1-anticipation otherwise)\n", + " - All coefficients are deviations from this single reference\n", + " - Pre-period coefficients show cumulative difference from the reference\n", + "\n", + "**To make CS pre-periods more comparable to SA**, use `base_period=\"universal\"`:\n", + "\n", + "```python\n", + "cs_universal = CallawaySantAnna(base_period=\"universal\")\n", + "```\n", + "\n", + "This makes CS compare all periods to g-1 (like SA), producing more similar pre-treatment estimates." + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Compare overall ATT from both estimators\nprint(\"Robustness Check: CS vs SA\")\nprint(\"=\" * 60)\nprint(f\"{'Estimator':<30} {'Overall ATT':>12} {'SE':>10}\")\nprint(\"-\" * 60)\nprint(f\"{'Callaway-Sant\\\\'Anna (varying)':<30} {results_cs.overall_att:>12.4f} {results_cs.overall_se:>10.4f}\")\nprint(f\"{'Sun-Abraham':<30} {results_sa.overall_att:>12.4f} {results_sa.overall_se:>10.4f}\")\n\n# Also fit CS with universal base period for comparison\ncs_universal = CallawaySantAnna(control_group=\"never_treated\", base_period=\"universal\")\nresults_cs_univ = cs_universal.fit(\n df, outcome=\"outcome\", unit=\"unit\",\n time=\"period\", first_treat=\"first_treat\",\n aggregate=\"event_study\"\n)\n\n# Compare event study effects\nprint(\"\\n\\nEvent Study Comparison:\")\nprint(\"Note: Pre-periods differ due to base period methodology (see explanation above)\")\nprint(f\"{'Rel. 
Time':>10} {'CS (vary)':>12} {'CS (univ)':>12} {'SA':>10} {'Note':>20}\")\nprint(\"-\" * 70)\n\nfor rel_time in sorted(results_sa.event_study_effects.keys()):\n sa_eff = results_sa.event_study_effects[rel_time]['effect']\n cs_vary = results_cs.event_study_effects.get(rel_time, {}).get('effect', np.nan)\n cs_univ = results_cs_univ.event_study_effects.get(rel_time, {}).get('effect', np.nan)\n \n note = \"pre (differs)\" if rel_time < 0 else \"post (matches)\"\n print(f\"{rel_time:>10} {cs_vary:>12.4f} {cs_univ:>12.4f} {sa_eff:>10.4f} {note:>20}\")\n\nprint(\"\\nPost-treatment effects should be similar across all methods\")\nprint(\"Pre-treatment differences are expected due to base period methodology\")" + "source": [ + "# Compare overall ATT from both estimators\n", + "cs_label = \"Callaway-Sant'Anna (varying)\"\n", + "print(\"Robustness Check: CS vs SA\")\n", + "print(\"=\" * 60)\n", + "print(f\"{'Estimator':<30} {'Overall ATT':>12} {'SE':>10}\")\n", + "print(\"-\" * 60)\n", + "print(f\"{cs_label:<30} {results_cs.overall_att:>12.4f} {results_cs.overall_se:>10.4f}\")\n", + "print(f\"{'Sun-Abraham':<30} {results_sa.overall_att:>12.4f} {results_sa.overall_se:>10.4f}\")\n", + "\n", + "# Also fit CS with universal base period for comparison\n", + "cs_universal = CallawaySantAnna(control_group=\"never_treated\", base_period=\"universal\")\n", + "results_cs_univ = cs_universal.fit(\n", + " df, outcome=\"outcome\", unit=\"unit\",\n", + " time=\"period\", first_treat=\"first_treat\",\n", + " aggregate=\"event_study\"\n", + ")\n", + "\n", + "# Compare event study effects\n", + "print(\"\\n\\nEvent Study Comparison:\")\n", + "print(\"Note: Pre-periods differ due to base period methodology (see explanation above)\")\n", + "print(f\"{'Rel. Time':>10} {'CS (vary)':>12} {'CS (univ)':>12} {'SA':>10} {'Note':>20}\")\n", + "print(\"-\" * 70)\n", + "\n", + "for rel_time in sorted(results_sa.event_study_effects.keys()):\n", + " sa_eff = results_sa.event_study_effects[rel_time]['effect']\n", + " cs_vary = results_cs.event_study_effects.get(rel_time, {}).get('effect', np.nan)\n", + " cs_univ = results_cs_univ.event_study_effects.get(rel_time, {}).get('effect', np.nan)\n", + " \n", + " note = \"pre (differs)\" if rel_time < 0 else \"post (matches)\"\n", + " print(f\"{rel_time:>10} {cs_vary:>12.4f} {cs_univ:>12.4f} {sa_eff:>10.4f} {note:>20}\")\n", + "\n", + "print(\"\\nPost-treatment effects should be similar across all methods\")\n", + "print(\"Pre-treatment differences are expected due to base period methodology\")" + ] }, { "cell_type": "markdown", "metadata": {}, - "source": "## Summary\n\nKey takeaways:\n\n1. **TWFE can be biased** with staggered adoption and heterogeneous effects\n2. **Goodman-Bacon decomposition** reveals *why* TWFE fails by showing:\n - The implicit 2x2 comparisons and their weights\n - How much weight falls on \"forbidden comparisons\" (already-treated as controls)\n3. **Callaway-Sant'Anna** properly handles staggered adoption by:\n - Computing group-time specific effects ATT(g,t)\n - Only using valid comparison groups\n - Properly aggregating effects\n4. **Sun-Abraham** provides an alternative approach using:\n - Interaction-weighted regression with cohort x relative-time indicators\n - Different weighting scheme than CS\n - More efficient under homogeneous effects\n5. **Run both CS and SA** as a robustness check—when they agree, results are more credible\n6. 
**Aggregation options**:\n   - `\"simple\"`: Overall ATT\n   - `\"group\"`: ATT by cohort\n   - `\"event\"`: ATT by event time (for event-study plots)\n7. **Bootstrap inference** provides valid standard errors and confidence intervals:\n   - Use `n_bootstrap` parameter to enable multiplier bootstrap\n   - Choose weight type: `'rademacher'`, `'mammen'`, or `'webb'`\n   - Bootstrap results include SEs, CIs, and p-values for all aggregations\n8. **Pre-treatment effects** provide parallel trends diagnostics:\n   - Use `base_period=\"varying\"` for consecutive period comparisons\n   - Pre-treatment ATT(g,t) should be near zero\n   - 95% CIs including zero is consistent with parallel trends\n   - See Tutorial 07 for pre-trends power analysis (Roth 2022)\n9. **Control group choices** affect efficiency and assumptions:\n   - `\"never_treated\"`: Stronger parallel trends assumption\n   - `\"not_yet_treated\"`: Weaker assumption, uses more data\n10. **CS vs SA pre-period differences are expected**:\n    - Post-treatment effects should be similar (robustness check)\n    - Pre-treatment effects differ due to base period methodology\n    - CS (varying): consecutive comparisons → one-period changes\n    - SA: fixed reference (e=-1-anticipation) → cumulative deviations\n    - Use `base_period=\"universal\"` in CS for comparable pre-periods\n\nFor more details, see:\n- Callaway, B., & Sant'Anna, P. H. (2021). Difference-in-differences with multiple time periods. *Journal of Econometrics*.\n- Sun, L., & Abraham, S. (2021). Estimating dynamic treatment effects in event studies with heterogeneous treatment effects. *Journal of Econometrics*.\n- Goodman-Bacon, A. (2021). Difference-in-differences with variation in treatment timing. *Journal of Econometrics*."
+   "source": [
+    "## Summary\n",
+    "\n",
+    "Key takeaways:\n",
+    "\n",
+    "1. **TWFE can be biased** with staggered adoption and heterogeneous effects\n",
+    "2. **Goodman-Bacon decomposition** reveals *why* TWFE fails by showing:\n",
+    "   - The implicit 2x2 comparisons and their weights\n",
+    "   - How much weight falls on \"forbidden comparisons\" (already-treated as controls)\n",
+    "3. **Callaway-Sant'Anna** properly handles staggered adoption by:\n",
+    "   - Computing group-time specific effects ATT(g,t)\n",
+    "   - Only using valid comparison groups\n",
+    "   - Properly aggregating effects\n",
+    "4. **Sun-Abraham** provides an alternative approach using:\n",
+    "   - Interaction-weighted regression with cohort x relative-time indicators\n",
+    "   - Different weighting scheme than CS\n",
+    "   - More efficient under homogeneous effects\n",
+    "5. **Run both CS and SA** as a robustness check—when they agree, results are more credible\n",
+    "6. **Aggregation options**:\n",
+    "   - `\"simple\"`: Overall ATT\n",
+    "   - `\"group\"`: ATT by cohort\n",
+    "   - `\"event\"`: ATT by event time (for event-study plots)\n",
+    "7. **Bootstrap inference** provides valid standard errors and confidence intervals:\n",
+    "   - Use `n_bootstrap` parameter to enable multiplier bootstrap\n",
+    "   - Choose weight type: `'rademacher'`, `'mammen'`, or `'webb'`\n",
+    "   - Bootstrap results include SEs, CIs, and p-values for all aggregations\n",
+    "8. **Pre-treatment effects** provide parallel trends diagnostics:\n",
+    "   - Use `base_period=\"varying\"` for consecutive period comparisons\n",
+    "   - Pre-treatment ATT(g,t) should be near zero\n",
+    "   - 95% CIs including zero are consistent with parallel trends\n",
+    "   - See Tutorial 07 for pre-trends power analysis (Roth 2022)\n",
+    "9. 
**Control group choices** affect efficiency and assumptions:\n", + " - `\"never_treated\"`: Stronger parallel trends assumption\n", + " - `\"not_yet_treated\"`: Weaker assumption, uses more data\n", + "10. **CS vs SA pre-period differences are expected**:\n", + " - Post-treatment effects should be similar (robustness check)\n", + " - Pre-treatment effects differ due to base period methodology\n", + " - CS (varying): consecutive comparisons → one-period changes\n", + " - SA: fixed reference (e=-1-anticipation) → cumulative deviations\n", + " - Use `base_period=\"universal\"` in CS for comparable pre-periods\n", + "\n", + "For more details, see:\n", + "- Callaway, B., & Sant'Anna, P. H. (2021). Difference-in-differences with multiple time periods. *Journal of Econometrics*.\n", + "- Sun, L., & Abraham, S. (2021). Estimating dynamic treatment effects in event studies with heterogeneous treatment effects. *Journal of Econometrics*.\n", + "- Goodman-Bacon, A. (2021). Difference-in-differences with variation in treatment timing. *Journal of Econometrics*." + ] } ], "metadata": { @@ -832,4 +963,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/tests/test_visualization.py b/tests/test_visualization.py index 4f78254..4a3ae3b 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -318,6 +318,262 @@ def test_plot_cs_with_anticipation(self): plt.close() + def test_plot_event_study_reference_period_normalization(self): + """Test that reference_period normalizes effects and sets reference SE to NaN. + + When reference_period is specified: + 1. The effect at that period is subtracted from all effects (ref period = 0) + 2. The SE at the reference period is set to NaN (it's a constraint, not an estimate) + 3. Other periods retain their original SEs and have error bars + + This follows the fixest (R) convention where the omitted/reference category + has no associated uncertainty (it's an identifying constraint). 
+ """ + pytest.importorskip("matplotlib") + import matplotlib.pyplot as plt + + # Create data where reference period (period=0) has effect=0.3 + df = pd.DataFrame({ + 'period': [-2, -1, 0, 1, 2], + 'effect': [0.1, 0.2, 0.3, 0.5, 0.6], # ref at 0 has effect 0.3 + 'se': [0.1, 0.1, 0.1, 0.1, 0.1] + }) + + ax = plot_event_study(df, reference_period=0, show=False) + + # Find plotted y-values by extracting data from Line2D objects + # The point estimates are plotted as individual markers + y_values = [] + for child in ax.get_children(): + # Line2D objects with single points are our markers + if hasattr(child, 'get_ydata'): + ydata = child.get_ydata() + if len(ydata) == 1: + y_values.append(float(ydata[0])) + + # After normalization: + # - Original effects: [0.1, 0.2, 0.3, 0.5, 0.6] + # - Reference effect: 0.3 + # - Normalized: [-0.2, -0.1, 0.0, 0.2, 0.3] + expected_normalized = [-0.2, -0.1, 0.0, 0.2, 0.3] + + # Check that reference period (0) is at y=0 + assert 0.0 in y_values or any(abs(y) < 0.01 for y in y_values), \ + f"Reference period should be at y=0, got y_values={y_values}" + + # Verify all expected normalized values are present + for expected in expected_normalized: + assert any(abs(y - expected) < 0.01 for y in y_values), \ + f"Expected normalized value {expected} not found in {y_values}" + + # Verify error bars: reference period (y=0) should have NO error bars + # while other periods should have error bars + # Error bars are drawn via ax.errorbar, which creates ErrorbarContainer or Line2D + # The error bar x-coordinates tell us which periods have error bars + + # Find the errorbar data (the line segments that form error bars) + errorbar_x_coords = set() + for child in ax.get_children(): + # ErrorbarContainer's children include LineCollection for the caps/stems + if hasattr(child, 'get_segments'): + segments = child.get_segments() + for seg in segments: + # Each segment is [[x1, y1], [x2, y2]] + if len(seg) >= 2: + # x-coordinate of error bar (both points have same x) + errorbar_x_coords.add(round(seg[0][0], 1)) + + # x-coordinates: period -2 -> x=0, -1 -> x=1, 0 -> x=2, 1 -> x=3, 2 -> x=4 + # The reference period (period=0) is at x=2 + reference_x = 2 # period 0 is at x-coordinate 2 + + # Reference period should NOT have error bars (x=2 should not be in errorbar_x_coords) + assert reference_x not in errorbar_x_coords, \ + f"Reference period should have no error bars but found error bar at x={reference_x}" + + # Other periods SHOULD have error bars + # At least some of x=0, x=1, x=3, x=4 should have error bars + non_ref_x_coords = {0, 1, 3, 4} + assert len(errorbar_x_coords & non_ref_x_coords) >= 2, \ + f"Non-reference periods should have error bars, found: {errorbar_x_coords}" + + plt.close() + + def test_plot_event_study_no_normalization_without_reference(self): + """Test that effects are NOT normalized when reference_period is None.""" + pytest.importorskip("matplotlib") + import matplotlib.pyplot as plt + + df = pd.DataFrame({ + 'period': [-1, 0, 1], + 'effect': [0.1, 0.3, 0.5], + 'se': [0.1, 0.1, 0.1] + }) + + ax = plot_event_study(df, reference_period=None, show=False) + + # Extract y-values + y_values = [] + for child in ax.get_children(): + if hasattr(child, 'get_ydata'): + ydata = child.get_ydata() + if len(ydata) == 1: + y_values.append(float(ydata[0])) + + # Without normalization, original values should be preserved + for expected in [0.1, 0.3, 0.5]: + assert any(abs(y - expected) < 0.01 for y in y_values), \ + f"Original value {expected} not found in {y_values}" + + 
plt.close() + + def test_plot_event_study_normalization_with_nan_reference(self): + """Test that normalization is skipped when reference effect is NaN.""" + pytest.importorskip("matplotlib") + import matplotlib.pyplot as plt + + df = pd.DataFrame({ + 'period': [-1, 0, 1], + 'effect': [0.1, np.nan, 0.5], # Reference period has NaN effect + 'se': [0.1, 0.1, 0.1] + }) + + # This should not raise and should skip normalization + ax = plot_event_study(df, reference_period=0, show=False) + + # Extract y-values (NaN effect is skipped in plotting) + y_values = [] + for child in ax.get_children(): + if hasattr(child, 'get_ydata'): + ydata = child.get_ydata() + if len(ydata) == 1: + y_values.append(float(ydata[0])) + + # Original non-NaN values should be preserved (not normalized) + for expected in [0.1, 0.5]: + assert any(abs(y - expected) < 0.01 for y in y_values), \ + f"Original value {expected} not found in {y_values}" + + plt.close() + + def test_plot_cs_results_no_auto_normalization(self, cs_results): + """Test that auto-inferred reference period does NOT normalize effects. + + When CallawaySantAnna results auto-infer reference_period=-1 (or from n_groups=0), + effects should NOT be normalized (just hollow marker styling). + Only explicit reference_period=X should trigger normalization. + """ + pytest.importorskip("matplotlib") + import matplotlib.pyplot as plt + + # Use fixture instead of re-fitting + results = cs_results + + # Get original effects from results (before any normalization) + original_effects = { + period: effect_data['effect'] + for period, effect_data in results.event_study_effects.items() + } + + # Plot WITHOUT explicitly specifying reference_period + # This should auto-infer reference but NOT normalize + ax = plot_event_study(results, show=False) + + # Extract plotted y-values + y_values = [] + for child in ax.get_children(): + if hasattr(child, 'get_ydata'): + ydata = child.get_ydata() + if len(ydata) == 1: + y_values.append(float(ydata[0])) + + # Verify that the original (non-normalized) effects are plotted + # Check that at least some non-zero effects are preserved + non_zero_originals = [e for e in original_effects.values() if abs(e) > 0.01] + assert len(non_zero_originals) > 0, "Should have non-zero original effects" + + # The key check: effects should NOT all be relative to some reference + # If normalized, reference would be at 0 and others shifted accordingly + # Since NOT normalized, we should see the original effect values + for period, orig_effect in original_effects.items(): + if np.isfinite(orig_effect): + # Check that original value is present (not normalized) + assert any(abs(y - orig_effect) < 0.05 for y in y_values), \ + f"Original effect {orig_effect:.3f} for period {period} " \ + f"should be plotted without normalization. Found y_values: {y_values}" + + plt.close() + + def test_plot_cs_results_explicit_reference_normalizes(self, cs_results): + """Test that explicit reference_period normalizes CallawaySantAnna results. + + When user explicitly passes reference_period=X to plot_event_study, + it should normalize effects (subtract ref effect) and set ref SE to NaN. 
+ """ + pytest.importorskip("matplotlib") + import matplotlib.pyplot as plt + + # Use fixture instead of re-fitting + results = cs_results + + # Get original effects from results + original_effects = { + period: effect_data['effect'] + for period, effect_data in results.event_study_effects.items() + } + + # Choose reference period (typically -1) + ref_period = -1 + ref_effect = original_effects.get(ref_period, 0.0) + + # Compute expected normalized effects + expected_normalized = { + period: effect - ref_effect + for period, effect in original_effects.items() + } + + # Plot WITH explicit reference_period - this SHOULD normalize + ax = plot_event_study(results, reference_period=ref_period, show=False) + + # Extract plotted y-values + y_values = [] + for child in ax.get_children(): + if hasattr(child, 'get_ydata'): + ydata = child.get_ydata() + if len(ydata) == 1: + y_values.append(float(ydata[0])) + + # The reference period should now be at y=0 (normalized) + assert any(abs(y) < 0.01 for y in y_values), \ + f"Reference period should be normalized to y=0, got y_values={y_values}" + + # Verify normalized values are present + for period, norm_effect in expected_normalized.items(): + if np.isfinite(norm_effect): + assert any(abs(y - norm_effect) < 0.05 for y in y_values), \ + f"Normalized effect {norm_effect:.3f} for period {period} " \ + f"not found in {y_values}" + + # Verify reference period has no error bars (SE was set to NaN) + # Find error bar x-coordinates + periods_in_plot = sorted(original_effects.keys()) + ref_x_idx = periods_in_plot.index(ref_period) if ref_period in periods_in_plot else None + + if ref_x_idx is not None: + errorbar_x_coords = set() + for child in ax.get_children(): + if hasattr(child, 'get_segments'): + segments = child.get_segments() + for seg in segments: + if len(seg) >= 2: + errorbar_x_coords.add(round(seg[0][0], 1)) + + # Reference period should NOT have error bars + assert ref_x_idx not in errorbar_x_coords, \ + f"Reference period at x={ref_x_idx} should have no error bars" + + plt.close() + class TestPlotEventStudyIntegration: """Integration tests for event study plotting."""
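For quick manual verification of the behavior exercised by these tests, here is a minimal usage sketch. It assumes `plot_event_study` is importable from `diff_diff.visualization` (the module patched above) and reuses the DataFrame columns (`period`, `effect`, `se`) and arguments (`reference_period`, `show`) that appear verbatim in the tests:

```python
import pandas as pd
import matplotlib.pyplot as plt

from diff_diff.visualization import plot_event_study  # import path assumed from repo layout

# Event-study estimates; the intended reference (period 0) has effect 0.3.
df = pd.DataFrame({
    'period': [-2, -1, 0, 1, 2],
    'effect': [0.1, 0.2, 0.3, 0.5, 0.6],
    'se': [0.1, 0.1, 0.1, 0.1, 0.1],
})

# Explicit reference: effects are re-centered so period 0 plots at zero
# ([-0.2, -0.1, 0.0, 0.2, 0.3]) and the reference period loses its error bars.
ax_explicit = plot_event_study(df, reference_period=0, show=False)

# No reference supplied: the original effects are plotted unchanged,
# with error bars on every period.
ax_raw = plot_event_study(df, reference_period=None, show=False)

plt.show()
```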