From bcfd50e7bf13b633b52ca2cbd10a10decb236eb0 Mon Sep 17 00:00:00 2001
From: weimingc <17592131+meenchen@users.noreply.github.com>
Date: Thu, 23 Apr 2026 21:55:20 -0700
Subject: [PATCH 1/6] fix ptq for fused experts

Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com>
---
 examples/llm_ptq/hf_ptq.py                    | 17 ++++++++++++
 modelopt/torch/export/unified_export_hf.py    |  9 +++++++
 modelopt/torch/quantization/conversion.py     | 27 ++++++++++++++++++-
 .../ptq/nvfp4_experts_only-fp8_kv.yaml        |  4 ++-
 4 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index d660c1de4c..5586648297 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -441,6 +441,23 @@ def load_model(args: argparse.Namespace):
             )
         calibration_only = True
 
+    # Force `_experts_implementation = "eager"` so HF's `@use_experts_implementation`
+    # dispatcher calls the original F.linear-based forward (not torch._grouped_mm /
+    # torch.bmm). ModelOpt's `_QuantFusedExperts` intercepts F.linear to feed the
+    # shared input/weight quantizers; non-eager backends bypass the hook entirely,
+    # leaving expert quantizers uncalibrated (no amax → no input_scale).
+    def _force_eager_experts(cfg):
+        if cfg is None:
+            return
+        if hasattr(cfg, "_experts_implementation"):
+            cfg._experts_implementation = "eager"
+        for sub in ("text_config", "vision_config", "audio_config", "speech_config"):
+            if hasattr(cfg, sub):
+                _force_eager_experts(getattr(cfg, sub))
+
+    if hasattr(full_model, "config"):
+        _force_eager_experts(full_model.config)
+
     model_type = get_model_type(full_model)
 
     device = full_model.device
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index af936a3002..1be8ee41cb 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -646,6 +646,15 @@ def _process_quantized_modules(
             "QuantFP8Linear" in type(sub_module).__name__ and sub_module.weight.element_size() <= 1
         ):
             sub_module.unpack_weight()
+        # _QuantFusedExperts uses plural `gate_up_proj_weight_quantizers` (ModuleList),
+        # which get_quantization_format's singular-weight_quantizer check misses. Handle
+        # it explicitly before the format gate so fused-experts get split + quantized.
+        if hasattr(sub_module, "gate_up_proj_weight_quantizers"):
+            from modelopt.torch.export.moe_utils import _export_fused_experts
+
+            with fsdp2_aware_weight_update(model, sub_module, reshard=False):
+                _export_fused_experts(sub_module, dtype)
+            continue
         if get_quantization_format(sub_module) != QUANTIZATION_NONE:
             # Skip QuantMoELinear - it's handled separately in _reconstruct_fused_moe_linear
             if type(sub_module).__name__ == "QuantMoELinear":
diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py
index 55f7fdf6fc..55382352c6 100644
--- a/modelopt/torch/quantization/conversion.py
+++ b/modelopt/torch/quantization/conversion.py
@@ -16,6 +16,7 @@
 """Quantization conversion/restore utilities."""
 
 import fnmatch
+import re
 import warnings
 from collections.abc import Callable
 from contextlib import contextmanager
@@ -286,6 +287,26 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType
             set_quantizer_attributes_full(quant_model, quantizer_name, attributes, parent_class)
 
 
+_FUSED_EXPERTS_QUANTIZER_LIST_RE = re.compile(r"(weight_quantizers|input_quantizers)\.\d+(?=$|\.)")
+
+
+def _normalize_fused_experts_quantizer_name(name: str) -> str:
+    """Strip the per-expert index from ``_QuantFusedExperts`` ModuleList quantizer names.
+
+    ``_QuantFusedExperts`` registers per-expert weight/input quantizers as
+    ``nn.ModuleList``s named e.g. ``gate_up_proj_weight_quantizers`` — its children
+    get dotted names like ``...gate_up_proj_weight_quantizers.0``. These don't match
+    the singular-suffix wildcards (``*weight_quantizer``) used in the stock configs,
+    so the experts stay at their defaults. Return a normalized name where
+    ``weight_quantizers.N`` / ``input_quantizers.N`` collapse to their singular form
+    so the standard wildcards match.
+    """
+    return _FUSED_EXPERTS_QUANTIZER_LIST_RE.sub(
+        lambda m: m.group(1)[:-1],
+        name,  # "weight_quantizers" -> "weight_quantizer"
+    )
+
+
 def _match_quantizer(
     wildcard_or_filter_func: str | Callable,
     name: str,
@@ -296,7 +317,11 @@ def _match_quantizer(
     if not isinstance(module, (TensorQuantizer, SequentialQuantizer)):
         return False
     if isinstance(wildcard_or_filter_func, str):
-        if not fnmatch.fnmatch(name, wildcard_or_filter_func):
+        normalized = _normalize_fused_experts_quantizer_name(name)
+        if not (
+            fnmatch.fnmatch(name, wildcard_or_filter_func)
+            or (normalized != name and fnmatch.fnmatch(normalized, wildcard_or_filter_func))
+        ):
             return False
     elif callable(wildcard_or_filter_func):
         if not wildcard_or_filter_func(name):
diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml
index 220d062232..7c55703963 100644
--- a/modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml
+++ b/modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml
@@ -20,7 +20,9 @@ quantize:
   algorithm:
     method: max
     # Max calibration is fast and does not typically need checkpointing.
-    layerwise: true
+    # layerwise=false required for VLMs where the decoder layers are nested under
+    # `model.language_model.layers` (layerwise_calibrate can't find them otherwise).
+    layerwise: false
   quant_cfg:
     - quantizer_name: '*'
       enable: false

From 99e387d3927564f457b0c5ed0115e63f8b8012de Mon Sep 17 00:00:00 2001
From: weimingc <17592131+meenchen@users.noreply.github.com>
Date: Fri, 24 Apr 2026 09:40:18 -0700
Subject: [PATCH 2/6] refactor adhoc design/add test

Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com>
---
 examples/llm_ptq/hf_ptq.py                    |  17 ---
 .../torch/quantization/plugins/huggingface.py |  33 ++++
 .../plugins/test_fused_experts.py             | 142 ++++++++++++++++++
 3 files changed, 175 insertions(+), 17 deletions(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 5586648297..d660c1de4c 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -441,23 +441,6 @@ def load_model(args: argparse.Namespace):
             )
         calibration_only = True
 
-    # Force `_experts_implementation = "eager"` so HF's `@use_experts_implementation`
-    # dispatcher calls the original F.linear-based forward (not torch._grouped_mm /
-    # torch.bmm). ModelOpt's `_QuantFusedExperts` intercepts F.linear to feed the
-    # shared input/weight quantizers; non-eager backends bypass the hook entirely,
-    # leaving expert quantizers uncalibrated (no amax → no input_scale).
-    def _force_eager_experts(cfg):
-        if cfg is None:
-            return
-        if hasattr(cfg, "_experts_implementation"):
-            cfg._experts_implementation = "eager"
-        for sub in ("text_config", "vision_config", "audio_config", "speech_config"):
-            if hasattr(cfg, sub):
-                _force_eager_experts(getattr(cfg, sub))
-
-    if hasattr(full_model, "config"):
-        _force_eager_experts(full_model.config)
-
     model_type = get_model_type(full_model)
 
     device = full_model.device
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index ba40128fa5..7ea372f1fc 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -1438,6 +1438,38 @@ def register_fused_experts_on_the_fly(model):
             QuantModuleRegistry.register({mod_type: f"hf.{mod_type.__name__}"})(_QuantFusedExperts)
 
 
+def force_eager_experts_impl_on_the_fly(model):
+    """Force HF fused-experts modules onto the eager ``F.linear``-based forward.
+
+    HF transformers 5.0+ decorates fused-experts forwards with
+    ``@use_experts_implementation``, which may dispatch to ``torch._grouped_mm``
+    or ``torch.bmm`` backends. Those backends bypass ``F.linear`` and so bypass
+    ``_QuantFusedExperts``'s input/weight quantizer hooks — calibration silently
+    does nothing, no ``input_scale`` / ``amax`` is collected, and the exported
+    checkpoint produces garbage at inference.
+
+    Sets ``config._experts_implementation = "eager"`` on the model config (and
+    recursively on ``text_config`` / ``vision_config`` / ``audio_config`` /
+    ``speech_config``) whenever a fused-experts module is present.
+    """
+    if not any(_is_fused_experts_module(m) for m in model.modules()):
+        return
+
+    nested_cfg_attrs = ("text_config", "vision_config", "audio_config", "speech_config")
+
+    def _force(cfg):
+        if cfg is None:
+            return
+        if hasattr(cfg, "_experts_implementation"):
+            cfg._experts_implementation = "eager"
+        for sub in nested_cfg_attrs:
+            if hasattr(cfg, sub):
+                _force(getattr(cfg, sub))
+
+    if hasattr(model, "config"):
+        _force(model.config)
+
+
 def _is_supported_hf_model(model):
     """Check if the model a valid model for transformers quantization specific support."""
     supported_models = [transformers.PreTrainedModel]
@@ -1665,6 +1697,7 @@ def _reconstruct_fused_moe_linear(model: nn.Module) -> None:
         register_dbrx_moe_on_the_fly,
         register_step3p5_moe_on_the_fly,
         register_fused_experts_on_the_fly,
+        force_eager_experts_impl_on_the_fly,
         register_sparse_moe_on_the_fly,
         register_hf_attentions_on_the_fly,
         convert_hf_parallel_linears_on_the_fly,
diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py
index 7e77bf1151..0e90be5080 100644
--- a/tests/unit/torch/quantization/plugins/test_fused_experts.py
+++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py
@@ -27,6 +27,7 @@
     _is_fused_experts_module,
     _is_sparse_sequaential_moe_block,
     _QuantFusedExperts,
+    force_eager_experts_impl_on_the_fly,
     register_fused_experts_on_the_fly,
     register_sparse_moe_on_the_fly,
 )
@@ -297,3 +298,144 @@ def test_export_creates_per_expert_submodules(self):
 
         if QuantModuleRegistry.get(expert_type) is not None:
             QuantModuleRegistry.unregister(expert_type)
+
+
+# ---------------------------------------------------------------------------
+# Tests for force_eager_experts_impl_on_the_fly
+# ---------------------------------------------------------------------------
+class _StubConfig:
+    """Minimal stand-in for HF PretrainedConfig with optional nested sub-configs."""
+
+    def __init__(self, impl=None, **nested):
+        if impl is not None:
+            self._experts_implementation = impl
+        for key, value in nested.items():
+            setattr(self, key, value)
+
+
+class _TinyMoEModelWithConfig(_TinyMoEModel):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+
+class _NonMoEModelWithConfig(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.linear = nn.Linear(HIDDEN_DIM, HIDDEN_DIM)
+        self.config = config
+
+
+class TestForceEagerExpertsImpl:
+    def test_sets_eager_on_moe_model(self):
+        """Non-eager backend on an MoE model gets flipped to eager."""
+        cfg = _StubConfig(impl="kernels")
+        model = _TinyMoEModelWithConfig(cfg)
+        force_eager_experts_impl_on_the_fly(model)
+        assert cfg._experts_implementation == "eager"
+
+    def test_recurses_into_nested_configs(self):
+        """VLM-style nested text_config / vision_config are also flipped."""
+        text_cfg = _StubConfig(impl="grouped_mm")
+        vision_cfg = _StubConfig(impl="bmm")
+        root_cfg = _StubConfig(text_config=text_cfg, vision_config=vision_cfg)
+        model = _TinyMoEModelWithConfig(root_cfg)
+        force_eager_experts_impl_on_the_fly(model)
+        assert text_cfg._experts_implementation == "eager"
+        assert vision_cfg._experts_implementation == "eager"
+
+    def test_skips_model_without_fused_experts(self):
+        """Non-MoE models must not have their config silently mutated."""
+        cfg = _StubConfig(impl="kernels")
+        model = _NonMoEModelWithConfig(cfg)
+        force_eager_experts_impl_on_the_fly(model)
+        assert cfg._experts_implementation == "kernels"
+
+    def test_no_crash_when_config_missing(self):
+        """Model without a ``config`` attribute must not raise."""
+        force_eager_experts_impl_on_the_fly(_TinyMoEModel())  # no-op, no error
+
+    def test_no_crash_when_impl_attr_missing(self):
+        """Config without ``_experts_implementation`` must not raise."""
+        cfg = _StubConfig()  # no impl attr
+        model = _TinyMoEModelWithConfig(cfg)
+        force_eager_experts_impl_on_the_fly(model)
+        assert not hasattr(cfg, "_experts_implementation")
+
+    def test_leaves_eager_value_unchanged(self):
+        cfg = _StubConfig(impl="eager")
+        model = _TinyMoEModelWithConfig(cfg)
+        force_eager_experts_impl_on_the_fly(model)
+        assert cfg._experts_implementation == "eager"
+
+
+# ---------------------------------------------------------------------------
+# End-to-end PTQ calibration test — guards the full fused-experts path:
+#   register_fused_experts_on_the_fly → _QuantFusedExperts.{_setup, forward} →
+#   plural ModuleList name normalization in conversion._match_quantizer →
+#   TensorQuantizer amax collection via the F.linear hook.
+# If any link breaks, quantizer `amax` stays None and this test fails.
+# ---------------------------------------------------------------------------
+class TestFusedExpertsCalibration:
+    @staticmethod
+    def _cleanup_registry(mod_type):
+        if QuantModuleRegistry.get(mod_type) is not None:
+            QuantModuleRegistry.unregister(mod_type)
+
+    def test_calibration_populates_all_expert_quantizers(self):
+        """After PTQ, every input/weight quantizer on the fused-experts module has amax set."""
+        import modelopt.torch.quantization as mtq
+
+        model = _TinyMoEModel()
+        expert_type = type(model.moe.experts)
+        self._cleanup_registry(expert_type)
+
+        quant_cfg = {
+            "quant_cfg": [
+                {"quantizer_name": "*", "enable": False},
+                {
+                    "quantizer_name": "*gate_up_proj_input_quantizer",
+                    "cfg": {"num_bits": 8, "axis": None},
+                },
+                {
+                    "quantizer_name": "*down_proj_input_quantizer",
+                    "cfg": {"num_bits": 8, "axis": None},
+                },
+                {
+                    "quantizer_name": "*gate_up_proj_weight_quantizer",
+                    "cfg": {"num_bits": 8, "axis": 0},
+                },
+                {
+                    "quantizer_name": "*down_proj_weight_quantizer",
+                    "cfg": {"num_bits": 8, "axis": 0},
+                },
+            ],
+            "algorithm": "max",
+        }
+
+        def forward_loop(m):
+            torch.manual_seed(0)
+            for _ in range(2):
+                x = torch.randn(1, 4, HIDDEN_DIM)
+                m(x)
+
+        mtq.quantize(model, quant_cfg, forward_loop=forward_loop)
+
+        experts = model.moe.experts
+        assert experts.gate_up_proj_input_quantizer.amax is not None, (
+            "Shared gate_up_proj input quantizer was not calibrated — "
+            "F.linear hook likely bypassed by non-eager experts_implementation."
+        )
+        assert experts.down_proj_input_quantizer.amax is not None, (
+            "Shared down_proj input quantizer was not calibrated."
+        )
+        for idx in range(NUM_EXPERTS):
+            assert experts.gate_up_proj_weight_quantizers[idx].amax is not None, (
+                f"gate_up_proj_weight_quantizers[{idx}].amax is None — "
+                "plural ModuleList name normalization in _match_quantizer likely broken."
+            )
+            assert experts.down_proj_weight_quantizers[idx].amax is not None, (
+                f"down_proj_weight_quantizers[{idx}].amax is None."
+            )
+
+        self._cleanup_registry(expert_type)

From d3e600c5e85eef94b241f8203dcc5537007b6b84 Mon Sep 17 00:00:00 2001
From: weimingc <17592131+meenchen@users.noreply.github.com>
Date: Fri, 24 Apr 2026 12:59:49 -0700
Subject: [PATCH 3/6] address comments/fix mixed precision

Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com>
---
 modelopt/torch/export/quant_utils.py          |   9 +-
 modelopt/torch/export/unified_export_hf.py    |  19 +-
 modelopt/torch/quantization/conversion.py     |  33 ++--
 modelopt/torch/quantization/utils/__init__.py |   1 +
 .../torch/quantization/utils/core_utils.py    |  56 ++++--
 .../plugins/test_fused_experts.py             | 173 ++++++++++++++++++
 6 files changed, 249 insertions(+), 42 deletions(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 4ceb51cd2c..76f304a478 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -42,6 +42,7 @@
     QuantizerAttrNames,
     quantizer_attr_names,
     reduce_block_amax,
+    representative_weight_quantizer,
     weight_attr_names,
 )
 from modelopt.torch.utils import clear_cuda_cache
@@ -546,7 +547,7 @@ def _compute_kv_cache_dtype(
 
 def get_weight_block_size(module: nn.Module, weight_name: str = "weight") -> int:
     """Returns the weight block size."""
-    weight_quantizer = getattr(module, quantizer_attr_names(weight_name).weight_quantizer, None)
+    weight_quantizer = representative_weight_quantizer(module, weight_name)
 
     if weight_quantizer is None:
         return 0
@@ -572,7 +573,11 @@ def get_quantization_format(module) -> str | None:
     """
 
     def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames):
-        weight_quantizer = getattr(layer, quantizer_attr_names.weight_quantizer, None)
+        # Singular form first, plural ModuleList fallback (fused-experts).
+        # Strip the "_weight_quantizer" suffix to recover the weight attr name.
+        weight_attr = quantizer_attr_names.weight_quantizer
+        weight_name = weight_attr[: -len("_weight_quantizer")].rstrip("_") or "weight"
+        weight_quantizer = representative_weight_quantizer(layer, weight_name)
         input_quantizer = getattr(layer, quantizer_attr_names.input_quantizer, None)
 
         if weight_quantizer is None or not weight_quantizer.is_enabled:
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 1be8ee41cb..1db5069c39 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -88,6 +88,7 @@
     QUANTIZATION_W4A8_NVFP4_FP8,
 )
 from .model_utils import get_language_model_from_vl, is_multimodal_model
+from .moe_utils import _export_fused_experts
 from .plugins import SpeculativeDecodingExporter, has_spec_opt
 from .quant_utils import (
     fuse_prequant_layernorm,
@@ -646,15 +647,14 @@ def _process_quantized_modules(
             "QuantFP8Linear" in type(sub_module).__name__ and sub_module.weight.element_size() <= 1
         ):
             sub_module.unpack_weight()
-        # _QuantFusedExperts uses plural `gate_up_proj_weight_quantizers` (ModuleList),
-        # which get_quantization_format's singular-weight_quantizer check misses. Handle
-        # it explicitly before the format gate so fused-experts get split + quantized.
-        if hasattr(sub_module, "gate_up_proj_weight_quantizers"):
-            from modelopt.torch.export.moe_utils import _export_fused_experts
-
+        elif hasattr(sub_module, "gate_up_proj_weight_quantizers"):
+            # _QuantFusedExperts uses plural `gate_up_proj_weight_quantizers` (ModuleList),
+            # which get_quantization_format's singular-weight_quantizer check misses. Handle
+            # it explicitly before the format gate so fused-experts get split + quantized.
             with fsdp2_aware_weight_update(model, sub_module, reshard=False):
                 _export_fused_experts(sub_module, dtype)
             continue
+
         if get_quantization_format(sub_module) != QUANTIZATION_NONE:
             # Skip QuantMoELinear - it's handled separately in _reconstruct_fused_moe_linear
             if type(sub_module).__name__ == "QuantMoELinear":
@@ -686,13 +686,6 @@ def _process_quantized_modules(
                 with fsdp2_aware_weight_update(model, sub_module, reshard=False):
                     for weight_name in ["gate_up_proj", "down_proj"]:
                         _export_quantized_weight(sub_module, dtype, weight_name)
-            elif hasattr(sub_module, "gate_up_proj_weight_quantizers"):
-                # Generic fused MoE experts (_QuantFusedExperts) with per-expert
-                # quantizer ModuleLists. Split into per-expert modules and export.
-                from modelopt.torch.export.moe_utils import _export_fused_experts
-
-                with fsdp2_aware_weight_update(model, sub_module, reshard=False):
-                    _export_fused_experts(sub_module, dtype)
 
 
 def _export_transformers_checkpoint(
diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py
index 55382352c6..3f97f8380b 100644
--- a/modelopt/torch/quantization/conversion.py
+++ b/modelopt/torch/quantization/conversion.py
@@ -287,24 +287,31 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType
             set_quantizer_attributes_full(quant_model, quantizer_name, attributes, parent_class)
 
 
-_FUSED_EXPERTS_QUANTIZER_LIST_RE = re.compile(r"(weight_quantizers|input_quantizers)\.\d+(?=$|\.)")
+_FUSED_EXPERTS_QUANTIZER_LIST_RE = re.compile(
+    r"(weight_quantizers?|input_quantizers?)\.\d+(?=$|\.)"
+)
 
 
 def _normalize_fused_experts_quantizer_name(name: str) -> str:
-    """Strip the per-expert index from ``_QuantFusedExperts`` ModuleList quantizer names.
-
-    ``_QuantFusedExperts`` registers per-expert weight/input quantizers as
-    ``nn.ModuleList``s named e.g. ``gate_up_proj_weight_quantizers`` — its children
-    get dotted names like ``...gate_up_proj_weight_quantizers.0``. These don't match
-    the singular-suffix wildcards (``*weight_quantizer``) used in the stock configs,
-    so the experts stay at their defaults. Return a normalized name where
-    ``weight_quantizers.N`` / ``input_quantizers.N`` collapse to their singular form
+    """Strip the per-expert index from per-expert quantizer ModuleList names.
+
+    Fused-experts modules register per-expert weight/input quantizers in a
+    ``nn.ModuleList``; its children surface as dotted names like
+    ``...gate_up_proj_weight_quantizers.0`` (plural) or — if a variant uses
+    singular naming — ``...gate_up_proj_weight_quantizer.0``. Neither matches
+    the singular-suffix wildcards (``*weight_quantizer``) used in the stock
+    configs, so the experts stay at their defaults.
+
+    Return a normalized name where either ``weight_quantizer[s]?.N`` or
+    ``input_quantizer[s]?.N`` collapses to the singular form without the index
     so the standard wildcards match.
     """
-    return _FUSED_EXPERTS_QUANTIZER_LIST_RE.sub(
-        lambda m: m.group(1)[:-1],
-        name,  # "weight_quantizers" -> "weight_quantizer"
-    )
+
+    def _repl(m: re.Match) -> str:
+        base = m.group(1)
+        return base.removesuffix("s")
+
+    return _FUSED_EXPERTS_QUANTIZER_LIST_RE.sub(_repl, name)
 
 
 def _match_quantizer(
diff --git a/modelopt/torch/quantization/utils/__init__.py b/modelopt/torch/quantization/utils/__init__.py
index dfc23c42ee..dc6daa0084 100644
--- a/modelopt/torch/quantization/utils/__init__.py
+++ b/modelopt/torch/quantization/utils/__init__.py
@@ -30,6 +30,7 @@
     "reduce_amax",
     "reduce_sum",
     "replace_function",
+    "representative_weight_quantizer",
     "update_quant_cfg_with_kv_cache_quant",
     "weight_attr_names",
 ]
diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py
index 29661e18f5..a9b59763ba 100644
--- a/modelopt/torch/quantization/utils/core_utils.py
+++ b/modelopt/torch/quantization/utils/core_utils.py
@@ -202,27 +202,55 @@ def reduce_sum(input, axis=None, keepdims=True):
     return output
 
 
-def weight_attr_names(module: nn.Module) -> "Generator[str, None, None]":
-    """Get the weight param attribute names in a converted module, non-recursive.
+def representative_weight_quantizer(module: nn.Module, weight_name: str = "weight"):
+    """Return the representative weight quantizer for ``weight_name`` on ``module``.
+
+    Handles two layouts:
+    - singular ``<name>_weight_quantizer`` — standard ``nn.Linear`` / ``_QuantLinear``.
+    - plural ``<name>_weight_quantizers`` (``nn.ModuleList``) — fused-experts modules
+      (``_QuantFusedExperts``) hold one ``TensorQuantizer`` per expert. Per-expert
+      formats are identical, so the first element is representative.
 
-    We consider the following two cases for each weight param attribute:
-    - The standard weight attribute (e.g. nn.Linear).
-    - The custom `weight_attr_name`. (e.g. Llama4TextExperts has weight attributes `gate_up_proj` and `down_proj`)
+    Returns ``None`` if no matching quantizer is found.
     """
     from ..nn import SequentialQuantizer, TensorQuantizer
 
-    # the standard weight and quantizer case
-    weight = getattr(module, "weight", None)
-    weight_quantizer = getattr(module, "weight_quantizer", None)
-    if weight is not None and isinstance(weight_quantizer, (TensorQuantizer, SequentialQuantizer)):
-        yield "weight"
+    singular = quantizer_attr_names(weight_name).weight_quantizer
+    q = getattr(module, singular, None)
+    if isinstance(q, (TensorQuantizer, SequentialQuantizer)):
+        return q
 
-    # other weight and quantizer case
+    plural = getattr(module, singular + "s", None)
+    if isinstance(plural, nn.ModuleList) and len(plural) > 0:
+        first = plural[0]
+        if isinstance(first, (TensorQuantizer, SequentialQuantizer)):
+            return first
+    return None
+
+
+def weight_attr_names(module: nn.Module) -> "Generator[str, None, None]":
+    """Get the weight param attribute names in a converted module, non-recursive.
+
+    Covers three layouts:
+    - standard ``nn.Linear``: ``weight`` + ``weight_quantizer``.
+    - custom per-weight quantizer (e.g. ``Llama4TextExperts`` with ``gate_up_proj`` +
+      ``gate_up_proj_weight_quantizer``).
+    - fused-experts ``nn.ModuleList`` quantizers (``_QuantFusedExperts`` with
+      ``gate_up_proj`` + ``gate_up_proj_weight_quantizers`` plural list).
+    """
+    # standard: "weight" + "weight_quantizer" (singular) or "weight_quantizers" (plural)
+    if getattr(module, "weight", None) is not None:
+        if representative_weight_quantizer(module, "weight") is not None:
+            yield "weight"
+
+    # per-parameter custom attr names
     for name, _ in module.named_parameters(recurse=False):
+        if name == "weight":
+            continue
         weight = getattr(module, name, None)
-        weight_quantizer = getattr(module, f"{name}_weight_quantizer", None)
-        if isinstance(weight, nn.Parameter) and isinstance(
-            weight_quantizer, (TensorQuantizer, SequentialQuantizer)
+        if (
+            isinstance(weight, nn.Parameter)
+            and representative_weight_quantizer(module, name) is not None
         ):
             yield name
 
diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py
index 0e90be5080..2943582774 100644
--- a/tests/unit/torch/quantization/plugins/test_fused_experts.py
+++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py
@@ -22,6 +22,7 @@
 
 pytest.importorskip("transformers")
 
+from modelopt.torch.quantization.conversion import _normalize_fused_experts_quantizer_name
 from modelopt.torch.quantization.nn import QuantModuleRegistry
 from modelopt.torch.quantization.plugins.huggingface import (
     _is_fused_experts_module,
@@ -439,3 +440,175 @@ def forward_loop(m):
             )
 
         self._cleanup_registry(expert_type)
+
+
+# ---------------------------------------------------------------------------
+# Tests for export enumeration — guards the bug where fused-experts were
+# silently skipped by get_quant_config because their weight quantizers live
+# on a plural nn.ModuleList instead of the singular *_weight_quantizer attr.
+# Missed enumeration → experts don't appear in quantized_layers →
+# quantization_formats has only 1 entry from the non-expert modules →
+# quant_algo lands on that format instead of "MIXED_PRECISION".
+# ---------------------------------------------------------------------------
+class _MixedPrecisionModel(nn.Module):
+    """A model with both a fused-experts block AND a standard Linear, so a
+    mixed-precision recipe should produce two distinct format groups."""
+
+    def __init__(self):
+        super().__init__()
+        self.moe = _SyntheticSparseMoeBlock()
+        self.dense = nn.Linear(HIDDEN_DIM, HIDDEN_DIM)
+
+    def forward(self, x):
+        return self.dense(self.moe(x))
+
+
+class TestMixedPrecisionExport:
+    @staticmethod
+    def _cleanup_registry(mod_type):
+        if QuantModuleRegistry.get(mod_type) is not None:
+            QuantModuleRegistry.unregister(mod_type)
+
+    def test_weight_attr_names_yields_fused_expert_params(self):
+        """weight_attr_names must yield gate_up_proj / down_proj on fused experts
+        even though their quantizers are a plural ModuleList, not singular."""
+        from modelopt.torch.quantization.utils.core_utils import weight_attr_names
+
+        model = _TinyMoEModel()
+        expert_type = type(model.moe.experts)
+        self._cleanup_registry(expert_type)
+
+        register_fused_experts_on_the_fly(model)
+        converted = QuantModuleRegistry.convert(model.moe.experts)
+
+        yielded = list(weight_attr_names(converted))
+        assert set(yielded) == {"gate_up_proj", "down_proj"}, (
+            f"Expected both fused weight attrs, got {yielded}. "
+            "Likely regression in representative_weight_quantizer plural fallback."
+        )
+
+        self._cleanup_registry(expert_type)
+
+    def test_mixed_precision_config_export(self):
+        """Mixed-precision recipe (experts FP8 + dense Linear FP8 per-channel) should
+        show both modules in quantized_layers. Using two distinct formats would
+        trigger MIXED_PRECISION; using same-format still exercises enumeration."""
+        import modelopt.torch.quantization as mtq
+        from modelopt.torch.export.quant_utils import get_quant_config
+
+        model = _MixedPrecisionModel()
+        expert_type = type(model.moe.experts)
+        self._cleanup_registry(expert_type)
+
+        # FP8 per-tensor for experts; FP8 per-channel for dense — two distinct
+        # format strings in quantization_formats, so quant_algo must become
+        # MIXED_PRECISION.
+        quant_cfg = {
+            "quant_cfg": [
+                {"quantizer_name": "*", "enable": False},
+                {
+                    "quantizer_name": "*gate_up_proj_input_quantizer",
+                    "cfg": {"num_bits": (4, 3), "axis": None},
+                },
+                {
+                    "quantizer_name": "*down_proj_input_quantizer",
+                    "cfg": {"num_bits": (4, 3), "axis": None},
+                },
+                {
+                    "quantizer_name": "*gate_up_proj_weight_quantizer",
+                    "cfg": {"num_bits": (4, 3), "axis": None},
+                },
+                {
+                    "quantizer_name": "*down_proj_weight_quantizer",
+                    "cfg": {"num_bits": (4, 3), "axis": None},
+                },
+                {
+                    "quantizer_name": "*dense.input_quantizer",
+                    "cfg": {"num_bits": (4, 3), "axis": None},
+                },
+                {
+                    "quantizer_name": "*dense.weight_quantizer",
+                    "cfg": {"num_bits": (4, 3), "axis": 0},  # per-channel → FP8_PC_PT
+                },
+            ],
+            "algorithm": "max",
+        }
+
+        def forward_loop(m):
+            torch.manual_seed(0)
+            for _ in range(2):
+                x = torch.randn(1, 4, HIDDEN_DIM)
+                m(x)
+
+        mtq.quantize(model, quant_cfg, forward_loop=forward_loop)
+
+        cfg = get_quant_config(model)
+        q = cfg["quantization"]
+
+        # The fused-experts module MUST appear in quantized_layers. This is the
+        # central guard: regressions of weight_attr_names plural fallback would
+        # make experts disappear here.
+        layer_names = set(q.get("quantized_layers", {}).keys())
+        assert any("moe.experts" in n for n in layer_names), (
+            f"Fused-experts module missing from quantized_layers: {layer_names}. "
+            "weight_attr_names likely not yielding plural-ModuleList weight attrs."
+        )
+        assert any(n.endswith("dense") for n in layer_names), (
+            f"Dense Linear missing from quantized_layers: {layer_names}."
+        )
+
+        # Two distinct formats → MIXED_PRECISION at top level.
+        assert q["quant_algo"] == "MIXED_PRECISION", (
+            f"Expected MIXED_PRECISION (fused-experts FP8 per-tensor + dense "
+            f"FP8 per-channel), got quant_algo={q['quant_algo']}. "
+            f"quantized_layers={q.get('quantized_layers')}"
+        )
+
+        self._cleanup_registry(expert_type)
+
+
+# ---------------------------------------------------------------------------
+# Tests for the fused-experts quantizer-name normalizer used by
+# conversion._match_quantizer. Covers both plural (actual _QuantFusedExperts
+# layout) and singular (defensive: future variants may name the ModuleList
+# without the trailing `s`) forms.
+# ---------------------------------------------------------------------------
+class TestNormalizeFusedExpertsQuantizerName:
+    def test_plural_weight_quantizers_stripped(self):
+        assert (
+            _normalize_fused_experts_quantizer_name("moe.experts.gate_up_proj_weight_quantizers.7")
+            == "moe.experts.gate_up_proj_weight_quantizer"
+        )
+
+    def test_plural_input_quantizers_stripped(self):
+        assert (
+            _normalize_fused_experts_quantizer_name("moe.experts.down_proj_input_quantizers.3")
+            == "moe.experts.down_proj_input_quantizer"
+        )
+
+    def test_singular_weight_quantizer_with_index_stripped(self):
+        """Defensive: handle variants that name the ModuleList singular."""
+        assert (
+            _normalize_fused_experts_quantizer_name("moe.experts.gate_up_proj_weight_quantizer.2")
+            == "moe.experts.gate_up_proj_weight_quantizer"
+        )
+
+    def test_singular_input_quantizer_with_index_stripped(self):
+        assert (
+            _normalize_fused_experts_quantizer_name("moe.experts.down_proj_input_quantizer.0")
+            == "moe.experts.down_proj_input_quantizer"
+        )
+
+    def test_non_indexed_name_unchanged(self):
+        """Plain singular names (no index) must be passed through untouched."""
+        assert (
+            _normalize_fused_experts_quantizer_name("moe.experts.gate_up_proj_weight_quantizer")
+            == "moe.experts.gate_up_proj_weight_quantizer"
+        )
+
+    def test_unrelated_dotted_number_unchanged(self):
+        """Dotted numbers that aren't inside a quantizer-list context are left alone."""
+        assert (
+            _normalize_fused_experts_quantizer_name("moe.layers.3.gate.weight")
+            == "moe.layers.3.gate.weight"
+        )

From 0139897a57f21d16170d033565c13390b589d723 Mon Sep 17 00:00:00 2001
From: weimingc <17592131+meenchen@users.noreply.github.com>
Date: Fri, 24 Apr 2026 14:59:37 -0700
Subject: [PATCH 4/6] address review: split unpack_weight preprocessing from
 fused-experts/standard export branches

Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com>
---
 modelopt/torch/export/unified_export_hf.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 1db5069c39..a76783ac17 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -643,19 +643,20 @@ def _process_quantized_modules(
         if is_modelopt_qlora and (hasattr(sub_module, "base_layer")):
             continue
 
+        # Preprocessing: restore unpacked weight so the export path can read
+        # the live quantizer state. Falls through to the export branches below.
         if hasattr(sub_module, "weight_packed") or (
             "QuantFP8Linear" in type(sub_module).__name__ and sub_module.weight.element_size() <= 1
         ):
             sub_module.unpack_weight()
-        elif hasattr(sub_module, "gate_up_proj_weight_quantizers"):
+
+        if hasattr(sub_module, "gate_up_proj_weight_quantizers"):
             # _QuantFusedExperts uses plural `gate_up_proj_weight_quantizers` (ModuleList),
             # which get_quantization_format's singular-weight_quantizer check misses. Handle
             # it explicitly before the format gate so fused-experts get split + quantized.
             with fsdp2_aware_weight_update(model, sub_module, reshard=False):
                 _export_fused_experts(sub_module, dtype)
-            continue
-
-        if get_quantization_format(sub_module) != QUANTIZATION_NONE:
+        elif get_quantization_format(sub_module) != QUANTIZATION_NONE:
             # Skip QuantMoELinear - it's handled separately in _reconstruct_fused_moe_linear
             if type(sub_module).__name__ == "QuantMoELinear":
                 continue

From 713e76fbdcf026799d262604fded766c4b150d41 Mon Sep 17 00:00:00 2001
From: weimingc <17592131+meenchen@users.noreply.github.com>
Date: Mon, 27 Apr 2026 14:13:14 -0700
Subject: [PATCH 5/6] fix: fold_weight for _QuantFusedExperts and sphinx
 warnings

The base QuantModule.fold_weight only walks singular *_weight_quantizer
attributes, so per-expert quantizers in _QuantFusedExperts'
gate_up_proj_weight_quantizers / down_proj_weight_quantizers ModuleLists
are never folded, leaving _amax behind. The vLLM fake-quant export test
test_hf_vllm_export_tiny_qwen3_moe[FP8] surfaces this. Override
fold_weight on _QuantFusedExperts to walk the per-expert ModuleList,
apply each quantizer to its 3-D slice, disable, and drop _amax /
_pre_quant_scale.

Also add the blank line between docstring intro and bullet list in
representative_weight_quantizer / weight_attr_names so sphinx stops
emitting "Block quote ends without a blank line" warnings during
build-docs.

Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com>
---
 .../torch/quantization/plugins/huggingface.py | 27 +++++++++++++++++++
 .../torch/quantization/utils/core_utils.py    |  2 ++
 2 files changed, 29 insertions(+)

diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index 7ea372f1fc..77f26b2060 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -900,6 +900,33 @@ def forward(self, *args, **kwargs):
         self._down_proj_linear = False
         return super().forward(*args, **kwargs)
 
+    def fold_weight(self, keep_attrs: bool = False):
+        """Fold per-expert weight quantizers into the fused 3-D weights.
+
+        The base ``fold_weight`` only handles singular ``*_weight_quantizer``
+        attributes. Fused experts use ``nn.ModuleList`` of per-expert quantizers
+        (``gate_up_proj_weight_quantizers``, ``down_proj_weight_quantizers``),
+        which would otherwise be skipped, leaving ``_amax`` on every quantizer.
+        """
+        for weight_name, quantizers_name in (
+            ("gate_up_proj", "gate_up_proj_weight_quantizers"),
+            ("down_proj", "down_proj_weight_quantizers"),
+        ):
+            weight = getattr(self, weight_name, None)
+            quantizers = getattr(self, quantizers_name, None)
+            if weight is None or quantizers is None:
+                continue
+            for idx, q in enumerate(quantizers):
+                if not (isinstance(q, TensorQuantizer) and q.fake_quant):
+                    continue
+                slice_ = weight.data[idx]
+                slice_.copy_(q(slice_.float()).to(weight.dtype))
+                q.disable()
+                if not keep_attrs:
+                    for attr_name in ("_pre_quant_scale", "_amax"):
+                        if hasattr(q, attr_name):
+                            delattr(q, attr_name)
+
 
 class _QuantDbrxFFN(_QuantSparseSequentialMoe):
     @property
diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py
index a9b59763ba..1a177e04dc 100644
--- a/modelopt/torch/quantization/utils/core_utils.py
+++ b/modelopt/torch/quantization/utils/core_utils.py
@@ -206,6 +206,7 @@ def representative_weight_quantizer(module: nn.Module, weight_name: str = "weigh
     """Return the representative weight quantizer for ``weight_name`` on ``module``.
 
     Handles two layouts:
+
     - singular ``<name>_weight_quantizer`` — standard ``nn.Linear`` / ``_QuantLinear``.
     - plural ``<name>_weight_quantizers`` (``nn.ModuleList``) — fused-experts modules
       (``_QuantFusedExperts``) hold one ``TensorQuantizer`` per expert. Per-expert
@@ -232,6 +233,7 @@ def weight_attr_names(module: nn.Module) -> "Generator[str, None, None]":
     """Get the weight param attribute names in a converted module, non-recursive.
 
     Covers three layouts:
+
     - standard ``nn.Linear``: ``weight`` + ``weight_quantizer``.
     - custom per-weight quantizer (e.g. ``Llama4TextExperts`` with ``gate_up_proj`` +
       ``gate_up_proj_weight_quantizer``).

From 3eb0f212bd1455a7a8b7d5dea0629d0df9bf03b8 Mon Sep 17 00:00:00 2001
From: weimingc <17592131+meenchen@users.noreply.github.com>
Date: Mon, 27 Apr 2026 15:10:00 -0700
Subject: [PATCH 6/6] test: cover _QuantFusedExperts.fold_weight per-expert
 path

Adds a parametrized unit test that quantizes a synthetic fused-experts
module via mtq.quantize, snapshots the 3-D weights, calls fold_weight
with both keep_attrs=False and keep_attrs=True, and asserts that
weights moved (fake-quant applied), every per-expert quantizer in both
gate_up_proj_weight_quantizers and down_proj_weight_quantizers
ModuleLists is disabled, and _amax is dropped (or preserved with
keep_attrs=True).

Covers the lines added by the prior commit (fold_weight override on
_QuantFusedExperts) so codecov patch coverage no longer flags them.

Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com>
---
 .../plugins/test_fused_experts.py             | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py
index 2943582774..b5cbac414d 100644
--- a/tests/unit/torch/quantization/plugins/test_fused_experts.py
+++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py
@@ -251,6 +251,63 @@ def test_expert_index_recovery(self):
             assert recovered_idx == idx, f"Expected {idx}, got {recovered_idx}"
         self._cleanup_registry(expert_type)
 
+    @pytest.mark.parametrize("keep_attrs", [False, True])
+    def test_fold_weight_per_expert(self, keep_attrs):
+        """fold_weight on _QuantFusedExperts must walk the per-expert ModuleList,
+        apply fake-quant to each 3-D slice, disable the quantizer, and (unless
+        keep_attrs) drop _amax. The base QuantModule.fold_weight only handles
+        singular *_weight_quantizer attrs and would silently skip these."""
+        import modelopt.torch.quantization as mtq
+
+        model = _TinyMoEModel()
+        expert_type = type(model.moe.experts)
+        self._cleanup_registry(expert_type)
+
+        quant_cfg = {
+            "quant_cfg": [
+                {"quantizer_name": "*", "enable": False},
+                {
+                    "quantizer_name": "*gate_up_proj_weight_quantizer",
+                    "cfg": {"num_bits": 8, "axis": 0},
+                },
+                {
+                    "quantizer_name": "*down_proj_weight_quantizer",
+                    "cfg": {"num_bits": 8, "axis": 0},
+                },
+            ],
+            "algorithm": "max",
+        }
+
+        def forward_loop(m):
+            torch.manual_seed(0)
+            x = torch.randn(1, 4, HIDDEN_DIM)
+            m(x)
+
+        mtq.quantize(model, quant_cfg, forward_loop=forward_loop)
+        experts = model.moe.experts
+
+        gate_up_before = experts.gate_up_proj.detach().clone()
+        down_before = experts.down_proj.detach().clone()
+        for idx in range(NUM_EXPERTS):
+            assert experts.gate_up_proj_weight_quantizers[idx].is_enabled
+            assert experts.gate_up_proj_weight_quantizers[idx]._amax is not None
+
+        experts.fold_weight(keep_attrs=keep_attrs)
+
+        # Weights must have moved to the fake-quantized values.
+        assert not torch.equal(gate_up_before, experts.gate_up_proj)
+        assert not torch.equal(down_before, experts.down_proj)
+
+        for quantizers_name in ("gate_up_proj_weight_quantizers", "down_proj_weight_quantizers"):
+            for q in getattr(experts, quantizers_name):
+                assert not q.is_enabled, f"{quantizers_name} entry not disabled after fold"
+                if keep_attrs:
+                    assert q._amax is not None, "keep_attrs=True must preserve _amax"
+                else:
+                    assert not hasattr(q, "_amax"), "_amax must be deleted after fold"
+
+        self._cleanup_registry(expert_type)
+
 
 # ---------------------------------------------------------------------------
 # Tests for export