From 1b1fcedcf173b2e0bfc1513213ed2fe8777ffc4a Mon Sep 17 00:00:00 2001
From: Lenny Potato
Date: Wed, 22 Apr 2026 14:36:21 -0400
Subject: [PATCH 01/17] Fix NVFP4 quantization for Qwen3.x MoE models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four bugs prevent NVFP4 export from producing quantized weights for
Qwen3.5/3.6 MoE models (and potentially other fused MoE architectures).
All produce silent failures — no errors, just bfloat16 output identical
to input.

Bug 1: is_multimodal_model() crashes when config.architectures is None
- model_utils.py: add 'or []' fallback for NoneType iteration

Bug 3: get_quantization_format() doesn't recognize _QuantFusedExperts
- quant_utils.py: add check for plural ModuleList quantizers
  (gate_up_proj_weight_quantizers, down_proj_weight_quantizers) before
  the singular weight_quantizer loop

Bug 4: NVFP4 config wildcards don't match plural quantizer names
- config.py: _nvfp4_selective_quant_cfg() only generates patterns for
  singular 'weight_quantizer', but _QuantFusedExperts creates plural
  ModuleList quantizers. Add wildcard entries for both
  gate_up_proj_weight_quantizers* and down_proj_weight_quantizers*

Bug 5: _process_quantized_modules elif order sends fused MoE to wrong path
- unified_export_hf.py: swap elif branches so hasattr check for
  gate_up_proj_weight_quantizers comes before type-name checks. Without
  this, QuantQwen3_5MoeExperts hits the singular-attribute branch and
  crashes with AttributeError

Tested on: Qwen3.6-35B-A3B (MoE), NVIDIA DGX Spark (GB10),
modelopt 0.45.0 dev, transformers 5.5.4
Output: 20.5 GB NVFP4 (down from 66 GB bfloat16)
---
 modelopt/torch/export/model_utils.py       |  2 +-
 modelopt/torch/export/quant_utils.py       | 19 +++++++++++++++++++
 modelopt/torch/export/unified_export_hf.py | 17 ++++++++++-------
 modelopt/torch/quantization/config.py      |  6 ++++++
 4 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py
index 3bd72d9de91..aa81f8213f7 100755
--- a/modelopt/torch/export/model_utils.py
+++ b/modelopt/torch/export/model_utils.py
@@ -107,7 +107,7 @@ def is_multimodal_model(model):
     config = model.config
 
     # Check for Nemotron-Parse encoder-decoder architecture
-    architectures = getattr(config, "architectures", [])
+    architectures = getattr(config, "architectures", []) or []
     is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
 
     return (
diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 4ceb51cd2c0..51c2e8ce93d 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -665,6 +665,25 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
             f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}"
         )
 
+    # Handle _QuantFusedExperts modules (e.g. Qwen3.x MoE) which use plural
+    # ModuleList quantizers (gate_up_proj_weight_quantizers, down_proj_weight_quantizers)
+    # instead of singular weight_quantizer attributes.
+    for quantizer_list_name in ["gate_up_proj_weight_quantizers", "down_proj_weight_quantizers"]:
+        quantizer_list = getattr(module, quantizer_list_name, None)
+        if quantizer_list is not None and len(quantizer_list) > 0:
+            # Check the first quantizer in the list — all share the same config
+            q = quantizer_list[0]
+            if hasattr(q, "is_enabled") and q.is_enabled:
+                num_bits = getattr(q, "num_bits", None)
+                block_sizes = getattr(q, "block_sizes", None)
+                scale_bits = (
+                    block_sizes.get("scale_bits", (8, 0))
+                    if isinstance(block_sizes, dict) and "scale_bits" in block_sizes
+                    else (8, 0)
+                )
+                if num_bits == (2, 1) and scale_bits == (4, 3):
+                    return QUANTIZATION_NVFP4
+
     for weight_name in weight_attr_names(module):
         quantization = _get_quantization_from_layer(module, quantizer_attr_names(weight_name))
         if quantization != QUANTIZATION_NONE:
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index af936a3002a..c0145d16eaa 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -658,6 +658,16 @@ def _process_quantized_modules(
                 raise AssertionError(
                     f"Failed to export module '{name}' (type={type(sub_module).__name__}): {e}"
                 ) from e
+        elif hasattr(sub_module, "gate_up_proj_weight_quantizers"):
+            # Generic fused MoE experts (_QuantFusedExperts) with per-expert
+            # quantizer ModuleLists. Split into per-expert modules and export.
+            # NOTE: This check must come before type-name checks (e.g. Llama4,
+            # GptOss) because _QuantFusedExperts wrapping renames quantizers
+            # to plural ModuleLists (e.g. gate_up_proj_weight_quantizers).
+            from modelopt.torch.export.moe_utils import _export_fused_experts
+
+            with fsdp2_aware_weight_update(model, sub_module, reshard=False):
+                _export_fused_experts(sub_module, dtype)
         elif (
             "Llama4TextExperts" in type(sub_module).__name__
             or "GptOssExperts" in type(sub_module).__name__
@@ -677,13 +687,6 @@
             with fsdp2_aware_weight_update(model, sub_module, reshard=False):
                 for weight_name in ["gate_up_proj", "down_proj"]:
                     _export_quantized_weight(sub_module, dtype, weight_name)
-        elif hasattr(sub_module, "gate_up_proj_weight_quantizers"):
-            # Generic fused MoE experts (_QuantFusedExperts) with per-expert
-            # quantizer ModuleLists. Split into per-expert modules and export.
-            from modelopt.torch.export.moe_utils import _export_fused_experts
-
-            with fsdp2_aware_weight_update(model, sub_module, reshard=False):
-                _export_fused_experts(sub_module, dtype)
 
 
 def _export_transformers_checkpoint(
diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py
index 186ff1c7edd..850c1eeb423 100644
--- a/modelopt/torch/quantization/config.py
+++ b/modelopt/torch/quantization/config.py
@@ -578,6 +578,12 @@ def _nvfp4_selective_quant_cfg(
         quant_cfg.append(
             {"quantizer_name": f"{pattern}weight_quantizer", "cfg": copy.deepcopy(quantizer)}
         )
+        # Also match plural ModuleList quantizers used by _QuantFusedExperts
+        # (e.g. gate_up_proj_weight_quantizers.N) for fused MoE architectures.
+        for suffix in ["gate_up_proj_weight_quantizers", "down_proj_weight_quantizers"]:
+            quant_cfg.append(
+                {"quantizer_name": f"{pattern}{suffix}*", "cfg": copy.deepcopy(quantizer)}
+            )
         if not weight_only:
             quant_cfg.append(
                 {"quantizer_name": f"{pattern}input_quantizer", "cfg": copy.deepcopy(quantizer)}
             )

From 5d5c4925bc335730a332e2a1925ddf8bbb9ebb83 Mon Sep 17 00:00:00 2001
From: Lenny Potato
Date: Wed, 22 Apr 2026 14:46:54 -0400
Subject: [PATCH 02/17] fix: iterate quantizer list to find first enabled
 quantizer

CodeRabbit review: expert 0 may be disabled when uncalibrated, so checking
only quantizer_list[0] can miss the actual NVFP4 config. Now iterates to
find the first enabled quantizer in the list.
---
 modelopt/torch/export/quant_utils.py | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 51c2e8ce93d..c0f2174b24d 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -671,18 +671,20 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
     for quantizer_list_name in ["gate_up_proj_weight_quantizers", "down_proj_weight_quantizers"]:
         quantizer_list = getattr(module, quantizer_list_name, None)
         if quantizer_list is not None and len(quantizer_list) > 0:
-            # Check the first quantizer in the list — all share the same config
-            q = quantizer_list[0]
-            if hasattr(q, "is_enabled") and q.is_enabled:
-                num_bits = getattr(q, "num_bits", None)
-                block_sizes = getattr(q, "block_sizes", None)
-                scale_bits = (
-                    block_sizes.get("scale_bits", (8, 0))
-                    if isinstance(block_sizes, dict) and "scale_bits" in block_sizes
-                    else (8, 0)
-                )
-                if num_bits == (2, 1) and scale_bits == (4, 3):
-                    return QUANTIZATION_NVFP4
+            # Find the first enabled quantizer — expert 0 may be disabled if
+            # uncalibrated, so we iterate rather than checking index 0 only.
+            for q in quantizer_list:
+                if hasattr(q, "is_enabled") and q.is_enabled:
+                    num_bits = getattr(q, "num_bits", None)
+                    block_sizes = getattr(q, "block_sizes", None)
+                    scale_bits = (
+                        block_sizes.get("scale_bits", (8, 0))
+                        if isinstance(block_sizes, dict) and "scale_bits" in block_sizes
+                        else (8, 0)
+                    )
+                    if num_bits == (2, 1) and scale_bits == (4, 3):
+                        return QUANTIZATION_NVFP4
+                    break
 
     for weight_name in weight_attr_names(module):
         quantization = _get_quantization_from_layer(module, quantizer_attr_names(weight_name))

From c1e09d80972d910badd055a625da079bf54ab03e Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 00:22:31 -0400
Subject: [PATCH 03/17] =?UTF-8?q?fix:=20Qwen3.6=20MoE=20export=20=E2=80=94?=
 =?UTF-8?q?=20architecture=20mapping,=20GDN=20handling,=20isinstance=20che?=
 =?UTF-8?q?ck?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add Qwen3_5MoeForConditionalGeneration to export/import mappings
- Add Qwen3VLModel + HybridModel to GPTModelExporter isinstance check
- Handle GatedDeltaNet layers in _get_transformer_layer_state_dict
- Fix quantizer format detection for disabled quantizers
---
 modelopt/torch/export/plugins/mcore_common.py |  2 ++
 modelopt/torch/export/quant_utils.py          | 35 ++++++++++++-------
 .../torch/export/unified_export_megatron.py   | 21 +++++++++--
 3 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/modelopt/torch/export/plugins/mcore_common.py b/modelopt/torch/export/plugins/mcore_common.py
index d5bab9b4ece..ec262bcf094 100644
--- a/modelopt/torch/export/plugins/mcore_common.py
+++ b/modelopt/torch/export/plugins/mcore_common.py
@@ -52,6 +52,7 @@
     "LlamaForCausalLMEagle3Deep": eagle3_deep_llama_causal_lm_export,
     "Qwen3ForCausalLM": qwen3_causal_lm_export,
     "Qwen3MoeForCausalLM": qwen3_causal_lm_export,
+    "Qwen3_5MoeForConditionalGeneration": qwen3_causal_lm_export,
     "Qwen2ForCausalLM": qwen25_causal_lm_export,
     "GptOssForCausalLM": gptoss_causal_lm_export,
 }
@@ -64,6 +65,7 @@
     "NemotronHForCausalLM": nemotron_h_causal_lm_import,
     "Qwen3ForCausalLM": qwen3_causal_lm_import,
     "Qwen3MoeForCausalLM": qwen3_causal_lm_import,
+    "Qwen3_5MoeForConditionalGeneration": qwen3_causal_lm_import,
     "Qwen2ForCausalLM": qwen25_causal_lm_import,
     "GptOssForCausalLM": gptoss_causal_lm_import,
 }
diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index c0f2174b24d..cc55545699d 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -668,23 +668,32 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
     # Handle _QuantFusedExperts modules (e.g. Qwen3.x MoE) which use plural
     # ModuleList quantizers (gate_up_proj_weight_quantizers, down_proj_weight_quantizers)
     # instead of singular weight_quantizer attributes.
+    # The quantization format is determined at module setup time, not per-expert.
+    # Check any quantizer in the list (even disabled ones) to determine the format,
+    # since calibration may not have activated all experts.
     for quantizer_list_name in ["gate_up_proj_weight_quantizers", "down_proj_weight_quantizers"]:
         quantizer_list = getattr(module, quantizer_list_name, None)
         if quantizer_list is not None and len(quantizer_list) > 0:
-            # Find the first enabled quantizer — expert 0 may be disabled if
-            # uncalibrated, so we iterate rather than checking index 0 only.
-            for q in quantizer_list:
-                if hasattr(q, "is_enabled") and q.is_enabled:
-                    num_bits = getattr(q, "num_bits", None)
-                    block_sizes = getattr(q, "block_sizes", None)
-                    scale_bits = (
-                        block_sizes.get("scale_bits", (8, 0))
-                        if isinstance(block_sizes, dict) and "scale_bits" in block_sizes
-                        else (8, 0)
-                    )
-                    if num_bits == (2, 1) and scale_bits == (4, 3):
-                        return QUANTIZATION_NVFP4
+            # Check any quantizer — enabled or not — for format config.
+            # Prefer enabled ones first, but fall back to any if none are enabled.
+            q = None
+            for candidate in quantizer_list:
+                if hasattr(candidate, "is_enabled") and candidate.is_enabled:
+                    q = candidate
                     break
+            if q is None:
+                q = quantizer_list[0]
+
+            num_bits = getattr(q, "num_bits", None)
+            block_sizes = getattr(q, "block_sizes", None)
+            scale_bits = (
+                block_sizes.get("scale_bits", (8, 0))
+                if isinstance(block_sizes, dict) and "scale_bits" in block_sizes
+                else (8, 0)
+            )
+            if num_bits == (2, 1) and scale_bits == (4, 3):
+                return QUANTIZATION_NVFP4
+            # Add other expert quantization format checks here as needed
 
     for weight_name in weight_attr_names(module):
         quantization = _get_quantization_from_layer(module, quantizer_attr_names(weight_name))
diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 62053e549c8..d5a911afb4f 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -73,6 +73,8 @@
     from megatron.core.models.gpt import GPTModel
     from megatron.core.models.mamba import MambaModel
     from megatron.core.models.multimodal.llava_model import LLaVAModel
+    from megatron.core.models.gpt.hybrid_model import HybridModel
+    from megatron.bridge.models.qwen_vl import Qwen3VLModel
     from megatron.core.parallel_state import (
         get_pipeline_model_parallel_rank,
         get_pipeline_model_parallel_world_size,
@@ -121,7 +123,7 @@ def __init__(
         moe_router_dtype: str | None = None,
     ):
         """Create a GPTModel exporter instance."""
-        if not isinstance(model, (GPTModel, MambaModel, LLaVAModel)):
+        if not isinstance(model, (GPTModel, MambaModel, HybridModel, LLaVAModel, Qwen3VLModel)):
             raise ValueError("Input to GPTModelExport must be a megatron.core.models.GPTModel!")
 
         self._state_dict = OrderedDict()
@@ -460,8 +462,21 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
             self.rules["linear_kv_layernorm"](layer.self_attention.kv_layernorm, layer_id)
             self.rules["linear_kv_up_proj"](layer.self_attention.linear_kv_up_proj, layer_id)
             self.rules["linear_proj"](layer.self_attention.linear_proj, layer_id)
+        elif "GatedDeltaNet" in str(type(layer.self_attention)):
+            # GatedDeltaNet (linear attention) has in_proj, out_norm, out_proj
+            # instead of linear_qkv, q_layernorm, etc.
+            if "gated_delta_net_in_proj" in self.rules:
+                self.rules["gated_delta_net_in_proj"](layer.self_attention.in_proj, layer_id)
+            else:
+                self.rules["linear_qkv"](layer.self_attention.in_proj, layer_id)
+            if hasattr(layer.self_attention, "out_norm") and not isinstance(
+                layer.self_attention.out_norm, IdentityOp
+            ):
+                if "gated_delta_net_out_norm" in self.rules:
+                    self.rules["gated_delta_net_out_norm"](layer.self_attention.out_norm, layer_id)
+            self.rules["linear_proj"](layer.self_attention.out_proj, layer_id)
         else:
-            if layer.self_attention.q_layernorm is not None and not isinstance(
+            if hasattr(layer.self_attention, "q_layernorm") and layer.self_attention.q_layernorm is not None and not isinstance(
                 layer.self_attention.q_layernorm, (IdentityOp, L2Norm)
             ):
                 self.rules["q_layernorm"](layer.self_attention.q_layernorm, layer_id)
@@ -473,7 +488,7 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
         ):  # KV cache quant export
             self.rules["core_attention"](layer.self_attention.core_attention, layer_id)
         self.rules["linear_proj"](layer.self_attention.linear_proj, layer_id)
-        if getattr(layer.self_attention.core_attention, "softmax_offset", None) is not None:
+        if hasattr(layer.self_attention, "core_attention") and getattr(layer.self_attention.core_attention, "softmax_offset", None) is not None:
             self.rules["softmax_offset"](
                 layer.self_attention.core_attention.softmax_offset, layer_id
             )

From 0ddd356f3174d5b3e71cb5e7a888849d49b3fbc4 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 00:26:41 -0400
Subject: [PATCH 04/17] fix: correct HybridModel import path
 (hybrid.hybrid_model, not gpt.hybrid_model)

---
 modelopt/torch/export/unified_export_megatron.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index d5a911afb4f..2d59df8e349 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -73,7 +73,7 @@
     from megatron.core.models.gpt import GPTModel
     from megatron.core.models.mamba import MambaModel
     from megatron.core.models.multimodal.llava_model import LLaVAModel
-    from megatron.core.models.gpt.hybrid_model import HybridModel
+    from megatron.core.models.hybrid.hybrid_model import HybridModel
     from megatron.bridge.models.qwen_vl import Qwen3VLModel
     from megatron.core.parallel_state import (
         get_pipeline_model_parallel_rank,

From 9510d89822ab860353582a5ace0fb9fc9edd2973 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 00:30:14 -0400
Subject: [PATCH 05/17] fix: GDN in_proj uses dedicated rule (no QKV slicing),
 add GDN rules to qwen3 mapping

---
 modelopt/torch/export/plugins/mcore_qwen.py      | 3 +++
 modelopt/torch/export/unified_export_megatron.py | 3 +--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/modelopt/torch/export/plugins/mcore_qwen.py b/modelopt/torch/export/plugins/mcore_qwen.py
index 5c4ae0647d8..4e9b77a6c78 100644
--- a/modelopt/torch/export/plugins/mcore_qwen.py
+++ b/modelopt/torch/export/plugins/mcore_qwen.py
@@ -68,6 +68,9 @@
     "router": NameRemapping("model.layers.{}.mlp.gate."),
     "local_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.experts.{}."),
     "local_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.experts.{}.down_proj."),
+    # GatedDeltaNet (linear attention) — no QKV slicing, direct name remap
+    "gated_delta_net_in_proj": NameRemapping("model.layers.{}.linear_attn.in_proj."),
+    "gated_delta_net_out_norm": NameRemapping("model.layers.{}.linear_attn.out_norm."),
 }
 
 qwen25_causal_lm_import: dict[str, CustomModuleMapping] = {
diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 2d59df8e349..f5a5726a04c 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -465,10 +465,9 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
         elif "GatedDeltaNet" in str(type(layer.self_attention)):
             # GatedDeltaNet (linear attention) has in_proj, out_norm, out_proj
             # instead of linear_qkv, q_layernorm, etc.
+            # Use dedicated GDN rules if available (no QKV slicing), else skip.
             if "gated_delta_net_in_proj" in self.rules:
                 self.rules["gated_delta_net_in_proj"](layer.self_attention.in_proj, layer_id)
-            else:
-                self.rules["linear_qkv"](layer.self_attention.in_proj, layer_id)
             if hasattr(layer.self_attention, "out_norm") and not isinstance(
                 layer.self_attention.out_norm, IdentityOp
             ):

From 3b863b74c135c4200b8a4525adac3d8f25b33c33 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 00:32:23 -0400
Subject: [PATCH 06/17] fix: add shared_experts rules to qwen3 export mapping
 for Qwen3.6 MoE

---
 modelopt/torch/export/plugins/mcore_qwen.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modelopt/torch/export/plugins/mcore_qwen.py b/modelopt/torch/export/plugins/mcore_qwen.py
index 4e9b77a6c78..f75ee0b301d 100644
--- a/modelopt/torch/export/plugins/mcore_qwen.py
+++ b/modelopt/torch/export/plugins/mcore_qwen.py
@@ -68,6 +68,9 @@
     "router": NameRemapping("model.layers.{}.mlp.gate."),
     "local_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.experts.{}."),
     "local_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.experts.{}.down_proj."),
+    # Shared experts (Qwen3.6 MoE)
+    "shared_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.shared_experts."),
+    "shared_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.shared_experts.down_proj."),
     # GatedDeltaNet (linear attention) — no QKV slicing, direct name remap
     "gated_delta_net_in_proj": NameRemapping("model.layers.{}.linear_attn.in_proj."),
     "gated_delta_net_out_norm": NameRemapping("model.layers.{}.linear_attn.out_norm."),

From 344badbf065220812ee08be0e6e33f67580858af Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 00:34:41 -0400
Subject: [PATCH 07/17] fix: handle empty tensors in NVFP4QTensor.quantize
 (TP/EP sharding zero-slice)

---
 modelopt/torch/quantization/qtensor/nvfp4_tensor.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
index 6ff31424c77..8c2b9cfb0d7 100644
--- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
+++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
@@ -286,6 +286,14 @@ def quantize(
             input, block_size, weights_scaling_factor_2
        )
 
+        # Handle empty tensors (e.g. from TP/EP sharding where this rank has no slice)
+        if input.numel() == 0:
+            return (
+                cls(input_shape, input_dtype, input),
+                torch.zeros(*input.shape[:-1], device=input.device, dtype=torch.float8_e4m3fn),
+                torch.zeros(1, device=input.device, dtype=torch.float32),
+            )
+
         # Reshape the weight and scale factors
         original_shape = input.shape
         input = input.view((*tuple(input.shape[:-1]), -1, block_size))

From 1ca8f7a5e58f42cbeef5e2c13c674059d6bb6ca2 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 00:50:04 -0400
Subject: [PATCH 08/17] fix: GDN out_proj uses dedicated rule
 (linear_attn.out_proj not self_attn.o_proj)

---
 modelopt/torch/export/plugins/mcore_qwen.py      | 1 +
 modelopt/torch/export/unified_export_megatron.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/modelopt/torch/export/plugins/mcore_qwen.py b/modelopt/torch/export/plugins/mcore_qwen.py
index f75ee0b301d..59c402f6bcf 100644
--- a/modelopt/torch/export/plugins/mcore_qwen.py
+++ b/modelopt/torch/export/plugins/mcore_qwen.py
@@ -74,6 +74,7 @@
     # GatedDeltaNet (linear attention) — no QKV slicing, direct name remap
     "gated_delta_net_in_proj": NameRemapping("model.layers.{}.linear_attn.in_proj."),
     "gated_delta_net_out_norm": NameRemapping("model.layers.{}.linear_attn.out_norm."),
+    "gated_delta_net_out_proj": NameRemapping("model.layers.{}.linear_attn.out_proj."),
 }
 
 qwen25_causal_lm_import: dict[str, CustomModuleMapping] = {
diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index f5a5726a04c..590fe989b6c 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -473,7 +473,10 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
             ):
                 if "gated_delta_net_out_norm" in self.rules:
                     self.rules["gated_delta_net_out_norm"](layer.self_attention.out_norm, layer_id)
-            self.rules["linear_proj"](layer.self_attention.out_proj, layer_id)
+            if "gated_delta_net_out_proj" in self.rules:
+                self.rules["gated_delta_net_out_proj"](layer.self_attention.out_proj, layer_id)
+            else:
+                self.rules["linear_proj"](layer.self_attention.out_proj, layer_id)
         else:
             if hasattr(layer.self_attention, "q_layernorm") and layer.self_attention.q_layernorm is not None and not isinstance(
                 layer.self_attention.q_layernorm, (IdentityOp, L2Norm)

From 4992fb071f533f6afad9451a0b29aebbe0030240 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 01:02:19 -0400
Subject: [PATCH 09/17] fix: add EP rank offset to expert_id during export
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With EP=2, local_experts are indexed 0..127 per rank but global IDs must
account for EP rank. rank 0 → 0-127, rank 1 → 128-255.
---
 modelopt/torch/export/unified_export_megatron.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 590fe989b6c..3a3a70a159e 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -520,8 +520,15 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
                     layer.mlp.shared_experts.linear_fc2, layer_id
                 )
             if hasattr(layer.mlp.experts, "local_experts"):
+                # With expert parallelism, local_experts are indexed 0..N-1 per rank,
+                # but the global expert ID needs the EP rank offset.
+                from megatron.core.parallel_state import get_expert_model_parallel_rank, get_expert_model_parallel_world_size
+                ep_rank = get_expert_model_parallel_rank()
+                ep_size = get_expert_model_parallel_world_size()
+                num_local = len(layer.mlp.experts.local_experts)
                 if not self.rules.get("use_packed_local_experts", False):
-                    for expert_id, expert in enumerate(layer.mlp.experts.local_experts):
+                    for local_id, expert in enumerate(layer.mlp.experts.local_experts):
+                        expert_id = ep_rank * num_local + local_id
                         self.rules["local_experts.linear_fc1"](
                             expert.linear_fc1, layer_id, expert_id
                         )

From e254f81088cb7f5c043ab0c4b5e0c7525eac3b20 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 01:23:29 -0400
Subject: [PATCH 10/17] debug: add layer timing logs to export _get_state_dict

---
 modelopt/torch/export/unified_export_megatron.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 3a3a70a159e..6aa61c311b0 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -395,6 +395,8 @@ def extra_state_dict(self):
 
     def _get_state_dict(self):
         model = self.model
+        import time as _time
+        _start = _time.time()
 
         # Embedding
         if hasattr(model, "embedding"):

From 5cfadba1b29618c1e6da9d603e317b1dab8c351c Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 01:56:43 -0400
Subject: [PATCH 11/17] debug: add expert iteration logging to export

---
 modelopt/torch/export/unified_export_megatron.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 6aa61c311b0..a51589293a6 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -528,9 +528,11 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
                 ep_rank = get_expert_model_parallel_rank()
                 ep_size = get_expert_model_parallel_world_size()
                 num_local = len(layer.mlp.experts.local_experts)
+                print(f"[export] layer {layer_id}: {num_local} local_experts, ep_rank={ep_rank}, ep_size={ep_size}", flush=True)
                 if not self.rules.get("use_packed_local_experts", False):
                     for local_id, expert in enumerate(layer.mlp.experts.local_experts):
                         expert_id = ep_rank * num_local + local_id
+                        print(f"[export] expert {local_id} -> global {expert_id}, linear_fc1={type(expert.linear_fc1).__name__}", flush=True)
                         self.rules["local_experts.linear_fc1"](
                             expert.linear_fc1, layer_id, expert_id
                         )

From 83c067602088fc4089fc02a27f0f3b4c9caf3d09 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 02:09:48 -0400
Subject: [PATCH 12/17] debug: more trace prints around export state dict
 building

---
 modelopt/torch/export/unified_export_megatron.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index a51589293a6..dcf64189e38 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -271,7 +271,9 @@ def save_pretrained(
         is_last_stage_main_rank = pp_rank == pp_size - 1 and tp_rank == 0
 
         # Main export process
+        print("[export] About to build layer_state_dicts...", flush=True)
         layer_state_dicts = self.layer_state_dicts
+        print(f"[export] Built {len(layer_state_dicts)} layer state dicts", flush=True)
         quantization_format = self._get_quantization_format(self.model)
         quantization = None
@@ -394,6 +396,7 @@ def extra_state_dict(self):
         return self._state_dict
 
     def _get_state_dict(self):
+        print("[export] _get_state_dict called", flush=True)
         model = self.model
         import time as _time
         _start = _time.time()
@@ -403,6 +406,7 @@ def _get_state_dict(self):
         self.rules["word_embeddings"](model.embedding.word_embeddings)
 
         # Decoder layers
+        print(f"[export] Iterating {len(model.decoder.layers)} decoder layers", flush=True)
         for layer in model.decoder.layers:
             layer_id = layer.layer_number - 1
             if isinstance(layer, MambaLayer):

From f1e294497a8b7ce7067aeb2ee7a7de2c07a0b88d Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 02:17:28 -0400
Subject: [PATCH 13/17] debug: clean MLP diagnostic inside
 _get_transformer_layer_state_dict

---
 modelopt/torch/export/unified_export_megatron.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index dcf64189e38..4c461d2278f 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -442,6 +442,19 @@ def _get_fused_norm_weight(self, module):
         return getattr(module, "layer_norm_weight", None)
 
     def _get_transformer_layer_state_dict(self, layer, layer_id):
+        if layer_id == 0:
+            print(f"[diag] layer.mlp type: {type(layer.mlp).__name__}", flush=True)
+            print(f"[diag] mlp attrs: {[a for a in dir(layer.mlp) if not a.startswith('_')][:25]}", flush=True)
+            print(f"[diag] hasattr mlp.experts: {hasattr(layer.mlp, 'experts')}", flush=True)
+            if hasattr(layer.mlp, 'experts'):
+                print(f"[diag] experts type: {type(layer.mlp.experts).__name__}", flush=True)
+                print(f"[diag] hasattr local_experts: {hasattr(layer.mlp.experts, 'local_experts')}", flush=True)
+                if hasattr(layer.mlp.experts, 'local_experts'):
+                    print(f"[diag] num local_experts: {len(layer.mlp.experts.local_experts)}", flush=True)
+            print(f"[diag] hasattr shared_experts: {hasattr(layer.mlp, 'shared_experts')}", flush=True)
+            if hasattr(layer.mlp, 'config'):
+                print(f"[diag] mlp.config.num_experts: {getattr(layer.mlp.config, 'num_experts', 'N/A')}", flush=True)
+
         if not isinstance(layer.input_layernorm, IdentityOp):
             self.rules["input_layernorm"](layer.input_layernorm, layer_id)
         elif (

From f693f35210ae1cda236cb7a3442ed8541b24dff6 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 10:28:38 -0400
Subject: [PATCH 14/17] fix: add GroupedMLPSlicing for TEGroupedMLP export +
 bypass broken lambda dispatch

---
 modelopt/torch/export/plugins/mcore_qwen.py      |  4 ++++
 modelopt/torch/export/unified_export_megatron.py | 15 +++++++++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/modelopt/torch/export/plugins/mcore_qwen.py b/modelopt/torch/export/plugins/mcore_qwen.py
index 59c402f6bcf..c26275e7517 100644
--- a/modelopt/torch/export/plugins/mcore_qwen.py
+++ b/modelopt/torch/export/plugins/mcore_qwen.py
@@ -24,6 +24,7 @@
     CustomModuleMapping,
     GatedMLPMerging,
     GatedMLPSlicing,
+    GroupedMLPSlicing,
     NameRemapping,
     QKVMerging,
     QKVSlicing,
@@ -68,6 +69,9 @@
     "router": NameRemapping("model.layers.{}.mlp.gate."),
     "local_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.experts.{}."),
     "local_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.experts.{}.down_proj."),
+    # Grouped experts (TEGroupedMLP: fused per-expert weights via grouped GEMM)
+    "experts.linear_fc1": GroupedMLPSlicing("model.layers.{}.mlp.experts.{}.up_proj"),
+    "experts.linear_fc2": GroupedMLPSlicing("model.layers.{}.mlp.experts.{}.down_proj"),
     # Shared experts (Qwen3.6 MoE)
     "shared_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.shared_experts."),
     "shared_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.shared_experts.down_proj."),
diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 4c461d2278f..ce709dec67b 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -567,10 +567,17 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
             elif "experts.linear_fc1" in self.rules:
                 # TEGroupedMLP: experts use fused grouped GEMM with a single
                 # linear_fc1/linear_fc2 for all experts (no local_experts attribute).
-                # Uses "experts.linear_fc1" rule (GroupedMLPMerging) instead of
-                # "local_experts.linear_fc1" which expects per-expert iteration.
-                self.rules["experts.linear_fc1"](layer.mlp.experts.linear_fc1, layer_id)
-                self.rules["experts.linear_fc2"](layer.mlp.experts.linear_fc2, layer_id)
+                # Call _grouped_mlp_slicing directly because the lambda-based dispatch
+                # cannot handle two-placeholder prefixes (layer_id + expert_id).
+                raw_mappings = all_mcore_hf_export_mapping[self.arch]
+                fc1_prefix = raw_mappings["experts.linear_fc1"].target_name_or_prefix
+                fc2_prefix = raw_mappings["experts.linear_fc2"].target_name_or_prefix
+                self._grouped_mlp_slicing(
+                    layer.mlp.experts.linear_fc1, fc1_prefix.format(layer_id)
+                )
+                self._grouped_mlp_slicing(
+                    layer.mlp.experts.linear_fc2, fc2_prefix.format(layer_id)
+                )
             else:
                 self.rules["linear_fc1"](layer.mlp.linear_fc1, layer_id)
                 self.rules["linear_fc2"](layer.mlp.linear_fc2, layer_id)

From 8c9bf6e39bbea23c6f7bd154426d9da30ed477f0 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 12:26:14 -0400
Subject: [PATCH 15/17] fix: EP-aware MoE export with per-rank write and merge

Three bugs fixed for multi-rank EP MoE export (Qwen3.6-35B-A3B), plus a new
save strategy:

1. Format string bug: fc1/fc2 prefix has two {} placeholders (layer_id,
   expert_id). Using .format(layer_id) fails. Fixed with re.sub to fill
   only first {}.

2. Expert offset bug: _grouped_mlp_slicing had no EP rank awareness.
   Both ranks wrote experts 0-127 with overlapping keys. Added
   expert_offset param from get_expert_model_parallel_rank() *
   num_local_experts.

3. weight_key bug: used global expert_id for module lookup instead of
   local_expert_id. Module has weight0..weight127, not weight128..weight255.

4. Save strategy: all_gather_object causes OOM (pickle overhead on ~40k
   tensors). Each rank now writes to separate NFS dir, then rank 0 merges
   safetensors shard-by-shard with low memory footprint.
---
 .../torch/export/unified_export_megatron.py | 77 +++++++++++++++++--
 1 file changed, 69 insertions(+), 8 deletions(-)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index ce709dec67b..3b997932010 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -1,3 +1,4 @@
+import re
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -367,14 +368,63 @@ def save_pretrained(
             with open(config_json_file, "w") as f:
                 json.dump(config_dict, f, indent=4)
 
-        # save_safetensors(state_dict, save_directory)
+        # Each EP rank writes to its own subdirectory to avoid OOM from gathering
+        if torch.distributed.is_initialized():
+            world_size = torch.distributed.get_world_size()
+            rank = torch.distributed.get_rank()
+        else:
+            world_size = 1
+            rank = 0
+
+        rank_save_dir = save_directory + "_rank" + str(rank)
+        os.makedirs(rank_save_dir, exist_ok=True)
+
+        # Each rank writes its own layer shards
         save_safetensors_by_layer_index(
             layer_state_dicts=layer_state_dicts,
             total_layers=self.model.config.num_layers,
-            save_directory=save_directory,
+            save_directory=rank_save_dir,
             name_template="model-{:05d}-of-{:05d}",
         )
 
+        if torch.distributed.is_initialized():
+            torch.distributed.barrier()
+
+        # Rank 0 merges per-shard safetensors from all rank dirs
+        if rank == 0 and world_size > 1:
+            print("[export] Merging shard files from all ranks...", flush=True)
+            from safetensors import safe_open as _safe_open
+            from safetensors.torch import save_file as _save_file
+            for layer_idx in range(self.model.config.num_layers):
+                shard_name = "model-{:05d}-of-{:05d}".format(layer_idx + 1, self.model.config.num_layers)
+                ckpt_name = shard_name + ".safetensors"
+                meta_name = shard_name + ".json"
+                merged_dict = {}
+                for r in range(world_size):
+                    rdir = save_directory + "_rank" + str(r)
+                    fpath = os.path.join(rdir, ckpt_name)
+                    if os.path.exists(fpath):
+                        with _safe_open(fpath, framework="pt") as f:
+                            for k in f.keys():
+                                merged_dict[k] = f.get_tensor(k)
+                # Write merged shard
+                os.makedirs(save_directory, exist_ok=True)
+                _save_file(merged_dict, os.path.join(save_directory, ckpt_name), metadata={"format": "pt"})
+                # Build metadata
+                weight_map = {}
+                total_size = 0
+                for k, v in merged_dict.items():
+                    weight_map[k] = ckpt_name
+                    total_size += v.numel() * v.element_size()
+                with open(os.path.join(save_directory, meta_name), "w") as f:
+                    json.dump({"metadata": {"total_size": total_size}, "weight_map": weight_map}, f, indent=4)
+            print(f"[export] Merged {len(merged_dict)} keys per layer across {world_size} ranks", flush=True)
+        elif rank == 0:
+            # Single rank, just rename dir
+            import shutil
+            if os.path.exists(save_directory + "_rank0"):
+                shutil.move(save_directory + "_rank0", save_directory)
+
     @property
     def state_dict(self):
         """Return the real quantized state_dict of the base model."""
@@ -572,11 +622,20 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
                 raw_mappings = all_mcore_hf_export_mapping[self.arch]
                 fc1_prefix = raw_mappings["experts.linear_fc1"].target_name_or_prefix
                 fc2_prefix = raw_mappings["experts.linear_fc2"].target_name_or_prefix
+                # Fill only the first {} (layer_id), leave second {} for expert_id in _grouped_mlp_slicing
+                fc1_prefix_partial = re.sub(r'\{\}', str(layer_id), fc1_prefix, count=1)
+                fc2_prefix_partial = re.sub(r'\{\}', str(layer_id), fc2_prefix, count=1)
+                # With EP>1, each rank only has a subset of experts. Offset the expert IDs
+                # by ep_rank * num_local_experts so all ranks write to non-overlapping keys.
+                from megatron.core.parallel_state import get_expert_model_parallel_rank
+                ep_rank = get_expert_model_parallel_rank()
+                expert_offset = ep_rank * layer.mlp.experts.linear_fc1.num_gemms
+                print(f"[export] layer {layer_id}: TEGroupedMLP, ep_rank={ep_rank}, expert_offset={expert_offset}", flush=True)
                 self._grouped_mlp_slicing(
-                    layer.mlp.experts.linear_fc1, fc1_prefix.format(layer_id)
+                    layer.mlp.experts.linear_fc1, fc1_prefix_partial, expert_offset=expert_offset
                 )
                 self._grouped_mlp_slicing(
-                    layer.mlp.experts.linear_fc2, fc2_prefix.format(layer_id)
+                    layer.mlp.experts.linear_fc2, fc2_prefix_partial, expert_offset=expert_offset
                 )
             else:
                 self.rules["linear_fc1"](layer.mlp.linear_fc1, layer_id)
                 self.rules["linear_fc2"](layer.mlp.linear_fc2, layer_id)
@@ -1003,7 +1062,7 @@ def _gated_mlp_slicing(
             self._state_dict[gate_proj_key] = val.detach().clone()
             self._state_dict[up_proj_key] = val.detach().clone()
 
-    def _grouped_mlp_slicing(self, module, prefix, parallel_config=None):
+    def _grouped_mlp_slicing(self, module, prefix, parallel_config=None, expert_offset=0):
         """Export TEGroupedMLP weights by splitting per-expert weights into individual HF weights.
 
         TEGroupedMLP (via TEGroupedLinear) stores weights as weight0, weight1, ..., weight{N-1}
@@ -1033,9 +1092,10 @@ def _grouped_mlp_slicing(self, module, prefix, parallel_config=None):
         state_dict = module.state_dict()
 
-        for expert_id in range(num_experts):
+        for local_expert_id in range(num_experts):
+            expert_id = expert_offset + local_expert_id
             expert_prefix = prefix.format(expert_id) + "."
-            weight_key = f"weight{expert_id}"
+            weight_key = f"weight{local_expert_id}"
 
             if weight_key not in state_dict:
                 raise ValueError(f"Missing expected TEGroupedMLP expert weight: {weight_key}")
@@ -1060,7 +1120,8 @@ def _grouped_mlp_slicing(self, module, prefix, parallel_config=None):
         for key, val in name_to_value.items():
             if key == "output_scale":
                 continue
-            for expert_id in range(num_experts):
+            for local_expert_id in range(num_experts):
+                expert_id = expert_offset + local_expert_id
                 expert_prefix = prefix.format(expert_id) + "."
                 self._state_dict[expert_prefix + key] = val.detach().clone()

From d37ee10c68170fe18883a672c6ef30bdc58d4760 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 16:12:24 -0400
Subject: [PATCH 16/17] fix: GroupedGatedMLPSlicing class + shared tensor
 clone + qwen export dispatch

- Add GroupedGatedMLPSlicing class to mcore_custom.py for TEGroupedMLP gate/up split
- Add _grouped_gated_mlp_slicing method to GPTModelExporter
- Clone shared-storage tensors before safetensors save (NVFP4 weight_scale broadcast)
- Dispatch fc1 slicing based on mapping func_name for correct expert handling
---
 modelopt/torch/export/plugins/mcore_custom.py | 20 +++++
 modelopt/torch/export/plugins/mcore_qwen.py   |  3 +-
 .../torch/export/unified_export_megatron.py  | 87 ++++++++++++++++++-
 3 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/modelopt/torch/export/plugins/mcore_custom.py b/modelopt/torch/export/plugins/mcore_custom.py
index 204ff012c71..8ba4c5ea81c 100644
--- a/modelopt/torch/export/plugins/mcore_custom.py
+++ b/modelopt/torch/export/plugins/mcore_custom.py
@@ -175,6 +175,17 @@ def __init__(self, target_name_or_prefix: str = "", func_kwargs: dict[str, Any] = {}):
         )
 
 
+class GroupedGatedMLPSlicing(CustomModuleMapping):
+    """A custom module mapping for TEGroupedMLP that splits fused gate_up into gate_proj + up_proj per expert."""
+
+    def __init__(self, target_name_or_prefix: str = "", func_kwargs: dict[str, Any] = {}):
+        """Create a custom module mapping for grouped gated MLP slicing."""
+        super().__init__(
+            func_name="grouped_gated_mlp_slicing",
+            target_name_or_prefix=target_name_or_prefix,
+            func_kwargs=func_kwargs,
+        )
+
+
 class PackNameRemapping(CustomModuleMapping):
     """A custom module mapping that packs module after name remapping."""
 
@@ -318,6 +329,15 @@ def save_safetensors_by_layer_index(
                 f,
                 indent=4,
             )
+    # Clone tensors that share storage (NVFP4 weight_scale broadcast causes this)
+    seen_storages = {}
+    for _key, _val in layer_state_dict.items():
+        _sid = id(_val.storage())
+        if _sid in seen_storages:
+            layer_state_dict[_key] = _val.clone()
+        else:
+            seen_storages[_sid] = _key
+
     save_file(layer_state_dict, save_directory + "/" + ckpt_filename, metadata={"format": "pt"})
 
     # [TODO]: this global barrier needs to be replaced with something safer
diff --git a/modelopt/torch/export/plugins/mcore_qwen.py b/modelopt/torch/export/plugins/mcore_qwen.py
index c26275e7517..1c8b0d414ae 100644
--- a/modelopt/torch/export/plugins/mcore_qwen.py
+++ b/modelopt/torch/export/plugins/mcore_qwen.py
@@ -25,6 +25,7 @@
     GatedMLPMerging,
     GatedMLPSlicing,
     GroupedMLPSlicing,
+    GroupedGatedMLPSlicing,
     NameRemapping,
     QKVMerging,
     QKVSlicing,
@@ -70,7 +71,7 @@
     "local_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.experts.{}."),
     "local_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.experts.{}.down_proj."),
     # Grouped experts (TEGroupedMLP: fused per-expert weights via grouped GEMM)
-    "experts.linear_fc1": GroupedMLPSlicing("model.layers.{}.mlp.experts.{}.up_proj"),
+    "experts.linear_fc1": GroupedGatedMLPSlicing("model.layers.{}.mlp.experts.{}"),
     "experts.linear_fc2": GroupedMLPSlicing("model.layers.{}.mlp.experts.{}.down_proj"),
     # Shared experts (Qwen3.6 MoE)
     "shared_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.shared_experts."),
diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 3b997932010..c00d56d54ca 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -631,7 +631,11 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
                 ep_rank = get_expert_model_parallel_rank()
                 expert_offset = ep_rank * layer.mlp.experts.linear_fc1.num_gemms
                 print(f"[export] layer {layer_id}: TEGroupedMLP, ep_rank={ep_rank}, expert_offset={expert_offset}", flush=True)
-                self._grouped_mlp_slicing(
+                # Dispatch based on the mapping func_name — grouped_gated_mlp_slicing
+                # splits fused gate_up into gate_proj + up_proj per expert.
+                fc1_func_name = raw_mappings["experts.linear_fc1"].func_name
+                fc1_method = getattr(self, f"_{fc1_func_name}")
+                fc1_method(
                     layer.mlp.experts.linear_fc1, fc1_prefix_partial, expert_offset=expert_offset
                 )
                 self._grouped_mlp_slicing(
@@ -835,6 +839,7 @@ def _custom_mapping_to_lambda(mapping):
         "self_attention_scaling": self._self_attention_scaling,
         "gated_mlp_slicing": self._gated_mlp_slicing,
         "grouped_mlp_slicing": self._grouped_mlp_slicing,
+        "grouped_gated_mlp_slicing": self._grouped_gated_mlp_slicing,
         "pack_name_remapping": self._pack_name_remapping,
         "pack_name_remapping_gpt_oss": self._pack_name_remapping_gpt_oss,
     }
@@ -1125,6 +1130,86 @@ def _grouped_mlp_slicing(self, module, prefix, parallel_config=None, expert_offs
                 expert_prefix = prefix.format(expert_id) + "."
                 self._state_dict[expert_prefix + key] = val.detach().clone()
 
+    def _grouped_gated_mlp_slicing(self, module, prefix, parallel_config=None, expert_offset=0):
+        """Export TEGroupedMLP fused gate_up weights, splitting into per-expert gate_proj + up_proj.
+
+        Like _grouped_mlp_slicing but handles the fused gate_up (linear_fc1) case:
+        each expert weight is [2*ffn_hidden_size, hidden_size], split into
+        gate_proj [ffn_hidden_size, hidden_size] and up_proj [ffn_hidden_size, hidden_size].
+
+        Produces per-expert gate_proj and up_proj that vLLM expects for MoE models
+        with packed_modules_mapping = {"gate_up_proj": ["gate_proj", "up_proj"]}.
+        """
+        num_experts = module.num_gemms
+
+        has_weight = hasattr(module, "weight")
+        if not has_weight:
+            module.weight = module.weight0
+        try:
+            name_to_value, qformat, block_size = self._get_quantized_state(
+                module, self.dtype, prefix=prefix
+            )
+            weight_scale, weight_scale_2 = self._get_weight_scales(name_to_value, qformat)
+            name_to_value.pop("weight", None)
+        finally:
+            if not has_weight and hasattr(module, "weight"):
+                delattr(module, "weight")
+
+        state_dict = module.state_dict()
+        ffn_hidden_size = module.config.ffn_hidden_size
+        # For gated linear unit, ffn_hidden_size is already doubled (2 * moe_intermediate_size).
+        # We need the un-doubled per-projection size for the gate/up split.
+        gated_split = ffn_hidden_size // 2
+
+        for local_expert_id in range(num_experts):
+            expert_id = expert_offset + local_expert_id
+            expert_prefix = prefix.format(expert_id) + "."
+            weight_key = f"weight{local_expert_id}"
+
+            if weight_key not in state_dict:
+                raise ValueError(f"Missing expected TEGroupedMLP expert weight: {weight_key}")
+
+            weight = state_dict[weight_key].to(self.dtype).cpu()
+            gate_weight = weight[:gated_split, :]
+            up_weight = weight[gated_split:, :]
+
+            gate_prefix = expert_prefix + "gate_proj."
+            up_prefix = expert_prefix + "up_proj."
+
+            if weight_scale is None:
+                self._state_dict[gate_prefix + "weight"] = gate_weight
+                self._state_dict[up_prefix + "weight"] = up_weight
+            else:
+                if len(weight_scale.shape) == 0:
+                    gate_weight_scale = weight_scale.detach().clone()
+                    up_weight_scale = weight_scale.detach().clone()
+                else:
+                    gate_weight_scale = weight_scale[:gated_split]
+                    up_weight_scale = weight_scale[gated_split:]
+
+                self._state_dict[gate_prefix + "weight"] = to_quantized_weight(
+                    gate_weight, gate_weight_scale, qformat, weight_scale_2, block_size,
+                )
+                self._state_dict[up_prefix + "weight"] = to_quantized_weight(
+                    up_weight, up_weight_scale, qformat, weight_scale_2, block_size,
+                )
+                self._state_dict[gate_prefix + "weight_scale"] = gate_weight_scale
+                self._state_dict[up_prefix + "weight_scale"] = up_weight_scale
+
+            if weight_scale_2 is not None:
+                self._state_dict[gate_prefix + "weight_scale_2"] = weight_scale_2.detach().clone()
+                self._state_dict[up_prefix + "weight_scale_2"] = weight_scale_2.detach().clone()
+
+        for key, val in name_to_value.items():
+            if key == "output_scale":
+                continue
+            for local_expert_id in range(num_experts):
+                expert_id = expert_offset + local_expert_id
+                expert_prefix = prefix.format(expert_id) + "."
+                self._state_dict[expert_prefix + "gate_proj." + key] = val.detach().clone()
+                self._state_dict[expert_prefix + "up_proj." + key] = val.detach().clone()
+
     def _qkv_slicing(
         self,
         module,

From 26bec7d873a1436a3ce5d62be6896f1e39ce7762 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 17:32:36 -0400
Subject: [PATCH 17/17] fix: derive gated_split from actual weight shape
 instead of config.ffn_hidden_size

Both GatedMLPSlicing and GroupedGatedMLPSlicing used module.config.ffn_hidden_size
to determine the gate/up split point. For MoE models, ffn_hidden_size is often set
to hidden_size (2048) rather than the per-expert intermediate size (512), causing
gate_proj to receive the full fused weight and up_proj to be empty [0, N].

Now derives gated_split from the actual weight tensor shape (rows // 2).
---
 .../torch/export/unified_export_megatron.py | 24 ++++++++++++------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index c00d56d54ca..c643b7fe969 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -1020,9 +1020,12 @@ def _gated_mlp_slicing(
         gate_proj_prefix = prefix + gate_proj_name + "."
         up_proj_prefix = prefix + up_proj_name + "."
 
-        ffn_hidden_size = module.config.ffn_hidden_size
-        gate_proj_weight = weight[:ffn_hidden_size, :]
-        up_proj_weight = weight[ffn_hidden_size:, :]
+        # Derive split point from actual weight shape instead of config.ffn_hidden_size.
+        # For MoE models, ffn_hidden_size may not match the per-expert intermediate size.
+        gated_split = weight.shape[0] // 2
+        print(f"[PATCH] GatedMLPSlicing: actual_rows={weight.shape[0]}, gated_split={gated_split}, config.ffn_hidden_size={module.config.ffn_hidden_size}")
+        gate_proj_weight = weight[:gated_split, :]
+        up_proj_weight = weight[gated_split:, :]
 
         if weight_scale is None:
             self._state_dict[gate_proj_prefix + "weight"] = gate_proj_weight
@@ -1032,8 +1035,8 @@ def _gated_mlp_slicing(
                 gate_proj_weight_scale = weight_scale.detach().clone()
                 up_proj_weight_scale = weight_scale.detach().clone()
             else:
-                gate_proj_weight_scale = weight_scale[:ffn_hidden_size]
-                up_proj_weight_scale = weight_scale[ffn_hidden_size:]
+                gate_proj_weight_scale = weight_scale[:gated_split]
+                up_proj_weight_scale = weight_scale[gated_split:]
             self._state_dict[gate_proj_prefix + "weight"] = to_quantized_weight(
                 gate_proj_weight,
                 gate_proj_weight_scale,
@@ -1157,10 +1160,13 @@ def _grouped_gated_mlp_slicing(self, module, prefix, parallel_config=None, exper
                 delattr(module, "weight")
 
         state_dict = module.state_dict()
-        ffn_hidden_size = module.config.ffn_hidden_size
-        # For gated linear unit, ffn_hidden_size is already doubled (2 * moe_intermediate_size).
-        # We need the un-doubled per-projection size for the gate/up split.
-        gated_split = ffn_hidden_size // 2
+        # Derive gated_split from actual weight shape instead of config.ffn_hidden_size.
+        # For MoE models, ffn_hidden_size may not reflect the per-expert intermediate size.
+        # The fused gate_up weight is [2 * intermediate_size, hidden_size], so split at midpoint.
+        first_weight_key = f"weight0"
+        actual_rows = state_dict[first_weight_key].shape[0]
+        gated_split = actual_rows // 2
+        print(f"[PATCH] GroupedGatedMLPSlicing: actual_rows={actual_rows}, gated_split={gated_split}, config.ffn_hidden_size={module.config.ffn_hidden_size}")
 
         for local_expert_id in range(num_experts):
             expert_id = expert_offset + local_expert_id
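
NOTE (illustrative sketch, not part of the patches above): the midpoint gate/up
split that PATCH 17 derives from the weight shape can be reproduced standalone
roughly as follows; the function and tensor names here are hypothetical.

    import torch

    def split_gate_up(fused_weight: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Split a fused gate_up projection [2 * intermediate, hidden] at its row midpoint."""
        rows = fused_weight.shape[0]
        assert rows % 2 == 0, "fused gate_up weight must have an even number of rows"
        gated_split = rows // 2  # derived from the tensor itself, not from config.ffn_hidden_size
        return fused_weight[:gated_split, :], fused_weight[gated_split:, :]

    # Hypothetical per-expert fused weight: intermediate size 512, hidden size 2048.
    gate, up = split_gate_up(torch.randn(2 * 512, 2048))
    assert gate.shape == (512, 2048) and up.shape == (512, 2048)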