From 1b1fcedcf173b2e0bfc1513213ed2fe8777ffc4a Mon Sep 17 00:00:00 2001
From: Lenny Potato
Date: Wed, 22 Apr 2026 14:36:21 -0400
Subject: [PATCH 01/17] Fix NVFP4 quantization for Qwen3.x MoE models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four bugs prevent NVFP4 export from producing quantized weights for
Qwen3.5/3.6 MoE models (and potentially other fused MoE architectures).
All produce silent failures — no errors, just bfloat16 output identical
to input.

Bug 1: is_multimodal_model() crashes when config.architectures is None
- model_utils.py: add 'or []' fallback for NoneType iteration

Bug 3: get_quantization_format() doesn't recognize _QuantFusedExperts
- quant_utils.py: add check for plural ModuleList quantizers
  (gate_up_proj_weight_quantizers, down_proj_weight_quantizers) before
  the singular weight_quantizer loop

Bug 4: NVFP4 config wildcards don't match plural quantizer names
- config.py: _nvfp4_selective_quant_cfg() only generates patterns for
  singular 'weight_quantizer', but _QuantFusedExperts creates plural
  ModuleList quantizers. Add wildcard entries for both
  gate_up_proj_weight_quantizers* and down_proj_weight_quantizers*

Bug 5: _process_quantized_modules elif order sends fused MoE to wrong path
- unified_export_hf.py: swap elif branches so hasattr check for
  gate_up_proj_weight_quantizers comes before type-name checks. Without
  this, QuantQwen3_5MoeExperts hits the singular-attribute branch and
  crashes with AttributeError

Tested on: Qwen3.6-35B-A3B (MoE), NVIDIA DGX Spark (GB10),
modelopt 0.45.0 dev, transformers 5.5.4
Output: 20.5 GB NVFP4 (down from 66 GB bfloat16)
---
 modelopt/torch/export/model_utils.py       |  2 +-
 modelopt/torch/export/quant_utils.py       | 19 +++++++++++++++++++
 modelopt/torch/export/unified_export_hf.py | 17 ++++++++++-------
 modelopt/torch/quantization/config.py      |  6 ++++++
 4 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py
index 3bd72d9de91..aa81f8213f7 100755
--- a/modelopt/torch/export/model_utils.py
+++ b/modelopt/torch/export/model_utils.py
@@ -107,7 +107,7 @@ def is_multimodal_model(model):
     config = model.config
 
     # Check for Nemotron-Parse encoder-decoder architecture
-    architectures = getattr(config, "architectures", [])
+    architectures = getattr(config, "architectures", []) or []
     is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
 
     return (
diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 4ceb51cd2c0..51c2e8ce93d 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -665,6 +665,25 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
             f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}"
         )
 
+    # Handle _QuantFusedExperts modules (e.g. Qwen3.x MoE) which use plural
+    # ModuleList quantizers (gate_up_proj_weight_quantizers, down_proj_weight_quantizers)
+    # instead of singular weight_quantizer attributes.
+    for quantizer_list_name in ["gate_up_proj_weight_quantizers", "down_proj_weight_quantizers"]:
+        quantizer_list = getattr(module, quantizer_list_name, None)
+        if quantizer_list is not None and len(quantizer_list) > 0:
+            # Check the first quantizer in the list — all share the same config
+            q = quantizer_list[0]
+            if hasattr(q, "is_enabled") and q.is_enabled:
+                num_bits = getattr(q, "num_bits", None)
+                block_sizes = getattr(q, "block_sizes", None)
+                scale_bits = (
+                    block_sizes.get("scale_bits", (8, 0))
+                    if isinstance(block_sizes, dict) and "scale_bits" in block_sizes
+                    else (8, 0)
+                )
+                if num_bits == (2, 1) and scale_bits == (4, 3):
+                    return QUANTIZATION_NVFP4
+
     for weight_name in weight_attr_names(module):
         quantization = _get_quantization_from_layer(module, quantizer_attr_names(weight_name))
         if quantization != QUANTIZATION_NONE:
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index af936a3002a..c0145d16eaa 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -658,6 +658,16 @@ def _process_quantized_modules(
                 raise AssertionError(
                     f"Failed to export module '{name}' (type={type(sub_module).__name__}): {e}"
                 ) from e
+        elif hasattr(sub_module, "gate_up_proj_weight_quantizers"):
+            # Generic fused MoE experts (_QuantFusedExperts) with per-expert
+            # quantizer ModuleLists. Split into per-expert modules and export.
+            # NOTE: This check must come before type-name checks (e.g. Llama4,
+            # GptOss) because _QuantFusedExperts wrapping renames quantizers
+            # to plural ModuleLists (e.g. gate_up_proj_weight_quantizers).
+            from modelopt.torch.export.moe_utils import _export_fused_experts
+
+            with fsdp2_aware_weight_update(model, sub_module, reshard=False):
+                _export_fused_experts(sub_module, dtype)
         elif (
             "Llama4TextExperts" in type(sub_module).__name__
             or "GptOssExperts" in type(sub_module).__name__
@@ -677,13 +687,6 @@
             with fsdp2_aware_weight_update(model, sub_module, reshard=False):
                 for weight_name in ["gate_up_proj", "down_proj"]:
                     _export_quantized_weight(sub_module, dtype, weight_name)
-        elif hasattr(sub_module, "gate_up_proj_weight_quantizers"):
-            # Generic fused MoE experts (_QuantFusedExperts) with per-expert
-            # quantizer ModuleLists. Split into per-expert modules and export.
-            from modelopt.torch.export.moe_utils import _export_fused_experts
-
-            with fsdp2_aware_weight_update(model, sub_module, reshard=False):
-                _export_fused_experts(sub_module, dtype)
 
 
 def _export_transformers_checkpoint(
diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py
index 186ff1c7edd..850c1eeb423 100644
--- a/modelopt/torch/quantization/config.py
+++ b/modelopt/torch/quantization/config.py
@@ -578,6 +578,12 @@ def _nvfp4_selective_quant_cfg(
         quant_cfg.append(
             {"quantizer_name": f"{pattern}weight_quantizer", "cfg": copy.deepcopy(quantizer)}
         )
+        # Also match plural ModuleList quantizers used by _QuantFusedExperts
+        # (e.g. gate_up_proj_weight_quantizers.N) for fused MoE architectures.
+        for suffix in ["gate_up_proj_weight_quantizers", "down_proj_weight_quantizers"]:
+            quant_cfg.append(
+                {"quantizer_name": f"{pattern}{suffix}*", "cfg": copy.deepcopy(quantizer)}
+            )
         if not weight_only:
             quant_cfg.append(
                 {"quantizer_name": f"{pattern}input_quantizer", "cfg": copy.deepcopy(quantizer)}
             )

From 5d5c4925bc335730a332e2a1925ddf8bbb9ebb83 Mon Sep 17 00:00:00 2001
From: Lenny Potato
Date: Wed, 22 Apr 2026 14:46:54 -0400
Subject: [PATCH 02/17] fix: iterate quantizer list to find first enabled
 quantizer

CodeRabbit review: expert 0 may be disabled when uncalibrated, so checking
only quantizer_list[0] can miss the actual NVFP4 config. Now iterates to
find the first enabled quantizer in the list.
---
 modelopt/torch/export/quant_utils.py | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 51c2e8ce93d..c0f2174b24d 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -671,18 +671,20 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
     for quantizer_list_name in ["gate_up_proj_weight_quantizers", "down_proj_weight_quantizers"]:
         quantizer_list = getattr(module, quantizer_list_name, None)
         if quantizer_list is not None and len(quantizer_list) > 0:
-            # Check the first quantizer in the list — all share the same config
-            q = quantizer_list[0]
-            if hasattr(q, "is_enabled") and q.is_enabled:
-                num_bits = getattr(q, "num_bits", None)
-                block_sizes = getattr(q, "block_sizes", None)
-                scale_bits = (
-                    block_sizes.get("scale_bits", (8, 0))
-                    if isinstance(block_sizes, dict) and "scale_bits" in block_sizes
-                    else (8, 0)
-                )
-                if num_bits == (2, 1) and scale_bits == (4, 3):
-                    return QUANTIZATION_NVFP4
+            # Find the first enabled quantizer — expert 0 may be disabled if
+            # uncalibrated, so we iterate rather than checking index 0 only.
+            for q in quantizer_list:
+                if hasattr(q, "is_enabled") and q.is_enabled:
+                    num_bits = getattr(q, "num_bits", None)
+                    block_sizes = getattr(q, "block_sizes", None)
+                    scale_bits = (
+                        block_sizes.get("scale_bits", (8, 0))
+                        if isinstance(block_sizes, dict) and "scale_bits" in block_sizes
+                        else (8, 0)
+                    )
+                    if num_bits == (2, 1) and scale_bits == (4, 3):
+                        return QUANTIZATION_NVFP4
+                    break
 
     for weight_name in weight_attr_names(module):
         quantization = _get_quantization_from_layer(module, quantizer_attr_names(weight_name))

From c1e09d80972d910badd055a625da079bf54ab03e Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 00:22:31 -0400
Subject: [PATCH 03/17] =?UTF-8?q?fix:=20Qwen3.6=20MoE=20export=20=E2=80=94?=
 =?UTF-8?q?=20architecture=20mapping,=20GDN=20handling,=20isinstance=20che?=
 =?UTF-8?q?ck?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add Qwen3_5MoeForConditionalGeneration to export/import mappings
- Add Qwen3VLModel + HybridModel to GPTModelExporter isinstance check
- Handle GatedDeltaNet layers in _get_transformer_layer_state_dict
- Fix quantizer format detection for disabled quantizers
---
 modelopt/torch/export/plugins/mcore_common.py |  2 ++
 modelopt/torch/export/quant_utils.py          | 35 ++++++++++++-------
 .../torch/export/unified_export_megatron.py   | 21 +++++++++--
 3 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/modelopt/torch/export/plugins/mcore_common.py b/modelopt/torch/export/plugins/mcore_common.py
index d5bab9b4ece..ec262bcf094 100644
--- a/modelopt/torch/export/plugins/mcore_common.py
+++ b/modelopt/torch/export/plugins/mcore_common.py
@@ -52,6 +52,7 @@
     "LlamaForCausalLMEagle3Deep": eagle3_deep_llama_causal_lm_export,
     "Qwen3ForCausalLM": qwen3_causal_lm_export,
     "Qwen3MoeForCausalLM": qwen3_causal_lm_export,
+    "Qwen3_5MoeForConditionalGeneration": qwen3_causal_lm_export,
     "Qwen2ForCausalLM": qwen25_causal_lm_export,
     "GptOssForCausalLM": gptoss_causal_lm_export,
 }
@@ -64,6 +65,7 @@
     "NemotronHForCausalLM": nemotron_h_causal_lm_import,
     "Qwen3ForCausalLM": qwen3_causal_lm_import,
     "Qwen3MoeForCausalLM": qwen3_causal_lm_import,
+    "Qwen3_5MoeForConditionalGeneration": qwen3_causal_lm_import,
     "Qwen2ForCausalLM": qwen25_causal_lm_import,
     "GptOssForCausalLM": gptoss_causal_lm_import,
 }
diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index c0f2174b24d..cc55545699d 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -668,23 +668,32 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
     # Handle _QuantFusedExperts modules (e.g. Qwen3.x MoE) which use plural
     # ModuleList quantizers (gate_up_proj_weight_quantizers, down_proj_weight_quantizers)
     # instead of singular weight_quantizer attributes.
+    # The quantization format is determined at module setup time, not per-expert.
+    # Check any quantizer in the list (even disabled ones) to determine the format,
+    # since calibration may not have activated all experts.
     for quantizer_list_name in ["gate_up_proj_weight_quantizers", "down_proj_weight_quantizers"]:
         quantizer_list = getattr(module, quantizer_list_name, None)
         if quantizer_list is not None and len(quantizer_list) > 0:
-            # Find the first enabled quantizer — expert 0 may be disabled if
-            # uncalibrated, so we iterate rather than checking index 0 only.
-            for q in quantizer_list:
-                if hasattr(q, "is_enabled") and q.is_enabled:
-                    num_bits = getattr(q, "num_bits", None)
-                    block_sizes = getattr(q, "block_sizes", None)
-                    scale_bits = (
-                        block_sizes.get("scale_bits", (8, 0))
-                        if isinstance(block_sizes, dict) and "scale_bits" in block_sizes
-                        else (8, 0)
-                    )
-                    if num_bits == (2, 1) and scale_bits == (4, 3):
-                        return QUANTIZATION_NVFP4
+            # Check any quantizer — enabled or not — for format config.
+            # Prefer enabled ones first, but fall back to any if none are enabled.
+            q = None
+            for candidate in quantizer_list:
+                if hasattr(candidate, "is_enabled") and candidate.is_enabled:
+                    q = candidate
                     break
+            if q is None:
+                q = quantizer_list[0]
+
+            num_bits = getattr(q, "num_bits", None)
+            block_sizes = getattr(q, "block_sizes", None)
+            scale_bits = (
+                block_sizes.get("scale_bits", (8, 0))
+                if isinstance(block_sizes, dict) and "scale_bits" in block_sizes
+                else (8, 0)
+            )
+            if num_bits == (2, 1) and scale_bits == (4, 3):
+                return QUANTIZATION_NVFP4
+            # Add other expert quantization format checks here as needed
 
     for weight_name in weight_attr_names(module):
         quantization = _get_quantization_from_layer(module, quantizer_attr_names(weight_name))
diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 62053e549c8..d5a911afb4f 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -73,6 +73,8 @@
     from megatron.core.models.gpt import GPTModel
     from megatron.core.models.mamba import MambaModel
     from megatron.core.models.multimodal.llava_model import LLaVAModel
+    from megatron.core.models.gpt.hybrid_model import HybridModel
+    from megatron.bridge.models.qwen_vl import Qwen3VLModel
     from megatron.core.parallel_state import (
         get_pipeline_model_parallel_rank,
         get_pipeline_model_parallel_world_size,
@@ -121,7 +123,7 @@ def __init__(
         moe_router_dtype: str | None = None,
     ):
         """Create a GPTModel exporter instance."""
-        if not isinstance(model, (GPTModel, MambaModel, LLaVAModel)):
+        if not isinstance(model, (GPTModel, MambaModel, HybridModel, LLaVAModel, Qwen3VLModel)):
             raise ValueError("Input to GPTModelExport must be a megatron.core.models.GPTModel!")
 
         self._state_dict = OrderedDict()
@@ -460,8 +462,21 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
             self.rules["linear_kv_layernorm"](layer.self_attention.kv_layernorm, layer_id)
             self.rules["linear_kv_up_proj"](layer.self_attention.linear_kv_up_proj, layer_id)
             self.rules["linear_proj"](layer.self_attention.linear_proj, layer_id)
+        elif "GatedDeltaNet" in str(type(layer.self_attention)):
+            # GatedDeltaNet (linear attention) has in_proj, out_norm, out_proj
+            # instead of linear_qkv, q_layernorm, etc.
+            if "gated_delta_net_in_proj" in self.rules:
+                self.rules["gated_delta_net_in_proj"](layer.self_attention.in_proj, layer_id)
+            else:
+                self.rules["linear_qkv"](layer.self_attention.in_proj, layer_id)
+            if hasattr(layer.self_attention, "out_norm") and not isinstance(
+                layer.self_attention.out_norm, IdentityOp
+            ):
+                if "gated_delta_net_out_norm" in self.rules:
+                    self.rules["gated_delta_net_out_norm"](layer.self_attention.out_norm, layer_id)
+            self.rules["linear_proj"](layer.self_attention.out_proj, layer_id)
         else:
-            if layer.self_attention.q_layernorm is not None and not isinstance(
+            if hasattr(layer.self_attention, "q_layernorm") and layer.self_attention.q_layernorm is not None and not isinstance(
                 layer.self_attention.q_layernorm, (IdentityOp, L2Norm)
             ):
                 self.rules["q_layernorm"](layer.self_attention.q_layernorm, layer_id)
@@ -473,7 +488,7 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
         ):  # KV cache quant export
             self.rules["core_attention"](layer.self_attention.core_attention, layer_id)
         self.rules["linear_proj"](layer.self_attention.linear_proj, layer_id)
-        if getattr(layer.self_attention.core_attention, "softmax_offset", None) is not None:
+        if hasattr(layer.self_attention, "core_attention") and getattr(layer.self_attention.core_attention, "softmax_offset", None) is not None:
             self.rules["softmax_offset"](
                 layer.self_attention.core_attention.softmax_offset, layer_id
             )

From 0ddd356f3174d5b3e71cb5e7a888849d49b3fbc4 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 00:26:41 -0400
Subject: [PATCH 04/17] fix: correct HybridModel import path
 (hybrid.hybrid_model, not gpt.hybrid_model)

---
 modelopt/torch/export/unified_export_megatron.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index d5a911afb4f..2d59df8e349 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -73,7 +73,7 @@
     from megatron.core.models.gpt import GPTModel
     from megatron.core.models.mamba import MambaModel
     from megatron.core.models.multimodal.llava_model import LLaVAModel
-    from megatron.core.models.gpt.hybrid_model import HybridModel
+    from megatron.core.models.hybrid.hybrid_model import HybridModel
     from megatron.bridge.models.qwen_vl import Qwen3VLModel
     from megatron.core.parallel_state import (
         get_pipeline_model_parallel_rank,

From 9510d89822ab860353582a5ace0fb9fc9edd2973 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 00:30:14 -0400
Subject: [PATCH 05/17] fix: GDN in_proj uses dedicated rule (no QKV slicing),
 add GDN rules to qwen3 mapping

---
 modelopt/torch/export/plugins/mcore_qwen.py      | 3 +++
 modelopt/torch/export/unified_export_megatron.py | 3 +--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/modelopt/torch/export/plugins/mcore_qwen.py b/modelopt/torch/export/plugins/mcore_qwen.py
index 5c4ae0647d8..4e9b77a6c78 100644
--- a/modelopt/torch/export/plugins/mcore_qwen.py
+++ b/modelopt/torch/export/plugins/mcore_qwen.py
@@ -68,6 +68,9 @@
     "router": NameRemapping("model.layers.{}.mlp.gate."),
     "local_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.experts.{}."),
     "local_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.experts.{}.down_proj."),
+    # GatedDeltaNet (linear attention) — no QKV slicing, direct name remap
+    "gated_delta_net_in_proj": NameRemapping("model.layers.{}.linear_attn.in_proj."),
+    "gated_delta_net_out_norm": NameRemapping("model.layers.{}.linear_attn.out_norm."),
 }
 
 qwen25_causal_lm_import: dict[str, CustomModuleMapping] = {
diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 2d59df8e349..f5a5726a04c 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -465,10 +465,9 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
         elif "GatedDeltaNet" in str(type(layer.self_attention)):
             # GatedDeltaNet (linear attention) has in_proj, out_norm, out_proj
             # instead of linear_qkv, q_layernorm, etc.
+            # Use dedicated GDN rules if available (no QKV slicing), else skip.
             if "gated_delta_net_in_proj" in self.rules:
                 self.rules["gated_delta_net_in_proj"](layer.self_attention.in_proj, layer_id)
-            else:
-                self.rules["linear_qkv"](layer.self_attention.in_proj, layer_id)
             if hasattr(layer.self_attention, "out_norm") and not isinstance(
                 layer.self_attention.out_norm, IdentityOp
             ):

From 3b863b74c135c4200b8a4525adac3d8f25b33c33 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 00:32:23 -0400
Subject: [PATCH 06/17] fix: add shared_experts rules to qwen3 export mapping
 for Qwen3.6 MoE

---
 modelopt/torch/export/plugins/mcore_qwen.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modelopt/torch/export/plugins/mcore_qwen.py b/modelopt/torch/export/plugins/mcore_qwen.py
index 4e9b77a6c78..f75ee0b301d 100644
--- a/modelopt/torch/export/plugins/mcore_qwen.py
+++ b/modelopt/torch/export/plugins/mcore_qwen.py
@@ -68,6 +68,9 @@
     "router": NameRemapping("model.layers.{}.mlp.gate."),
     "local_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.experts.{}."),
     "local_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.experts.{}.down_proj."),
+    # Shared experts (Qwen3.6 MoE)
+    "shared_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.shared_experts."),
+    "shared_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.shared_experts.down_proj."),
     # GatedDeltaNet (linear attention) — no QKV slicing, direct name remap
     "gated_delta_net_in_proj": NameRemapping("model.layers.{}.linear_attn.in_proj."),
     "gated_delta_net_out_norm": NameRemapping("model.layers.{}.linear_attn.out_norm."),

From 344badbf065220812ee08be0e6e33f67580858af Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 00:34:41 -0400
Subject: [PATCH 07/17] fix: handle empty tensors in NVFP4QTensor.quantize
 (TP/EP sharding zero-slice)

---
 modelopt/torch/quantization/qtensor/nvfp4_tensor.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
index 6ff31424c77..8c2b9cfb0d7 100644
--- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
+++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
@@ -286,6 +286,14 @@ def quantize(
             input, block_size, weights_scaling_factor_2
        )
 
+        # Handle empty tensors (e.g. from TP/EP sharding where this rank has no slice)
+        if input.numel() == 0:
+            return (
+                cls(input_shape, input_dtype, input),
+                torch.zeros(*input.shape[:-1], device=input.device, dtype=torch.float8_e4m3fn),
+                torch.zeros(1, device=input.device, dtype=torch.float32),
+            )
+
         # Reshape the weight and scale factors
         original_shape = input.shape
         input = input.view((*tuple(input.shape[:-1]), -1, block_size))

From 1ca8f7a5e58f42cbeef5e2c13c674059d6bb6ca2 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 00:50:04 -0400
Subject: [PATCH 08/17] fix: GDN out_proj uses dedicated rule
 (linear_attn.out_proj not self_attn.o_proj)

---
 modelopt/torch/export/plugins/mcore_qwen.py      | 1 +
 modelopt/torch/export/unified_export_megatron.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/modelopt/torch/export/plugins/mcore_qwen.py b/modelopt/torch/export/plugins/mcore_qwen.py
index f75ee0b301d..59c402f6bcf 100644
--- a/modelopt/torch/export/plugins/mcore_qwen.py
+++ b/modelopt/torch/export/plugins/mcore_qwen.py
@@ -74,6 +74,7 @@
     # GatedDeltaNet (linear attention) — no QKV slicing, direct name remap
     "gated_delta_net_in_proj": NameRemapping("model.layers.{}.linear_attn.in_proj."),
     "gated_delta_net_out_norm": NameRemapping("model.layers.{}.linear_attn.out_norm."),
+    "gated_delta_net_out_proj": NameRemapping("model.layers.{}.linear_attn.out_proj."),
 }
 
 qwen25_causal_lm_import: dict[str, CustomModuleMapping] = {
diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index f5a5726a04c..590fe989b6c 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -473,7 +473,10 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
             ):
                 if "gated_delta_net_out_norm" in self.rules:
                     self.rules["gated_delta_net_out_norm"](layer.self_attention.out_norm, layer_id)
-            self.rules["linear_proj"](layer.self_attention.out_proj, layer_id)
+            if "gated_delta_net_out_proj" in self.rules:
+                self.rules["gated_delta_net_out_proj"](layer.self_attention.out_proj, layer_id)
+            else:
+                self.rules["linear_proj"](layer.self_attention.out_proj, layer_id)
         else:
             if hasattr(layer.self_attention, "q_layernorm") and layer.self_attention.q_layernorm is not None and not isinstance(
                 layer.self_attention.q_layernorm, (IdentityOp, L2Norm)

From 4992fb071f533f6afad9451a0b29aebbe0030240 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 01:02:19 -0400
Subject: [PATCH 09/17] fix: add EP rank offset to expert_id during export
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With EP=2, local_experts are indexed 0..127 per rank but global IDs must
account for EP rank. rank 0 → 0-127, rank 1 → 128-255.
---
 modelopt/torch/export/unified_export_megatron.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 590fe989b6c..3a3a70a159e 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -520,8 +520,15 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
                     layer.mlp.shared_experts.linear_fc2, layer_id
                 )
             if hasattr(layer.mlp.experts, "local_experts"):
+                # With expert parallelism, local_experts are indexed 0..N-1 per rank,
+                # but the global expert ID needs the EP rank offset.
+                from megatron.core.parallel_state import get_expert_model_parallel_rank, get_expert_model_parallel_world_size
+                ep_rank = get_expert_model_parallel_rank()
+                ep_size = get_expert_model_parallel_world_size()
+                num_local = len(layer.mlp.experts.local_experts)
                 if not self.rules.get("use_packed_local_experts", False):
-                    for expert_id, expert in enumerate(layer.mlp.experts.local_experts):
+                    for local_id, expert in enumerate(layer.mlp.experts.local_experts):
+                        expert_id = ep_rank * num_local + local_id
                         self.rules["local_experts.linear_fc1"](
                             expert.linear_fc1, layer_id, expert_id
                         )

From e254f81088cb7f5c043ab0c4b5e0c7525eac3b20 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 01:23:29 -0400
Subject: [PATCH 10/17] debug: add layer timing logs to export _get_state_dict

---
 modelopt/torch/export/unified_export_megatron.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 3a3a70a159e..6aa61c311b0 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -395,6 +395,8 @@ def extra_state_dict(self):
 
     def _get_state_dict(self):
         model = self.model
+        import time as _time
+        _start = _time.time()
 
         # Embedding
         if hasattr(model, "embedding"):

From 5cfadba1b29618c1e6da9d603e317b1dab8c351c Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 01:56:43 -0400
Subject: [PATCH 11/17] debug: add expert iteration logging to export

---
 modelopt/torch/export/unified_export_megatron.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 6aa61c311b0..a51589293a6 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -528,9 +528,11 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
                 ep_rank = get_expert_model_parallel_rank()
                 ep_size = get_expert_model_parallel_world_size()
                 num_local = len(layer.mlp.experts.local_experts)
+                print(f"[export] layer {layer_id}: {num_local} local_experts, ep_rank={ep_rank}, ep_size={ep_size}", flush=True)
                 if not self.rules.get("use_packed_local_experts", False):
                     for local_id, expert in enumerate(layer.mlp.experts.local_experts):
                         expert_id = ep_rank * num_local + local_id
+                        print(f"[export] expert {local_id} -> global {expert_id}, linear_fc1={type(expert.linear_fc1).__name__}", flush=True)
                         self.rules["local_experts.linear_fc1"](
                             expert.linear_fc1, layer_id, expert_id
                         )

From 83c067602088fc4089fc02a27f0f3b4c9caf3d09 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 02:09:48 -0400
Subject: [PATCH 12/17] debug: more trace prints around export state dict
 building

---
 modelopt/torch/export/unified_export_megatron.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index a51589293a6..dcf64189e38 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -271,7 +271,9 @@ def save_pretrained(
         is_last_stage_main_rank = pp_rank == pp_size - 1 and tp_rank == 0
 
         # Main export process
+        print("[export] About to build layer_state_dicts...", flush=True)
         layer_state_dicts = self.layer_state_dicts
+        print(f"[export] Built {len(layer_state_dicts)} layer state dicts", flush=True)
         quantization_format = self._get_quantization_format(self.model)
         quantization = None
@@ -394,6 +396,7 @@ def extra_state_dict(self):
         return self._state_dict
 
     def _get_state_dict(self):
+        print("[export] _get_state_dict called", flush=True)
         model = self.model
         import time as _time
         _start = _time.time()
@@ -403,6 +406,7 @@ def _get_state_dict(self):
         self.rules["word_embeddings"](model.embedding.word_embeddings)
 
         # Decoder layers
+        print(f"[export] Iterating {len(model.decoder.layers)} decoder layers", flush=True)
         for layer in model.decoder.layers:
             layer_id = layer.layer_number - 1
             if isinstance(layer, MambaLayer):

From f1e294497a8b7ce7067aeb2ee7a7de2c07a0b88d Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 02:17:28 -0400
Subject: [PATCH 13/17] debug: clean MLP diagnostic inside
 _get_transformer_layer_state_dict

---
 modelopt/torch/export/unified_export_megatron.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index dcf64189e38..4c461d2278f 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -442,6 +442,19 @@ def _get_fused_norm_weight(self, module):
         return getattr(module, "layer_norm_weight", None)
 
     def _get_transformer_layer_state_dict(self, layer, layer_id):
+        if layer_id == 0:
+            print(f"[diag] layer.mlp type: {type(layer.mlp).__name__}", flush=True)
+            print(f"[diag] mlp attrs: {[a for a in dir(layer.mlp) if not a.startswith('_')][:25]}", flush=True)
+            print(f"[diag] hasattr mlp.experts: {hasattr(layer.mlp, 'experts')}", flush=True)
+            if hasattr(layer.mlp, 'experts'):
+                print(f"[diag] experts type: {type(layer.mlp.experts).__name__}", flush=True)
+                print(f"[diag] hasattr local_experts: {hasattr(layer.mlp.experts, 'local_experts')}", flush=True)
+                if hasattr(layer.mlp.experts, 'local_experts'):
+                    print(f"[diag] num local_experts: {len(layer.mlp.experts.local_experts)}", flush=True)
+            print(f"[diag] hasattr shared_experts: {hasattr(layer.mlp, 'shared_experts')}", flush=True)
+            if hasattr(layer.mlp, 'config'):
+                print(f"[diag] mlp.config.num_experts: {getattr(layer.mlp.config, 'num_experts', 'N/A')}", flush=True)
+
         if not isinstance(layer.input_layernorm, IdentityOp):
             self.rules["input_layernorm"](layer.input_layernorm, layer_id)
         elif (

From f693f35210ae1cda236cb7a3442ed8541b24dff6 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 10:28:38 -0400
Subject: [PATCH 14/17] fix: add GroupedMLPSlicing for TEGroupedMLP export +
 bypass broken lambda dispatch

---
 modelopt/torch/export/plugins/mcore_qwen.py      |  4 ++++
 modelopt/torch/export/unified_export_megatron.py | 15 +++++++++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/modelopt/torch/export/plugins/mcore_qwen.py b/modelopt/torch/export/plugins/mcore_qwen.py
index 59c402f6bcf..c26275e7517 100644
--- a/modelopt/torch/export/plugins/mcore_qwen.py
+++ b/modelopt/torch/export/plugins/mcore_qwen.py
@@ -24,6 +24,7 @@
     CustomModuleMapping,
     GatedMLPMerging,
     GatedMLPSlicing,
+    GroupedMLPSlicing,
     NameRemapping,
     QKVMerging,
     QKVSlicing,
@@ -68,6 +69,9 @@
     "router": NameRemapping("model.layers.{}.mlp.gate."),
     "local_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.experts.{}."),
     "local_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.experts.{}.down_proj."),
+    # Grouped experts (TEGroupedMLP: fused per-expert weights via grouped GEMM)
+    "experts.linear_fc1": GroupedMLPSlicing("model.layers.{}.mlp.experts.{}.up_proj"),
+    "experts.linear_fc2": GroupedMLPSlicing("model.layers.{}.mlp.experts.{}.down_proj"),
     # Shared experts (Qwen3.6 MoE)
     "shared_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.shared_experts."),
     "shared_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.shared_experts.down_proj."),
diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 4c461d2278f..ce709dec67b 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -567,10 +567,17 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
             elif "experts.linear_fc1" in self.rules:
                 # TEGroupedMLP: experts use fused grouped GEMM with a single
                 # linear_fc1/linear_fc2 for all experts (no local_experts attribute).
-                # Uses "experts.linear_fc1" rule (GroupedMLPMerging) instead of
-                # "local_experts.linear_fc1" which expects per-expert iteration.
-                self.rules["experts.linear_fc1"](layer.mlp.experts.linear_fc1, layer_id)
-                self.rules["experts.linear_fc2"](layer.mlp.experts.linear_fc2, layer_id)
+                # Call _grouped_mlp_slicing directly because the lambda-based dispatch
+                # cannot handle two-placeholder prefixes (layer_id + expert_id).
+                raw_mappings = all_mcore_hf_export_mapping[self.arch]
+                fc1_prefix = raw_mappings["experts.linear_fc1"].target_name_or_prefix
+                fc2_prefix = raw_mappings["experts.linear_fc2"].target_name_or_prefix
+                self._grouped_mlp_slicing(
+                    layer.mlp.experts.linear_fc1, fc1_prefix.format(layer_id)
+                )
+                self._grouped_mlp_slicing(
+                    layer.mlp.experts.linear_fc2, fc2_prefix.format(layer_id)
+                )
             else:
                 self.rules["linear_fc1"](layer.mlp.linear_fc1, layer_id)
                 self.rules["linear_fc2"](layer.mlp.linear_fc2, layer_id)

From 8c9bf6e39bbea23c6f7bd154426d9da30ed477f0 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 12:26:14 -0400
Subject: [PATCH 15/17] fix: EP-aware MoE export with per-rank write and merge

Three bugs fixed for multi-rank EP MoE export (Qwen3.6-35B-A3B), plus a new
save strategy:

1. Format string bug: fc1/fc2 prefix has two {} placeholders (layer_id,
   expert_id). Using .format(layer_id) fails. Fixed with re.sub to fill
   only first {}.

2. Expert offset bug: _grouped_mlp_slicing had no EP rank awareness.
   Both ranks wrote experts 0-127 with overlapping keys. Added
   expert_offset param from get_expert_model_parallel_rank() *
   num_local_experts.

3. weight_key bug: used global expert_id for module lookup instead of
   local_expert_id. Module has weight0..weight127, not weight128..weight255.

4. Save strategy: all_gather_object causes OOM (pickle overhead on ~40k
   tensors). Each rank now writes to separate NFS dir, then rank 0 merges
   safetensors shard-by-shard with low memory footprint.
---
 .../torch/export/unified_export_megatron.py | 77 +++++++++++++++++--
 1 file changed, 69 insertions(+), 8 deletions(-)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index ce709dec67b..3b997932010 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -1,3 +1,4 @@
+import re
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -367,14 +368,63 @@ def save_pretrained(
             with open(config_json_file, "w") as f:
                 json.dump(config_dict, f, indent=4)
 
-        # save_safetensors(state_dict, save_directory)
+        # Each EP rank writes to its own subdirectory to avoid OOM from gathering
+        if torch.distributed.is_initialized():
+            world_size = torch.distributed.get_world_size()
+            rank = torch.distributed.get_rank()
+        else:
+            world_size = 1
+            rank = 0
+
+        rank_save_dir = save_directory + "_rank" + str(rank)
+        os.makedirs(rank_save_dir, exist_ok=True)
+
+        # Each rank writes its own layer shards
         save_safetensors_by_layer_index(
             layer_state_dicts=layer_state_dicts,
             total_layers=self.model.config.num_layers,
-            save_directory=save_directory,
+            save_directory=rank_save_dir,
             name_template="model-{:05d}-of-{:05d}",
         )
 
+        if torch.distributed.is_initialized():
+            torch.distributed.barrier()
+
+        # Rank 0 merges per-shard safetensors from all rank dirs
+        if rank == 0 and world_size > 1:
+            print("[export] Merging shard files from all ranks...", flush=True)
+            from safetensors import safe_open as _safe_open
+            from safetensors.torch import save_file as _save_file
+            for layer_idx in range(self.model.config.num_layers):
+                shard_name = "model-{:05d}-of-{:05d}".format(layer_idx + 1, self.model.config.num_layers)
+                ckpt_name = shard_name + ".safetensors"
+                meta_name = shard_name + ".json"
+                merged_dict = {}
+                for r in range(world_size):
+                    rdir = save_directory + "_rank" + str(r)
+                    fpath = os.path.join(rdir, ckpt_name)
+                    if os.path.exists(fpath):
+                        with _safe_open(fpath, framework="pt") as f:
+                            for k in f.keys():
+                                merged_dict[k] = f.get_tensor(k)
+                # Write merged shard
+                os.makedirs(save_directory, exist_ok=True)
+                _save_file(merged_dict, os.path.join(save_directory, ckpt_name), metadata={"format": "pt"})
+                # Build metadata
+                weight_map = {}
+                total_size = 0
+                for k, v in merged_dict.items():
+                    weight_map[k] = ckpt_name
+                    total_size += v.numel() * v.element_size()
+                with open(os.path.join(save_directory, meta_name), "w") as f:
+                    json.dump({"metadata": {"total_size": total_size}, "weight_map": weight_map}, f, indent=4)
+            print(f"[export] Merged {len(merged_dict)} keys per layer across {world_size} ranks", flush=True)
+        elif rank == 0:
+            # Single rank, just rename dir
+            import shutil
+            if os.path.exists(save_directory + "_rank0"):
+                shutil.move(save_directory + "_rank0", save_directory)
+
     @property
     def state_dict(self):
         """Return the real quantized state_dict of the base model."""
@@ -572,11 +622,20 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
                 raw_mappings = all_mcore_hf_export_mapping[self.arch]
                 fc1_prefix = raw_mappings["experts.linear_fc1"].target_name_or_prefix
                 fc2_prefix = raw_mappings["experts.linear_fc2"].target_name_or_prefix
+                # Fill only the first {} (layer_id), leave second {} for expert_id in _grouped_mlp_slicing
+                fc1_prefix_partial = re.sub(r'\{\}', str(layer_id), fc1_prefix, count=1)
+                fc2_prefix_partial = re.sub(r'\{\}', str(layer_id), fc2_prefix, count=1)
+                # With EP>1, each rank only has a subset of experts. Offset the expert IDs
+                # by ep_rank * num_local_experts so all ranks write to non-overlapping keys.
+                from megatron.core.parallel_state import get_expert_model_parallel_rank
+                ep_rank = get_expert_model_parallel_rank()
+                expert_offset = ep_rank * layer.mlp.experts.linear_fc1.num_gemms
+                print(f"[export] layer {layer_id}: TEGroupedMLP, ep_rank={ep_rank}, expert_offset={expert_offset}", flush=True)
                 self._grouped_mlp_slicing(
-                    layer.mlp.experts.linear_fc1, fc1_prefix.format(layer_id)
+                    layer.mlp.experts.linear_fc1, fc1_prefix_partial, expert_offset=expert_offset
                 )
                 self._grouped_mlp_slicing(
-                    layer.mlp.experts.linear_fc2, fc2_prefix.format(layer_id)
+                    layer.mlp.experts.linear_fc2, fc2_prefix_partial, expert_offset=expert_offset
                 )
             else:
                 self.rules["linear_fc1"](layer.mlp.linear_fc1, layer_id)
                 self.rules["linear_fc2"](layer.mlp.linear_fc2, layer_id)
@@ -1003,7 +1062,7 @@ def _gated_mlp_slicing(
             self._state_dict[gate_proj_key] = val.detach().clone()
             self._state_dict[up_proj_key] = val.detach().clone()
 
-    def _grouped_mlp_slicing(self, module, prefix, parallel_config=None):
+    def _grouped_mlp_slicing(self, module, prefix, parallel_config=None, expert_offset=0):
         """Export TEGroupedMLP weights by splitting per-expert weights into individual HF weights.
 
         TEGroupedMLP (via TEGroupedLinear) stores weights as weight0, weight1, ..., weight{N-1}
@@ -1033,9 +1092,10 @@ def _grouped_mlp_slicing(self, module, prefix, parallel_config=None):
         state_dict = module.state_dict()
 
-        for expert_id in range(num_experts):
+        for local_expert_id in range(num_experts):
+            expert_id = expert_offset + local_expert_id
             expert_prefix = prefix.format(expert_id) + "."
-            weight_key = f"weight{expert_id}"
+            weight_key = f"weight{local_expert_id}"
 
             if weight_key not in state_dict:
                 raise ValueError(f"Missing expected TEGroupedMLP expert weight: {weight_key}")
@@ -1060,7 +1120,8 @@ def _grouped_mlp_slicing(self, module, prefix, parallel_config=None):
         for key, val in name_to_value.items():
             if key == "output_scale":
                 continue
-            for expert_id in range(num_experts):
+            for local_expert_id in range(num_experts):
+                expert_id = expert_offset + local_expert_id
                 expert_prefix = prefix.format(expert_id) + "."
                 self._state_dict[expert_prefix + key] = val.detach().clone()

From d37ee10c68170fe18883a672c6ef30bdc58d4760 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 16:12:24 -0400
Subject: [PATCH 16/17] fix: GroupedGatedMLPSlicing class + shared tensor
 clone + qwen export dispatch

- Add GroupedGatedMLPSlicing class to mcore_custom.py for TEGroupedMLP gate/up split
- Add _grouped_gated_mlp_slicing method to GPTModelExporter
- Clone shared-storage tensors before safetensors save (NVFP4 weight_scale broadcast)
- Dispatch fc1 slicing based on mapping func_name for correct expert handling
---
 modelopt/torch/export/plugins/mcore_custom.py | 20 +++++
 modelopt/torch/export/plugins/mcore_qwen.py   |  3 +-
 .../torch/export/unified_export_megatron.py  | 87 ++++++++++++++++++-
 3 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/modelopt/torch/export/plugins/mcore_custom.py b/modelopt/torch/export/plugins/mcore_custom.py
index 204ff012c71..8ba4c5ea81c 100644
--- a/modelopt/torch/export/plugins/mcore_custom.py
+++ b/modelopt/torch/export/plugins/mcore_custom.py
@@ -175,6 +175,17 @@ def __init__(self, target_name_or_prefix: str = "", func_kwargs: dict[str, Any] = {}):
         )
 
 
+class GroupedGatedMLPSlicing(CustomModuleMapping):
+    """A custom module mapping for TEGroupedMLP that splits fused gate_up into gate_proj + up_proj per expert."""
+
+    def __init__(self, target_name_or_prefix: str = "", func_kwargs: dict[str, Any] = {}):
+        """Create a custom module mapping for grouped gated MLP slicing."""
+        super().__init__(
+            func_name="grouped_gated_mlp_slicing",
+            target_name_or_prefix=target_name_or_prefix,
+            func_kwargs=func_kwargs,
+        )
+
+
 class PackNameRemapping(CustomModuleMapping):
     """A custom module mapping that packs module after name remapping."""
 
@@ -318,6 +329,15 @@ def save_safetensors_by_layer_index(
                 f,
                 indent=4,
             )
+    # Clone tensors that share storage (NVFP4 weight_scale broadcast causes this)
+    seen_storages = {}
+    for _key, _val in layer_state_dict.items():
+        _sid = id(_val.storage())
+        if _sid in seen_storages:
+            layer_state_dict[_key] = _val.clone()
+        else:
+            seen_storages[_sid] = _key
+
     save_file(layer_state_dict, save_directory + "/" + ckpt_filename, metadata={"format": "pt"})
 
     # [TODO]: this global barrier needs to be replaced with something safer
diff --git a/modelopt/torch/export/plugins/mcore_qwen.py b/modelopt/torch/export/plugins/mcore_qwen.py
index c26275e7517..1c8b0d414ae 100644
--- a/modelopt/torch/export/plugins/mcore_qwen.py
+++ b/modelopt/torch/export/plugins/mcore_qwen.py
@@ -25,6 +25,7 @@
     GatedMLPMerging,
     GatedMLPSlicing,
     GroupedMLPSlicing,
+    GroupedGatedMLPSlicing,
     NameRemapping,
     QKVMerging,
     QKVSlicing,
@@ -70,7 +71,7 @@
     "local_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.experts.{}."),
     "local_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.experts.{}.down_proj."),
     # Grouped experts (TEGroupedMLP: fused per-expert weights via grouped GEMM)
-    "experts.linear_fc1": GroupedMLPSlicing("model.layers.{}.mlp.experts.{}.up_proj"),
+    "experts.linear_fc1": GroupedGatedMLPSlicing("model.layers.{}.mlp.experts.{}"),
     "experts.linear_fc2": GroupedMLPSlicing("model.layers.{}.mlp.experts.{}.down_proj"),
     # Shared experts (Qwen3.6 MoE)
     "shared_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.shared_experts."),
diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index 3b997932010..c00d56d54ca 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -631,7 +631,11 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
                 ep_rank = get_expert_model_parallel_rank()
                 expert_offset = ep_rank * layer.mlp.experts.linear_fc1.num_gemms
                 print(f"[export] layer {layer_id}: TEGroupedMLP, ep_rank={ep_rank}, expert_offset={expert_offset}", flush=True)
-                self._grouped_mlp_slicing(
+                # Dispatch based on the mapping func_name — grouped_gated_mlp_slicing
+                # splits fused gate_up into gate_proj + up_proj per expert.
+                fc1_func_name = raw_mappings["experts.linear_fc1"].func_name
+                fc1_method = getattr(self, f"_{fc1_func_name}")
+                fc1_method(
                     layer.mlp.experts.linear_fc1, fc1_prefix_partial, expert_offset=expert_offset
                 )
                 self._grouped_mlp_slicing(
@@ -835,6 +839,7 @@ def _custom_mapping_to_lambda(mapping):
         "self_attention_scaling": self._self_attention_scaling,
         "gated_mlp_slicing": self._gated_mlp_slicing,
         "grouped_mlp_slicing": self._grouped_mlp_slicing,
+        "grouped_gated_mlp_slicing": self._grouped_gated_mlp_slicing,
         "pack_name_remapping": self._pack_name_remapping,
         "pack_name_remapping_gpt_oss": self._pack_name_remapping_gpt_oss,
     }
@@ -1125,6 +1130,86 @@ def _grouped_mlp_slicing(self, module, prefix, parallel_config=None, expert_offs
                 expert_prefix = prefix.format(expert_id) + "."
                 self._state_dict[expert_prefix + key] = val.detach().clone()
 
+    def _grouped_gated_mlp_slicing(self, module, prefix, parallel_config=None, expert_offset=0):
+        """Export TEGroupedMLP fused gate_up weights, splitting into per-expert gate_proj + up_proj.
+
+        Like _grouped_mlp_slicing but handles the fused gate_up (linear_fc1) case:
+        each expert weight is [2*ffn_hidden_size, hidden_size], split into
+        gate_proj [ffn_hidden_size, hidden_size] and up_proj [ffn_hidden_size, hidden_size].
+
+        Produces per-expert gate_proj and up_proj that vLLM expects for MoE models
+        with packed_modules_mapping = {"gate_up_proj": ["gate_proj", "up_proj"]}.
+        """
+        num_experts = module.num_gemms
+
+        has_weight = hasattr(module, "weight")
+        if not has_weight:
+            module.weight = module.weight0
+        try:
+            name_to_value, qformat, block_size = self._get_quantized_state(
+                module, self.dtype, prefix=prefix
+            )
+            weight_scale, weight_scale_2 = self._get_weight_scales(name_to_value, qformat)
+            name_to_value.pop("weight", None)
+        finally:
+            if not has_weight and hasattr(module, "weight"):
+                delattr(module, "weight")
+
+        state_dict = module.state_dict()
+        ffn_hidden_size = module.config.ffn_hidden_size
+        # For gated linear unit, ffn_hidden_size is already doubled (2 * moe_intermediate_size).
+        # We need the un-doubled per-projection size for the gate/up split.
+        gated_split = ffn_hidden_size // 2
+
+        for local_expert_id in range(num_experts):
+            expert_id = expert_offset + local_expert_id
+            expert_prefix = prefix.format(expert_id) + "."
+            weight_key = f"weight{local_expert_id}"
+
+            if weight_key not in state_dict:
+                raise ValueError(f"Missing expected TEGroupedMLP expert weight: {weight_key}")
+
+            weight = state_dict[weight_key].to(self.dtype).cpu()
+            gate_weight = weight[:gated_split, :]
+            up_weight = weight[gated_split:, :]
+
+            gate_prefix = expert_prefix + "gate_proj."
+            up_prefix = expert_prefix + "up_proj."
+
+            if weight_scale is None:
+                self._state_dict[gate_prefix + "weight"] = gate_weight
+                self._state_dict[up_prefix + "weight"] = up_weight
+            else:
+                if len(weight_scale.shape) == 0:
+                    gate_weight_scale = weight_scale.detach().clone()
+                    up_weight_scale = weight_scale.detach().clone()
+                else:
+                    gate_weight_scale = weight_scale[:gated_split]
+                    up_weight_scale = weight_scale[gated_split:]
+
+                self._state_dict[gate_prefix + "weight"] = to_quantized_weight(
+                    gate_weight, gate_weight_scale, qformat, weight_scale_2, block_size,
+                )
+                self._state_dict[up_prefix + "weight"] = to_quantized_weight(
+                    up_weight, up_weight_scale, qformat, weight_scale_2, block_size,
+                )
+                self._state_dict[gate_prefix + "weight_scale"] = gate_weight_scale
+                self._state_dict[up_prefix + "weight_scale"] = up_weight_scale
+
+            if weight_scale_2 is not None:
+                self._state_dict[gate_prefix + "weight_scale_2"] = weight_scale_2.detach().clone()
+                self._state_dict[up_prefix + "weight_scale_2"] = weight_scale_2.detach().clone()
+
+        for key, val in name_to_value.items():
+            if key == "output_scale":
+                continue
+            for local_expert_id in range(num_experts):
+                expert_id = expert_offset + local_expert_id
+                expert_prefix = prefix.format(expert_id) + "."
+                self._state_dict[expert_prefix + "gate_proj." + key] = val.detach().clone()
+                self._state_dict[expert_prefix + "up_proj." + key] = val.detach().clone()
+
     def _qkv_slicing(
         self,
         module,

From 26bec7d873a1436a3ce5d62be6896f1e39ce7762 Mon Sep 17 00:00:00 2001
From: Lenny
Date: Wed, 29 Apr 2026 17:32:36 -0400
Subject: [PATCH 17/17] fix: derive gated_split from actual weight shape
 instead of config.ffn_hidden_size

Both GatedMLPSlicing and GroupedGatedMLPSlicing used module.config.ffn_hidden_size
to determine the gate/up split point. For MoE models, ffn_hidden_size is often set
to hidden_size (2048) rather than the per-expert intermediate size (512), causing
gate_proj to receive the full fused weight and up_proj to be empty [0, N].

Now derives gated_split from the actual weight tensor shape (rows // 2).
---
 .../torch/export/unified_export_megatron.py | 24 ++++++++++++------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index c00d56d54ca..c643b7fe969 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -1020,9 +1020,12 @@ def _gated_mlp_slicing(
         gate_proj_prefix = prefix + gate_proj_name + "."
         up_proj_prefix = prefix + up_proj_name + "."
 
-        ffn_hidden_size = module.config.ffn_hidden_size
-        gate_proj_weight = weight[:ffn_hidden_size, :]
-        up_proj_weight = weight[ffn_hidden_size:, :]
+        # Derive split point from actual weight shape instead of config.ffn_hidden_size.
+        # For MoE models, ffn_hidden_size may not match the per-expert intermediate size.
+        gated_split = weight.shape[0] // 2
+        print(f"[PATCH] GatedMLPSlicing: actual_rows={weight.shape[0]}, gated_split={gated_split}, config.ffn_hidden_size={module.config.ffn_hidden_size}")
+        gate_proj_weight = weight[:gated_split, :]
+        up_proj_weight = weight[gated_split:, :]
 
         if weight_scale is None:
             self._state_dict[gate_proj_prefix + "weight"] = gate_proj_weight
@@ -1032,8 +1035,8 @@ def _gated_mlp_slicing(
                 gate_proj_weight_scale = weight_scale.detach().clone()
                 up_proj_weight_scale = weight_scale.detach().clone()
             else:
-                gate_proj_weight_scale = weight_scale[:ffn_hidden_size]
-                up_proj_weight_scale = weight_scale[ffn_hidden_size:]
+                gate_proj_weight_scale = weight_scale[:gated_split]
+                up_proj_weight_scale = weight_scale[gated_split:]
             self._state_dict[gate_proj_prefix + "weight"] = to_quantized_weight(
                 gate_proj_weight,
                 gate_proj_weight_scale,
@@ -1157,10 +1160,13 @@ def _grouped_gated_mlp_slicing(self, module, prefix, parallel_config=None, exper
                 delattr(module, "weight")
 
         state_dict = module.state_dict()
-        ffn_hidden_size = module.config.ffn_hidden_size
-        # For gated linear unit, ffn_hidden_size is already doubled (2 * moe_intermediate_size).
-        # We need the un-doubled per-projection size for the gate/up split.
-        gated_split = ffn_hidden_size // 2
+        # Derive gated_split from actual weight shape instead of config.ffn_hidden_size.
+        # For MoE models, ffn_hidden_size may not reflect the per-expert intermediate size.
+        # The fused gate_up weight is [2 * intermediate_size, hidden_size], so split at midpoint.
+        first_weight_key = f"weight0"
+        actual_rows = state_dict[first_weight_key].shape[0]
+        gated_split = actual_rows // 2
+        print(f"[PATCH] GroupedGatedMLPSlicing: actual_rows={actual_rows}, gated_split={gated_split}, config.ffn_hidden_size={module.config.ffn_hidden_size}")
 
         for local_expert_id in range(num_experts):
             expert_id = expert_offset + local_expert_id
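
NOTE (illustrative sketch, not part of the patches above): the midpoint gate/up
split that PATCH 17 derives from the weight shape can be reproduced standalone
roughly as follows; the function and tensor names here are hypothetical.

    import torch

    def split_gate_up(fused_weight: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Split a fused gate_up projection [2 * intermediate, hidden] at its row midpoint."""
        rows = fused_weight.shape[0]
        assert rows % 2 == 0, "fused gate_up weight must have an even number of rows"
        gated_split = rows // 2  # derived from the tensor itself, not from config.ffn_hidden_size
        return fused_weight[:gated_split, :], fused_weight[gated_split:, :]

    # Hypothetical per-expert fused weight: intermediate size 512, hidden size 2048.
    gate, up = split_gate_up(torch.randn(2 * 512, 2048))
    assert gate.shape == (512, 2048) and up.shape == (512, 2048)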