15 commits
1b1fced
Fix NVFP4 quantization for Qwen3.x MoE models
lennytinkeredapps Apr 22, 2026
5d5c492
fix: iterate quantizer list to find first enabled quantizer
lennytinkeredapps Apr 22, 2026
c1e09d8
fix: Qwen3.6 MoE export — architecture mapping, GDN handling, isinsta…
lennytinkeredapps Apr 29, 2026
0ddd356
fix: correct HybridModel import path (hybrid.hybrid_model, not gpt.hy…
lennytinkeredapps Apr 29, 2026
9510d89
fix: GDN in_proj uses dedicated rule (no QKV slicing), add GDN rules …
lennytinkeredapps Apr 29, 2026
3b863b7
fix: add shared_experts rules to qwen3 export mapping for Qwen3.6 MoE
lennytinkeredapps Apr 29, 2026
344badb
fix: handle empty tensors in NVFP4QTensor.quantize (TP/EP sharding ze…
lennytinkeredapps Apr 29, 2026
1ca8f7a
fix: GDN out_proj uses dedicated rule (linear_attn.out_proj not self_…
lennytinkeredapps Apr 29, 2026
4992fb0
fix: add EP rank offset to expert_id during export
lennytinkeredapps Apr 29, 2026
e254f81
debug: add layer timing logs to export _get_state_dict
lennytinkeredapps Apr 29, 2026
5cfadba
debug: add expert iteration logging to export
lennytinkeredapps Apr 29, 2026
83c0676
debug: more trace prints around export state dict building
lennytinkeredapps Apr 29, 2026
f1e2944
debug: clean MLP diagnostic inside _get_transformer_layer_state_dict
lennytinkeredapps Apr 29, 2026
f693f35
fix: add GroupedMLPSlicing for TEGroupedMLP export + bypass broken la…
lennytinkeredapps Apr 29, 2026
8c9bf6e
fix: EP-aware MoE export with per-rank write and merge
lennytinkeredapps Apr 29, 2026
2 changes: 1 addition & 1 deletion modelopt/torch/export/model_utils.py
@@ -107,7 +107,7 @@ def is_multimodal_model(model):
config = model.config

# Check for Nemotron-Parse encoder-decoder architecture
- architectures = getattr(config, "architectures", [])
+ architectures = getattr(config, "architectures", []) or []
is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)

return (
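For context on the one-line change above: `getattr(config, "architectures", [])` only falls back to `[]` when the attribute is missing, not when it is present but set to `None`, and a `None` value would make the `any(...)` check in `is_multimodal_model` raise a `TypeError`. A minimal sketch of the three cases, using stand-in config objects rather than real transformers configs:

```python
from types import SimpleNamespace

# Stand-in configs (illustrative only, not real transformers config objects).
cfg_missing = SimpleNamespace()                                 # attribute absent
cfg_none = SimpleNamespace(architectures=None)                  # present but None
cfg_set = SimpleNamespace(architectures=["Qwen3_5MoeForConditionalGeneration"])

for cfg in (cfg_missing, cfg_none, cfg_set):
    # Without the trailing "or []", cfg_none leaves architectures as None and the
    # any(...) below raises TypeError: 'NoneType' object is not iterable.
    architectures = getattr(cfg, "architectures", []) or []
    print(any("nemotronparse" in arch.lower() for arch in architectures))
```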
2 changes: 2 additions & 0 deletions modelopt/torch/export/plugins/mcore_common.py
@@ -52,6 +52,7 @@
"LlamaForCausalLMEagle3Deep": eagle3_deep_llama_causal_lm_export,
"Qwen3ForCausalLM": qwen3_causal_lm_export,
"Qwen3MoeForCausalLM": qwen3_causal_lm_export,
"Qwen3_5MoeForConditionalGeneration": qwen3_causal_lm_export,
"Qwen2ForCausalLM": qwen25_causal_lm_export,
"GptOssForCausalLM": gptoss_causal_lm_export,
}
@@ -64,6 +65,7 @@
"NemotronHForCausalLM": nemotron_h_causal_lm_import,
"Qwen3ForCausalLM": qwen3_causal_lm_import,
"Qwen3MoeForCausalLM": qwen3_causal_lm_import,
"Qwen3_5MoeForConditionalGeneration": qwen3_causal_lm_import,
"Qwen2ForCausalLM": qwen25_causal_lm_import,
"GptOssForCausalLM": gptoss_causal_lm_import,
}
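Both tables are keyed by the HF architecture string, so the two new entries route `Qwen3_5MoeForConditionalGeneration` checkpoints through the existing Qwen3 rules. A rough sketch of the kind of lookup involved; the dict and variable names are illustrative, not the actual modelopt call site:

```python
# Illustrative lookup: route an HF architecture string to an export rule set.
qwen3_causal_lm_export = {}  # placeholder for the real Qwen3 rule dict

export_mapping = {
    "Qwen3ForCausalLM": qwen3_causal_lm_export,
    "Qwen3MoeForCausalLM": qwen3_causal_lm_export,
    "Qwen3_5MoeForConditionalGeneration": qwen3_causal_lm_export,  # new entry
}

architecture = "Qwen3_5MoeForConditionalGeneration"  # e.g. config.architectures[0]
rules = export_mapping.get(architecture)
if rules is None:
    raise KeyError(f"No export mapping registered for {architecture}")
```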
11 changes: 11 additions & 0 deletions modelopt/torch/export/plugins/mcore_qwen.py
@@ -24,6 +24,7 @@
CustomModuleMapping,
GatedMLPMerging,
GatedMLPSlicing,
GroupedMLPSlicing,
NameRemapping,
QKVMerging,
QKVSlicing,
@@ -68,6 +69,16 @@
"router": NameRemapping("model.layers.{}.mlp.gate."),
"local_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.experts.{}."),
"local_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.experts.{}.down_proj."),
# Grouped experts (TEGroupedMLP: fused per-expert weights via grouped GEMM)
"experts.linear_fc1": GroupedMLPSlicing("model.layers.{}.mlp.experts.{}.up_proj"),
"experts.linear_fc2": GroupedMLPSlicing("model.layers.{}.mlp.experts.{}.down_proj"),
# Shared experts (Qwen3.6 MoE)
"shared_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.shared_experts."),
"shared_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.shared_experts.down_proj."),
# GatedDeltaNet (linear attention) — no QKV slicing, direct name remap
"gated_delta_net_in_proj": NameRemapping("model.layers.{}.linear_attn.in_proj."),
"gated_delta_net_out_norm": NameRemapping("model.layers.{}.linear_attn.out_norm."),
"gated_delta_net_out_proj": NameRemapping("model.layers.{}.linear_attn.out_proj."),
}

qwen25_causal_lm_import: dict[str, CustomModuleMapping] = {
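In these rules, the `{}` placeholders are filled with the layer index, and for per-expert entries with an expert index as well, when target HF weight names are produced. A rough illustration of that expansion; the helper below is hypothetical, not modelopt's actual implementation:

```python
# Hypothetical helper showing how a mapping prefix expands into HF weight names.
def expand_prefix(prefix: str, layer_id: int, expert_id: int | None = None) -> str:
    ids = (layer_id,) if expert_id is None else (layer_id, expert_id)
    return prefix.format(*ids)

# GDN (linear attention) rules are plain remaps, so no QKV slicing is involved.
print(expand_prefix("model.layers.{}.linear_attn.in_proj.", 3) + "weight")
# -> model.layers.3.linear_attn.in_proj.weight

# Grouped-expert rules need both a layer index and an expert index.
print(expand_prefix("model.layers.{}.mlp.experts.{}.up_proj", 3, 17))
# -> model.layers.3.mlp.experts.17.up_proj
```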
30 changes: 30 additions & 0 deletions modelopt/torch/export/quant_utils.py
@@ -665,6 +665,36 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}"
)

# Handle _QuantFusedExperts modules (e.g. Qwen3.x MoE) which use plural
# ModuleList quantizers (gate_up_proj_weight_quantizers, down_proj_weight_quantizers)
# instead of singular weight_quantizer attributes.
# The quantization format is determined at module setup time, not per-expert.
# Check any quantizer in the list (even disabled ones) to determine the format,
# since calibration may not have activated all experts.
for quantizer_list_name in ["gate_up_proj_weight_quantizers", "down_proj_weight_quantizers"]:
quantizer_list = getattr(module, quantizer_list_name, None)
if quantizer_list is not None and len(quantizer_list) > 0:
# Check any quantizer — enabled or not — for format config.
# Prefer enabled ones first, but fall back to any if none are enabled.
q = None
for candidate in quantizer_list:
if hasattr(candidate, "is_enabled") and candidate.is_enabled:
q = candidate
break
if q is None:
q = quantizer_list[0]

num_bits = getattr(q, "num_bits", None)
block_sizes = getattr(q, "block_sizes", None)
scale_bits = (
block_sizes.get("scale_bits", (8, 0))
if isinstance(block_sizes, dict) and "scale_bits" in block_sizes
else (8, 0)
)
if num_bits == (2, 1) and scale_bits == (4, 3):
return QUANTIZATION_NVFP4
# Add other expert quantization format checks here as needed

for weight_name in weight_attr_names(module):
quantization = _get_quantization_from_layer(module, quantizer_attr_names(weight_name))
if quantization != QUANTIZATION_NONE:
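For reference, the branch above recognizes NVFP4 from two settings: 4-bit E2M1 weights, encoded as `num_bits == (2, 1)`, and FP8 E4M3 block scales, encoded as `scale_bits == (4, 3)` inside `block_sizes`. A self-contained sketch of the same selection logic; the function and constant value are stand-ins rather than the exact modelopt code path:

```python
QUANTIZATION_NVFP4 = "nvfp4"  # stand-in for the constant used in quant_utils.py

def detect_fused_expert_format(quantizer_list) -> str | None:
    """Return "nvfp4" when a fused-experts quantizer ModuleList is configured for NVFP4."""
    if not quantizer_list:
        return None
    # Prefer an enabled quantizer, but fall back to the first one: the format is
    # fixed at module setup time, even if calibration never activated that expert.
    q = next((c for c in quantizer_list if getattr(c, "is_enabled", False)), quantizer_list[0])
    block_sizes = getattr(q, "block_sizes", None)
    scale_bits = (
        block_sizes.get("scale_bits", (8, 0))
        if isinstance(block_sizes, dict)
        else (8, 0)
    )
    if getattr(q, "num_bits", None) == (2, 1) and scale_bits == (4, 3):
        return QUANTIZATION_NVFP4
    return None
```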
17 changes: 10 additions & 7 deletions modelopt/torch/export/unified_export_hf.py
@@ -658,6 +658,16 @@ def _process_quantized_modules(
raise AssertionError(
f"Failed to export module '{name}' (type={type(sub_module).__name__}): {e}"
) from e
+ elif hasattr(sub_module, "gate_up_proj_weight_quantizers"):
+ # Generic fused MoE experts (_QuantFusedExperts) with per-expert
+ # quantizer ModuleLists. Split into per-expert modules and export.
+ # NOTE: This check must come before type-name checks (e.g. Llama4,
+ # GptOss) because _QuantFusedExperts wrapping renames quantizers
+ # to plural ModuleLists (e.g. gate_up_proj_weight_quantizers).
+ from modelopt.torch.export.moe_utils import _export_fused_experts
+
+ with fsdp2_aware_weight_update(model, sub_module, reshard=False):
+ _export_fused_experts(sub_module, dtype)
elif (
"Llama4TextExperts" in type(sub_module).__name__
or "GptOssExperts" in type(sub_module).__name__
@@ -677,13 +687,6 @@
with fsdp2_aware_weight_update(model, sub_module, reshard=False):
for weight_name in ["gate_up_proj", "down_proj"]:
_export_quantized_weight(sub_module, dtype, weight_name)
- elif hasattr(sub_module, "gate_up_proj_weight_quantizers"):
- # Generic fused MoE experts (_QuantFusedExperts) with per-expert
- # quantizer ModuleLists. Split into per-expert modules and export.
- from modelopt.torch.export.moe_utils import _export_fused_experts
-
- with fsdp2_aware_weight_update(model, sub_module, reshard=False):
- _export_fused_experts(sub_module, dtype)


def _export_transformers_checkpoint(
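The second hunk moves this branch ahead of the Llama4/GptOss type-name checks. Per the NOTE above, a module wrapped by `_QuantFusedExperts` may still have a class name that matches those checks, but its quantizers have been renamed to the plural ModuleList attributes, so it must be caught by the `hasattr` test first. A toy illustration of why the branch order matters; the class and strings below are placeholders:

```python
# Toy stand-in for a _QuantFusedExperts-wrapped module whose class name still
# matches the Llama4/GptOss type-name check.
class Llama4TextExperts:
    gate_up_proj_weight_quantizers = ["per-expert quantizer 0", "per-expert quantizer 1"]

sub_module = Llama4TextExperts()

# Checking for the plural quantizer attribute first routes the wrapped module to
# the fused-experts export; only unwrapped modules fall through to the old path.
if hasattr(sub_module, "gate_up_proj_weight_quantizers"):
    chosen = "_export_fused_experts(sub_module, dtype)"
elif "Llama4TextExperts" in type(sub_module).__name__:
    chosen = "_export_quantized_weight(sub_module, dtype, weight_name)"
print(chosen)  # -> _export_fused_experts(sub_module, dtype)
```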