15 commits
1b1fced
Fix NVFP4 quantization for Qwen3.x MoE models
lennytinkeredapps Apr 22, 2026
5d5c492
fix: iterate quantizer list to find first enabled quantizer
lennytinkeredapps Apr 22, 2026
c1e09d8
fix: Qwen3.6 MoE export — architecture mapping, GDN handling, isinsta…
lennytinkeredapps Apr 29, 2026
0ddd356
fix: correct HybridModel import path (hybrid.hybrid_model, not gpt.hy…
lennytinkeredapps Apr 29, 2026
9510d89
fix: GDN in_proj uses dedicated rule (no QKV slicing), add GDN rules …
lennytinkeredapps Apr 29, 2026
3b863b7
fix: add shared_experts rules to qwen3 export mapping for Qwen3.6 MoE
lennytinkeredapps Apr 29, 2026
344badb
fix: handle empty tensors in NVFP4QTensor.quantize (TP/EP sharding ze…
lennytinkeredapps Apr 29, 2026
1ca8f7a
fix: GDN out_proj uses dedicated rule (linear_attn.out_proj not self_…
lennytinkeredapps Apr 29, 2026
4992fb0
fix: add EP rank offset to expert_id during export
lennytinkeredapps Apr 29, 2026
e254f81
debug: add layer timing logs to export _get_state_dict
lennytinkeredapps Apr 29, 2026
5cfadba
debug: add expert iteration logging to export
lennytinkeredapps Apr 29, 2026
83c0676
debug: more trace prints around export state dict building
lennytinkeredapps Apr 29, 2026
f1e2944
debug: clean MLP diagnostic inside _get_transformer_layer_state_dict
lennytinkeredapps Apr 29, 2026
f693f35
fix: add GroupedMLPSlicing for TEGroupedMLP export + bypass broken la…
lennytinkeredapps Apr 29, 2026
8c9bf6e
fix: EP-aware MoE export with per-rank write and merge
lennytinkeredapps Apr 29, 2026
2 changes: 1 addition & 1 deletion modelopt/torch/export/model_utils.py
@@ -107,7 +107,7 @@ def is_multimodal_model(model):
config = model.config

# Check for Nemotron-Parse encoder-decoder architecture
- architectures = getattr(config, "architectures", [])
+ architectures = getattr(config, "architectures", []) or []
is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)

return (
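For context on the one-line change above: `getattr(config, "architectures", [])` only falls back to `[]` when the attribute is missing, not when it is present but set to `None`, and a `None` value would make the `any(...)` check in `is_multimodal_model` raise a `TypeError`. A minimal sketch of the three cases, using stand-in config objects rather than real transformers configs:

```python
from types import SimpleNamespace

# Stand-in configs (illustrative only, not real transformers config objects).
cfg_missing = SimpleNamespace()                                 # attribute absent
cfg_none = SimpleNamespace(architectures=None)                  # present but None
cfg_set = SimpleNamespace(architectures=["Qwen3_5MoeForConditionalGeneration"])

for cfg in (cfg_missing, cfg_none, cfg_set):
    # Without the trailing "or []", cfg_none leaves architectures as None and the
    # any(...) below raises TypeError: 'NoneType' object is not iterable.
    architectures = getattr(cfg, "architectures", []) or []
    print(any("nemotronparse" in arch.lower() for arch in architectures))
```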
2 changes: 2 additions & 0 deletions modelopt/torch/export/plugins/mcore_common.py
@@ -52,6 +52,7 @@
"LlamaForCausalLMEagle3Deep": eagle3_deep_llama_causal_lm_export,
"Qwen3ForCausalLM": qwen3_causal_lm_export,
"Qwen3MoeForCausalLM": qwen3_causal_lm_export,
"Qwen3_5MoeForConditionalGeneration": qwen3_causal_lm_export,
"Qwen2ForCausalLM": qwen25_causal_lm_export,
"GptOssForCausalLM": gptoss_causal_lm_export,
}
@@ -64,6 +65,7 @@
"NemotronHForCausalLM": nemotron_h_causal_lm_import,
"Qwen3ForCausalLM": qwen3_causal_lm_import,
"Qwen3MoeForCausalLM": qwen3_causal_lm_import,
"Qwen3_5MoeForConditionalGeneration": qwen3_causal_lm_import,
"Qwen2ForCausalLM": qwen25_causal_lm_import,
"GptOssForCausalLM": gptoss_causal_lm_import,
}
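Both tables are keyed by the HF architecture string, so the two new entries route `Qwen3_5MoeForConditionalGeneration` checkpoints through the existing Qwen3 rules. A rough sketch of the kind of lookup involved; the dict and variable names are illustrative, not the actual modelopt call site:

```python
# Illustrative lookup: route an HF architecture string to an export rule set.
qwen3_causal_lm_export = {}  # placeholder for the real Qwen3 rule dict

export_mapping = {
    "Qwen3ForCausalLM": qwen3_causal_lm_export,
    "Qwen3MoeForCausalLM": qwen3_causal_lm_export,
    "Qwen3_5MoeForConditionalGeneration": qwen3_causal_lm_export,  # new entry
}

architecture = "Qwen3_5MoeForConditionalGeneration"  # e.g. config.architectures[0]
rules = export_mapping.get(architecture)
if rules is None:
    raise KeyError(f"No export mapping registered for {architecture}")
```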
11 changes: 11 additions & 0 deletions modelopt/torch/export/plugins/mcore_qwen.py
@@ -24,6 +24,7 @@
CustomModuleMapping,
GatedMLPMerging,
GatedMLPSlicing,
GroupedMLPSlicing,
NameRemapping,
QKVMerging,
QKVSlicing,
@@ -68,6 +69,16 @@
"router": NameRemapping("model.layers.{}.mlp.gate."),
"local_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.experts.{}."),
"local_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.experts.{}.down_proj."),
# Grouped experts (TEGroupedMLP: fused per-expert weights via grouped GEMM)
"experts.linear_fc1": GroupedMLPSlicing("model.layers.{}.mlp.experts.{}.up_proj"),
"experts.linear_fc2": GroupedMLPSlicing("model.layers.{}.mlp.experts.{}.down_proj"),
# Shared experts (Qwen3.6 MoE)
"shared_experts.linear_fc1": GatedMLPSlicing("model.layers.{}.mlp.shared_experts."),
"shared_experts.linear_fc2": NameRemapping("model.layers.{}.mlp.shared_experts.down_proj."),
# GatedDeltaNet (linear attention) — no QKV slicing, direct name remap
"gated_delta_net_in_proj": NameRemapping("model.layers.{}.linear_attn.in_proj."),
"gated_delta_net_out_norm": NameRemapping("model.layers.{}.linear_attn.out_norm."),
"gated_delta_net_out_proj": NameRemapping("model.layers.{}.linear_attn.out_proj."),
}

qwen25_causal_lm_import: dict[str, CustomModuleMapping] = {
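In these rules, the `{}` placeholders are filled with the layer index, and for per-expert entries with an expert index as well, when target HF weight names are produced. A rough illustration of that expansion; the helper below is hypothetical, not modelopt's actual implementation:

```python
# Hypothetical helper showing how a mapping prefix expands into HF weight names.
def expand_prefix(prefix: str, layer_id: int, expert_id: int | None = None) -> str:
    ids = (layer_id,) if expert_id is None else (layer_id, expert_id)
    return prefix.format(*ids)

# GDN (linear attention) rules are plain remaps, so no QKV slicing is involved.
print(expand_prefix("model.layers.{}.linear_attn.in_proj.", 3) + "weight")
# -> model.layers.3.linear_attn.in_proj.weight

# Grouped-expert rules need both a layer index and an expert index.
print(expand_prefix("model.layers.{}.mlp.experts.{}.up_proj", 3, 17))
# -> model.layers.3.mlp.experts.17.up_proj
```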
30 changes: 30 additions & 0 deletions modelopt/torch/export/quant_utils.py
@@ -665,6 +665,36 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}"
)

# Handle _QuantFusedExperts modules (e.g. Qwen3.x MoE) which use plural
# ModuleList quantizers (gate_up_proj_weight_quantizers, down_proj_weight_quantizers)
# instead of singular weight_quantizer attributes.
# The quantization format is determined at module setup time, not per-expert.
# Check any quantizer in the list (even disabled ones) to determine the format,
# since calibration may not have activated all experts.
for quantizer_list_name in ["gate_up_proj_weight_quantizers", "down_proj_weight_quantizers"]:
quantizer_list = getattr(module, quantizer_list_name, None)
if quantizer_list is not None and len(quantizer_list) > 0:
# Check any quantizer — enabled or not — for format config.
# Prefer enabled ones first, but fall back to any if none are enabled.
q = None
for candidate in quantizer_list:
if hasattr(candidate, "is_enabled") and candidate.is_enabled:
q = candidate
break
if q is None:
q = quantizer_list[0]

num_bits = getattr(q, "num_bits", None)
block_sizes = getattr(q, "block_sizes", None)
scale_bits = (
block_sizes.get("scale_bits", (8, 0))
if isinstance(block_sizes, dict) and "scale_bits" in block_sizes
else (8, 0)
)
if num_bits == (2, 1) and scale_bits == (4, 3):
return QUANTIZATION_NVFP4
# Add other expert quantization format checks here as needed

for weight_name in weight_attr_names(module):
quantization = _get_quantization_from_layer(module, quantizer_attr_names(weight_name))
if quantization != QUANTIZATION_NONE:
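For reference, the branch above recognizes NVFP4 from two settings: 4-bit E2M1 weights, encoded as `num_bits == (2, 1)`, and FP8 E4M3 block scales, encoded as `scale_bits == (4, 3)` inside `block_sizes`. A self-contained sketch of the same selection logic; the function and constant value are stand-ins rather than the exact modelopt code path:

```python
QUANTIZATION_NVFP4 = "nvfp4"  # stand-in for the constant used in quant_utils.py

def detect_fused_expert_format(quantizer_list) -> str | None:
    """Return "nvfp4" when a fused-experts quantizer ModuleList is configured for NVFP4."""
    if not quantizer_list:
        return None
    # Prefer an enabled quantizer, but fall back to the first one: the format is
    # fixed at module setup time, even if calibration never activated that expert.
    q = next((c for c in quantizer_list if getattr(c, "is_enabled", False)), quantizer_list[0])
    block_sizes = getattr(q, "block_sizes", None)
    scale_bits = (
        block_sizes.get("scale_bits", (8, 0))
        if isinstance(block_sizes, dict)
        else (8, 0)
    )
    if getattr(q, "num_bits", None) == (2, 1) and scale_bits == (4, 3):
        return QUANTIZATION_NVFP4
    return None
```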
17 changes: 10 additions & 7 deletions modelopt/torch/export/unified_export_hf.py
@@ -658,6 +658,16 @@ def _process_quantized_modules(
raise AssertionError(
f"Failed to export module '{name}' (type={type(sub_module).__name__}): {e}"
) from e
+ elif hasattr(sub_module, "gate_up_proj_weight_quantizers"):
+ # Generic fused MoE experts (_QuantFusedExperts) with per-expert
+ # quantizer ModuleLists. Split into per-expert modules and export.
+ # NOTE: This check must come before type-name checks (e.g. Llama4,
+ # GptOss) because _QuantFusedExperts wrapping renames quantizers
+ # to plural ModuleLists (e.g. gate_up_proj_weight_quantizers).
+ from modelopt.torch.export.moe_utils import _export_fused_experts
+
+ with fsdp2_aware_weight_update(model, sub_module, reshard=False):
+ _export_fused_experts(sub_module, dtype)
elif (
"Llama4TextExperts" in type(sub_module).__name__
or "GptOssExperts" in type(sub_module).__name__
@@ -677,13 +687,6 @@
with fsdp2_aware_weight_update(model, sub_module, reshard=False):
for weight_name in ["gate_up_proj", "down_proj"]:
_export_quantized_weight(sub_module, dtype, weight_name)
- elif hasattr(sub_module, "gate_up_proj_weight_quantizers"):
- # Generic fused MoE experts (_QuantFusedExperts) with per-expert
- # quantizer ModuleLists. Split into per-expert modules and export.
- from modelopt.torch.export.moe_utils import _export_fused_experts
-
- with fsdp2_aware_weight_update(model, sub_module, reshard=False):
- _export_fused_experts(sub_module, dtype)


def _export_transformers_checkpoint(
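The second hunk moves this branch ahead of the Llama4/GptOss type-name checks. Per the NOTE above, a module wrapped by `_QuantFusedExperts` may still have a class name that matches those checks, but its quantizers have been renamed to the plural ModuleList attributes, so it must be caught by the `hasattr` test first. A toy illustration of why the branch order matters; the class and strings below are placeholders:

```python
# Toy stand-in for a _QuantFusedExperts-wrapped module whose class name still
# matches the Llama4/GptOss type-name check.
class Llama4TextExperts:
    gate_up_proj_weight_quantizers = ["per-expert quantizer 0", "per-expert quantizer 1"]

sub_module = Llama4TextExperts()

# Checking for the plural quantizer attribute first routes the wrapped module to
# the fused-experts export; only unwrapped modules fall through to the old path.
if hasattr(sub_module, "gate_up_proj_weight_quantizers"):
    chosen = "_export_fused_experts(sub_module, dtype)"
elif "Llama4TextExperts" in type(sub_module).__name__:
    chosen = "_export_quantized_weight(sub_module, dtype, weight_name)"
print(chosen)  # -> _export_fused_experts(sub_module, dtype)
```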