NVIDIA · hychiang-git · Apr 18, 2026 · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -24,6 +24,7 @@ Changelog
 
 **New Features**
 
+- Add NVFP4 W4A16 weight-only quantization (``nvfp4_w4a16``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.NVFP4_W4A16_CFG`` or ``--qformat nvfp4_w4a16`` in ``hf_ptq.py``. Exported checkpoints can be served on vLLM after conversion to compressed-tensors format.
 - Support full Transformer Engine spec for Minitron pruning (``mcore_minitron``). A custom ModelOpt spec is no longer needed. Note that this does not change how the pruning workflow is used, but it makes pruning slightly faster and may produce a slightly different pruned model because of different kernels and numerics.
 - Add Puzzletron - a new algorithm for heterogeneous pruning of LLM and VLM models. See `examples/puzzletron/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/puzzletron>`_ for more details.
 - Added an iterator interface using CalibrationDataReader to the ONNX quantization workflow.

@@ -118,6 +118,7 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None:
     "nvfp4_svdquant": mtq.NVFP4_SVDQUANT_DEFAULT_CFG,
     "mxfp8": mtq.MXFP8_DEFAULT_CFG,
     "nvfp4_local_hessian": mtq.NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG,
+    "nvfp4_w4a16": mtq.NVFP4_W4A16_CFG,
 }
-    "nvfp4_w4a16": mtq.NVFP4_W4A16_CFG,
-}
+        qformat
+        in [
+            "fp8",
+            "int8_sq",
+            "int8_wo",
+            "int4_awq",
+            "nvfp4",
+            "nvfp4_awq",
+            "nvfp4_mse",
+            "nvfp4_w4a16",
+            "w4a8_awq",
+            "fp8_pb_wo",
+            "w4a8_mxfp4_fp8",
+            "nvfp4_mlp_only",
+            "nvfp4_experts_only",
-    "nvfp4_w4a16": mtq.NVFP4_W4A16_CFG,
-}
+        qformat
+        in [
+            "fp8",
+            "int8_sq",
+            "int8_wo",
+            "int4_awq",
+            "nvfp4",
+            "nvfp4_awq",
+            "nvfp4_mse",
+            "nvfp4_w4a16",
+            "w4a8_awq",
+            "fp8_pb_wo",
+            "w4a8_mxfp4_fp8",
+            "nvfp4_mlp_only",
+            "nvfp4_experts_only",
 
 KV_QUANT_CFG_CHOICES = {
@@ -752,6 +753,12 @@ def export_quantized(
                     extra_state_dict=mtp_state_dict,
                 )
 
+                if args.qformat == "nvfp4_w4a16":
+                    warnings.warn(
+                        "TensorRT-LLM and SGLang do not support this format. "
+                        "To serve on vLLM, convert the NVFP4 W4A16 checkpoint to compressed-tensors format."
+                    )
+
         # Restore default padding and export the tokenizer as well.
         if tokenizer is not None:
             tokenizer.padding_side = default_padding_side
@@ -1073,6 +1080,18 @@ def quantize_main(
                 quant_cfg["quant_cfg"].append({"quantizer_name": pattern, "enable": False})
                 print(f"Excluding MTP layer from quantization: {pattern}")
 
+        # Apply user-requested per-module exclusions (--exclude_modules).
+        if args.exclude_modules:
+            quant_cfg = copy.deepcopy(quant_cfg)
+            for mod in args.exclude_modules:
+                quant_cfg["quant_cfg"].append(
+                    {"quantizer_name": f"*{mod}*.weight_quantizer", "enable": False}
+                )
+                quant_cfg["quant_cfg"].append(
+                    {"quantizer_name": f"*{mod}*.input_quantizer", "enable": False}
+                )
+                print(f"Excluding module from quantization: {mod}")
+
         # Use constant amax for KV quantizers when a cast format is selected.
         # Recipes are authoritative for KV cache config (including use_constant_amax),
         # so skip this post-hoc override when --recipe is used; rely on the YAML instead
@@ -1242,6 +1261,17 @@ def parse_args() -> argparse.Namespace:
         default=False,
         action="store_true",
     )
+    parser.add_argument(
+        "--exclude_modules",
+        nargs="+",
+        default=[],
+        metavar="MODULE",
+        help=(
+            "Module name patterns to exclude from quantization "
+            "(e.g. lm_head backbone.layers.0.mixer). "
+            "Appends a disable rule for each pattern's weight and input quantizers."
+        ),
+    )
     parser.add_argument(
         "--gpu_max_mem_percentage",
         help=(

@@ -53,9 +53,9 @@ esac
 IFS=","
 for qformat in $QFORMAT; do
     case $qformat in
-    fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | nvfp4_mse | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_experts_only | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8 | nvfp4_local_hessian) ;;
+    fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | nvfp4_mse | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_experts_only | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8 | nvfp4_local_hessian | nvfp4_w4a16) ;;
     *)
-        echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, nvfp4_mse, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_experts_only, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8, nvfp4_local_hessian]" >&2
+        echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, nvfp4_mse, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_experts_only, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8, nvfp4_local_hessian, nvfp4_w4a16]" >&2
         exit 1
         ;;
     esac
@@ -127,6 +127,10 @@ if $TRUST_REMOTE_CODE; then
     PTQ_ARGS+=" --trust_remote_code "
 fi
 
+if [ -n "${EXCLUDE_MODULES:-}" ]; then
+    PTQ_ARGS+=" --exclude_modules ${EXCLUDE_MODULES} "
+fi
+
 if $USE_SEQ_DEVICE_MAP; then
     PTQ_ARGS+=" --use_seq_device_map "
 fi
@@ -199,6 +203,12 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
         exit 0
     fi
 
+    if [ "$QFORMAT" = "nvfp4_w4a16" ]; then
+        echo "nvfp4_w4a16 checkpoint exported to $SAVE_PATH"
+        echo "To serve on vLLM, convert to compressed-tensors"
+        exit 0
+    fi
+
     if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]]; then
         cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1)
 

@@ -57,6 +57,11 @@ def _quant_algo_to_group_config(quant_algo: str, group_size: int | None = None)
         return {
             "weights": {"dynamic": False, "num_bits": 4, "type": "int", "group_size": gs},
         }
+    elif quant_algo == "NVFP4_W4A16":
+        gs = group_size or 16
+        return {
+            "weights": {"dynamic": False, "num_bits": 4, "type": "float", "group_size": gs},
+        }
     elif quant_algo in ("NVFP4_AWQ", "W4A8_AWQ"):
         gs = group_size or 128
         return {
@@ -183,6 +188,14 @@ def convert_hf_quant_config_format(input_config: dict[str, Any]) -> dict[str, An
             "targets": ["Linear"],
         }
         new_config["config_groups"] = {"group_0": config_group_details}
+    elif quant_algo_value == "NVFP4_W4A16":
+        # Weight-only FP4
+        group_size = original_quantization_details.get("group_size", 16)
+        config_group_details = {
+            "weights": {"dynamic": False, "num_bits": 4, "type": "float", "group_size": group_size},
+            "targets": ["Linear"],
+        }
+        new_config["config_groups"] = {"group_0": config_group_details}
     elif quant_algo_value == "MIXED_PRECISION":
         quantized_layers = original_quantization_details.get("quantized_layers", {})
 

@@ -39,6 +39,7 @@
 QUANTIZATION_MXFP8 = "mxfp8"
 QUANTIZATION_W4A8_MXFP4_FP8 = "w4a8_mxfp4_fp8"
 QUANTIZATION_NVFP4_AWQ = "nvfp4_awq"
+QUANTIZATION_NVFP4_W4A16 = "nvfp4_w4a16"  # weight-only FP4
 QUANTIZATION_FP8_PB_REAL = "fp8_pb_real"
 QUANTIZATION_FP8_PB_WO = "fp8_pb_wo"
 QUANTIZATION_FP8_PC_PT = "fp8_pc_pt"

@@ -65,6 +65,7 @@
     QUANTIZATION_NVFP4,
     QUANTIZATION_NVFP4_AWQ,
     QUANTIZATION_NVFP4_SVDQUANT,
+    QUANTIZATION_NVFP4_W4A16,
     QUANTIZATION_W4A8_AWQ,
     QUANTIZATION_W4A8_MXFP4_FP8,
     QUANTIZATION_W4A8_NVFP4_FP8,
@@ -358,6 +359,7 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") ->
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_NVFP4_SVDQUANT,
+        QUANTIZATION_NVFP4_W4A16,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
         # Calibrate weight quantizer if amax is not set
@@ -402,6 +404,7 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_NVFP4_SVDQUANT,
+        QUANTIZATION_NVFP4_W4A16,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
         # Calibrate weight quantizer if amax is not set
@@ -636,6 +639,10 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
                 return QUANTIZATION_NVFP4_AWQ
             if getattr(layer, "fused_with_prequant", False):
                 return QUANTIZATION_NVFP4_AWQ
+            # W4A16 weight-only: input_quantizer absent or disabled
+            if input_quantizer is None or not input_quantizer.is_enabled:
+                if scale_bits == (4, 3):
+                    return QUANTIZATION_NVFP4_W4A16
             assert input_quantizer is not None, (
                 f"input_quantizer is None for {quantizer_attr_names}"
             )
@@ -803,6 +810,11 @@ def process_layer_quant_config(layer_config_dict):
                 "quant_algo": "NVFP4",
                 "group_size": block_size_value,
             }
+        elif v == "nvfp4_w4a16":
+            layer_config = {
+                "quant_algo": "NVFP4_W4A16",
+                "group_size": block_size_value,
+            }
         elif v == "nvfp4_awq":
             layer_config = {
                 "quant_algo": "NVFP4_AWQ",
@@ -980,6 +992,7 @@ def to_quantized_weight(
     if quantization in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
+        QUANTIZATION_NVFP4_W4A16,
         QUANTIZATION_W4A8_NVFP4_FP8,
         QUANTIZATION_NVFP4_SVDQUANT,
     ]:

@@ -84,6 +84,7 @@
     QUANTIZATION_NVFP4,
     QUANTIZATION_NVFP4_AWQ,
     QUANTIZATION_NVFP4_SVDQUANT,
+    QUANTIZATION_NVFP4_W4A16,
     QUANTIZATION_W4A8_AWQ,
     QUANTIZATION_W4A8_NVFP4_FP8,
 )
@@ -520,6 +521,7 @@ def _export_quantized_weight(
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_NVFP4_SVDQUANT,
         QUANTIZATION_NVFP4,
+        QUANTIZATION_NVFP4_W4A16,
         QUANTIZATION_W4A8_AWQ,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
@@ -548,6 +550,7 @@ def _export_quantized_weight(
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_NVFP4_SVDQUANT,
+        QUANTIZATION_NVFP4_W4A16,
     ]:
         # Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim)
         # for NVFP4 quantization functions that expect input_dim as the last dimension for block quantization

@@ -794,6 +794,7 @@ def _nvfp4_selective_quant_cfg(
 NVFP4_EXPERTS_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp.experts*", "*block_sparse_moe*"])
 NVFP4_MLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp*", "*block_sparse_moe*"])
 NVFP4_OMLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*o_proj*", "*mlp*", "*block_sparse_moe*"])
+NVFP4_W4A16_CFG = _nvfp4_selective_quant_cfg(["*"], weight_only=True)
 
 # DO NOT ADD NEW CONFIGS HERE. If you want to add a new general recipe, add it to
 # modelopt_recipes/general/ptq/ as a yaml file
@@ -828,6 +829,7 @@ def _nvfp4_selective_quant_cfg(
     "NVFP4_MLP_ONLY_CFG",
     "NVFP4_EXPERTS_ONLY_CFG",
     "NVFP4_OMLP_ONLY_CFG",
+    "NVFP4_W4A16_CFG",
     "MAMBA_MOE_NVFP4_CONSERVATIVE_CFG",
     "MAMBA_MOE_NVFP4_AGGRESSIVE_CFG",
     "MAMBA_MOE_FP8_CONSERVATIVE_CFG",

diff --git a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py
@@ -47,6 +47,7 @@
         ("w4a8_awq", "tiny_llama-w4a8-awq", True, False, True, True, False),
         ("int8_wo", "tiny_llama-int8-wo", False, False, False, False, False),
         ("nvfp4_svdquant", "tiny_llama-nvfp4-svdquant", True, False, True, True, True),
+        ("nvfp4_w4a16", "tiny_llama-nvfp4-w4a16", False, False, False, False, False),
         # MoE models (fused experts: Qwen3 MoE, GPT-OSS)
         ("nvfp4", "tiny_qwen3_moe-nvfp4", True, False, True, True, False),
         ("fp8", "tiny_gpt_oss-fp8", True, False, True, True, False),