NVIDIA · yueshen2016 · Jun 17, 2026 · coderabbitai · Jun 17, 2026 · coderabbitai
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Megatron-Bridge export: convert a quantized MCore checkpoint to HuggingFace format.
+# Wraps /opt/Megatron-Bridge/examples/quantization/export.py.
+# Assumes nvcr.io/nvidia/nemo:26.04+ container (megatron-bridge preinstalled at /opt/Megatron-Bridge).
+#
+# Required env:
+#   HF_MODEL_ID           HF model id used for architecture template + tokenizer.
+#   MEGATRON_LOAD_PATH    Quantized MCore ckpt dir produced by quantize.sh.
+# Optional env:
+#   OUTPUT_DIR            Parent dir for export (default: cwd).
+#   EXPORT_DIR            HF output dir
+#                         (default: ${OUTPUT_DIR}/<basename(HF_MODEL_ID)>_hf_export).
+#   TP, PP, EP, ETP       Parallelism degrees (defaults: 1, 1, 1, 1).
+#                         NOTE: HF exporter does not gather TP-sharded weights —
+#                         use PP > 1 to shard large models across GPUs.
+#   NPROC_PER_NODE        GPUs per node for torchrun (default: nvidia-smi GPU count).
+#   DTYPE                 Export dtype (default: bfloat16). One of bfloat16, float16, float32.
+#   EXPORT_EXTRA_MODULES  "true" to include Medusa / EAGLE / MTP heads.
+#   TRUST_REMOTE_CODE     "true" to pass --trust-remote-code.
+#
+# Extra positional args ("$@") are forwarded to export.py.
+
+# Stop at the first command that fails.
+set -e
+
+# Mandatory inputs: an unset or empty value is reported on stderr and ends
+# the script with status 1.
+[[ -n "${HF_MODEL_ID}" ]] || { echo "[ERROR] HF_MODEL_ID is required" >&2; exit 1; }
+[[ -n "${MEGATRON_LOAD_PATH}" ]] || { echo "[ERROR] MEGATRON_LOAD_PATH is required" >&2; exit 1; }
+
+# Output layout: OUTPUT_DIR falls back to the current directory, and
+# EXPORT_DIR to a directory under it named after the last path component of
+# HF_MODEL_ID.
+if [[ -z "${OUTPUT_DIR}" ]]; then
+    OUTPUT_DIR="$(pwd)"
+fi
+MODEL_NAME="$(basename "${HF_MODEL_ID}")"
+: "${EXPORT_DIR:=${OUTPUT_DIR}/${MODEL_NAME}_hf_export}"
+
+# Parallelism degrees and export dtype; unset or empty values take the
+# defaults below.
+: "${TP:=1}" "${PP:=1}" "${EP:=1}" "${ETP:=1}"
+: "${DTYPE:=bfloat16}"
+
+# Default to the GPU count reported by nvidia-smi when NPROC_PER_NODE is unset.
+if [[ -z "${NPROC_PER_NODE}" ]]; then
+    # nvidia-smi emits one line per GPU, so only the first line is kept.
+    NPROC_PER_NODE=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n 1)
+fi
+# The pipeline's exit status is that of `head`, so a missing or failing
+# nvidia-smi only shows up as an empty value. Fail here with a clear message
+# instead of handing torchrun an empty --nproc_per_node.
+if [[ -z "${NPROC_PER_NODE}" ]]; then
+    echo "[ERROR] Could not detect the GPU count via nvidia-smi; set NPROC_PER_NODE explicitly" >&2
+    exit 1
+fi
+
+# Multi-node torchrun: derive rendezvous from Slurm env. Falls back to standalone.
+# Slurm-provided values take precedence over user-set NNODES / NODE_RANK.
+NNODES="${SLURM_NNODES:-${NNODES:-1}}"
+NODE_RANK="${SLURM_NODEID:-${NODE_RANK:-0}}"
+if [[ "${NNODES}" -gt 1 ]]; then
+    # An explicit MASTER_ADDR wins; otherwise use the first host of the Slurm node list.
+    if [[ -z "${MASTER_ADDR}" && -n "${SLURM_NODELIST}" ]]; then
+        MASTER_ADDR=$(scontrol show hostname "${SLURM_NODELIST}" 2>/dev/null | head -n 1)
+    fi
+    # scontrol errors are silenced above, so MASTER_ADDR can still be empty
+    # here. An empty --master-addr would leave the ranks without a rendezvous
+    # endpoint, so stop before launching.
+    if [[ -z "${MASTER_ADDR}" ]]; then
+        echo "[ERROR] MASTER_ADDR is not set and could not be derived from SLURM_NODELIST (NNODES=${NNODES})" >&2
+        exit 1
+    fi
+    MASTER_PORT="${MASTER_PORT:-29500}"
+    RDZV_ARGS=("--nnodes=${NNODES}" "--node-rank=${NODE_RANK}" \
+               "--master-addr=${MASTER_ADDR}" "--master-port=${MASTER_PORT}")
+else
+    RDZV_ARGS=("--standalone" "--nnodes=1")
+fi
+
+# Create the parent output directory (and any missing ancestors) up front.
+mkdir -p "${OUTPUT_DIR}"
+
+# Optional switches forwarded to export.py. Each one is added only when its
+# environment variable is exactly "true".
+EXTRA_FLAGS=()
+if [[ "${EXPORT_EXTRA_MODULES:-false}" == "true" ]]; then
+    EXTRA_FLAGS+=("--export-extra-modules")
+fi
+if [[ "${TRUST_REMOTE_CODE:-false}" == "true" ]]; then
+    EXTRA_FLAGS+=("--trust-remote-code")
+fi
+
+# export.py is referenced by a relative path below, so run from its directory.
+cd /opt/Megatron-Bridge/examples/quantization
+
+# Banner summarising the run configuration.
+printf '%s\n' \
+    "=== Exporting ${HF_MODEL_ID} (TP=${TP} PP=${PP} EP=${EP} ETP=${ETP}, ${NPROC_PER_NODE} GPUs, dtype=${DTYPE}) ===" \
+    "    load <- ${MEGATRON_LOAD_PATH}" \
+    "    save -> ${EXPORT_DIR}"
+
+# Assemble the torchrun invocation as an array so every argument stays a
+# single word; caller-supplied positional args go last.
+launch_cmd=(
+    python -m torch.distributed.run
+    --nproc_per_node "${NPROC_PER_NODE}"
+    "${RDZV_ARGS[@]}"
+    export.py
+    --hf-model-id "${HF_MODEL_ID}"
+    --megatron-load-path "${MEGATRON_LOAD_PATH}"
+    --export-dir "${EXPORT_DIR}"
+    --tp "${TP}"
+    --pp "${PP}"
+    --ep "${EP}"
+    --etp "${ETP}"
+    --dtype "${DTYPE}"
+    "${EXTRA_FLAGS[@]}"
+    "$@"
+)
+"${launch_cmd[@]}"
+
+# Show what was written, plus the quantization config when the exporter
+# produced one.
+ls "${EXPORT_DIR}"
+quant_cfg_json="${EXPORT_DIR}/hf_quant_config.json"
+if [[ -f "${quant_cfg_json}" ]]; then
+    cat "${quant_cfg_json}"
+fi
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Megatron-Bridge PTQ generation: load a quantized MCore checkpoint and run text generation.
+# Wraps /opt/Megatron-Bridge/examples/quantization/ptq_generate.py.
+# Assumes nvcr.io/nvidia/nemo:26.04+ container (megatron-bridge preinstalled at /opt/Megatron-Bridge).
+#
+# Required env:
+#   HF_MODEL_ID           HF model id used for tokenizer and architecture template.
+#   MEGATRON_LOAD_PATH    Quantized MCore ckpt dir produced by quantize.sh.
+# Optional env:
+#   TP, PP, EP, ETP       Parallelism degrees (defaults: 1, 1, 1, 1).
+#   NPROC_PER_NODE        GPUs per node for torchrun (default: nvidia-smi GPU count).
+#   PROMPTS               |-separated input prompts.
+#   OSL                   Output sequence length (default: 32).
+#   TRUST_REMOTE_CODE     "true" to pass --trust-remote-code.
+#
+# Extra positional args ("$@") are forwarded to ptq_generate.py.
+
+# Stop at the first command that fails.
+set -e
+
+# Mandatory inputs: an unset or empty value is reported on stderr and ends
+# the script with status 1.
+[[ -n "${HF_MODEL_ID}" ]] || { echo "[ERROR] HF_MODEL_ID is required" >&2; exit 1; }
+[[ -n "${MEGATRON_LOAD_PATH}" ]] || { echo "[ERROR] MEGATRON_LOAD_PATH is required" >&2; exit 1; }
+
+# Parallelism degrees and output sequence length; unset or empty values take
+# the defaults below.
+: "${TP:=1}" "${PP:=1}" "${EP:=1}" "${ETP:=1}"
+: "${OSL:=32}"
+
+# Default to the GPU count reported by nvidia-smi when NPROC_PER_NODE is unset.
+if [[ -z "${NPROC_PER_NODE}" ]]; then
+    # nvidia-smi emits one line per GPU, so only the first line is kept.
+    NPROC_PER_NODE=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n 1)
+fi
+# The pipeline's exit status is that of `head`, so a missing or failing
+# nvidia-smi only shows up as an empty value. Fail here with a clear message
+# instead of handing torchrun an empty --nproc_per_node.
+if [[ -z "${NPROC_PER_NODE}" ]]; then
+    echo "[ERROR] Could not detect the GPU count via nvidia-smi; set NPROC_PER_NODE explicitly" >&2
+    exit 1
+fi
+
+# Multi-node torchrun: derive rendezvous from Slurm env. Falls back to standalone.
+# Slurm-provided values take precedence over user-set NNODES / NODE_RANK.
+NNODES="${SLURM_NNODES:-${NNODES:-1}}"
+NODE_RANK="${SLURM_NODEID:-${NODE_RANK:-0}}"
+if [[ "${NNODES}" -gt 1 ]]; then
+    # An explicit MASTER_ADDR wins; otherwise use the first host of the Slurm node list.
+    if [[ -z "${MASTER_ADDR}" && -n "${SLURM_NODELIST}" ]]; then
+        MASTER_ADDR=$(scontrol show hostname "${SLURM_NODELIST}" 2>/dev/null | head -n 1)
+    fi
+    # scontrol errors are silenced above, so MASTER_ADDR can still be empty
+    # here. An empty --master-addr would leave the ranks without a rendezvous
+    # endpoint, so stop before launching.
+    if [[ -z "${MASTER_ADDR}" ]]; then
+        echo "[ERROR] MASTER_ADDR is not set and could not be derived from SLURM_NODELIST (NNODES=${NNODES})" >&2
+        exit 1
+    fi
+    MASTER_PORT="${MASTER_PORT:-29500}"
+    RDZV_ARGS=("--nnodes=${NNODES}" "--node-rank=${NODE_RANK}" \
+               "--master-addr=${MASTER_ADDR}" "--master-port=${MASTER_PORT}")
+else
+    RDZV_ARGS=("--standalone" "--nnodes=1")
+fi
+
+# Optional arguments forwarded to ptq_generate.py: the remote-code switch is
+# added only for the exact value "true", and the prompts only when PROMPTS is
+# non-empty (passed as a single argument).
+EXTRA_FLAGS=()
+if [[ "${TRUST_REMOTE_CODE:-false}" == "true" ]]; then
+    EXTRA_FLAGS+=("--trust-remote-code")
+fi
+if [[ -n "${PROMPTS}" ]]; then
+    EXTRA_FLAGS+=("--prompts" "${PROMPTS}")
+fi
+
+# ptq_generate.py imports `quantize` as a sibling module, so it has to be
+# launched from the directory that holds both files.
+cd /opt/Megatron-Bridge/examples/quantization
+
+# Banner summarising the run configuration.
+printf '%s\n' \
+    "=== Generating with ${HF_MODEL_ID} (TP=${TP} PP=${PP} EP=${EP} ETP=${ETP}, ${NPROC_PER_NODE} GPUs) ===" \
+    "    load <- ${MEGATRON_LOAD_PATH}"
+
+# Assemble the torchrun invocation as an array so every argument stays a
+# single word; caller-supplied positional args go last.
+launch_cmd=(
+    python -m torch.distributed.run
+    --nproc_per_node "${NPROC_PER_NODE}"
+    "${RDZV_ARGS[@]}"
+    ptq_generate.py
+    --hf-model-id "${HF_MODEL_ID}"
+    --megatron-load-path "${MEGATRON_LOAD_PATH}"
+    --tp "${TP}"
+    --pp "${PP}"
+    --ep "${EP}"
+    --etp "${ETP}"
+    --osl "${OSL}"
+    "${EXTRA_FLAGS[@]}"
+    "$@"
+)
+# Replace this shell with the launcher so its exit status is the script's.
+exec "${launch_cmd[@]}"
@@ -0,0 +1,114 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Megatron-Bridge PTQ quantization: HuggingFace -> quantized MCore checkpoint.
+# Wraps /opt/Megatron-Bridge/examples/quantization/quantize.py.
+# Assumes nvcr.io/nvidia/nemo:26.04+ container (megatron-bridge preinstalled at /opt/Megatron-Bridge).
+#
+# Required env: HF_MODEL_ID  (e.g. meta-llama/Llama-3.2-1B)
+# Optional env:
+#   OUTPUT_DIR             Parent dir for outputs (default: cwd).
+#   EXPORT_QUANT_CFG       ModelOpt quant config (default: fp8). Supported:
+#                          int8_sq, fp8, fp8_blockwise, int4_awq, w4a8_awq,
+#                          nvfp4, mamba_moe_fp8_aggressive, mamba_moe_fp8_conservative,
+#                          mamba_moe_nvfp4_aggressive, mamba_moe_nvfp4_conservative.
+#   MEGATRON_SAVE_PATH     Output MCore ckpt dir
+#                          (default: ${OUTPUT_DIR}/<basename(HF_MODEL_ID)>_quantized_${EXPORT_QUANT_CFG}).
+#   TP, PP, EP, ETP        Parallelism degrees (defaults: 1, 1, 1, 1).
+#   NPROC_PER_NODE         GPUs per node for torchrun (default: nvidia-smi GPU count).
+#   CALIB_SIZE             Calibration sample count (default: 512).
+#   COMPRESS               "true" to apply mtq.compress() for real low-bit weights.
+#   WEIGHT_ONLY            "true" to disable input quantization.
+#   EXPORT_KV_CACHE_QUANT  "true" to enable FP8 KV-cache quantization.
+#   TRUST_REMOTE_CODE      "true" to pass --trust-remote-code.
+#   PROMPTS                |-separated test prompts.
+#   DISABLE_HF_DATASETS_FILE_LOCK  "true" for read-only HF cache dirs.
+#
+# Extra positional args ("$@") are forwarded to quantize.py.
+
+# Stop at the first command that fails.
+set -e
+
+# Mandatory input: an unset or empty value is reported on stderr and ends the
+# script with status 1.
+[[ -n "${HF_MODEL_ID}" ]] || { echo "[ERROR] HF_MODEL_ID is required" >&2; exit 1; }
+
+# Output layout: OUTPUT_DIR falls back to the current directory, and
+# MEGATRON_SAVE_PATH to a directory under it named after the last path
+# component of HF_MODEL_ID plus the quantization config.
+if [[ -z "${OUTPUT_DIR}" ]]; then
+    OUTPUT_DIR="$(pwd)"
+fi
+: "${EXPORT_QUANT_CFG:=fp8}"
+MODEL_NAME="$(basename "${HF_MODEL_ID}")"
+: "${MEGATRON_SAVE_PATH:=${OUTPUT_DIR}/${MODEL_NAME}_quantized_${EXPORT_QUANT_CFG}}"
+
+# Parallelism degrees and calibration sample count; unset or empty values
+# take the defaults below.
+: "${TP:=1}" "${PP:=1}" "${EP:=1}" "${ETP:=1}"
+: "${CALIB_SIZE:=512}"
+
+# Default to the GPU count reported by nvidia-smi when NPROC_PER_NODE is unset.
+if [[ -z "${NPROC_PER_NODE}" ]]; then
+    # nvidia-smi emits one line per GPU, so only the first line is kept.
+    NPROC_PER_NODE=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n 1)
+fi
+# The pipeline's exit status is that of `head`, so a missing or failing
+# nvidia-smi only shows up as an empty value. Fail here with a clear message
+# instead of handing torchrun an empty --nproc_per_node.
+if [[ -z "${NPROC_PER_NODE}" ]]; then
+    echo "[ERROR] Could not detect the GPU count via nvidia-smi; set NPROC_PER_NODE explicitly" >&2
+    exit 1
+fi
+
+# Multi-node torchrun: derive rendezvous from Slurm env. Falls back to standalone.
+# Slurm-provided values take precedence over user-set NNODES / NODE_RANK.
+NNODES="${SLURM_NNODES:-${NNODES:-1}}"
+NODE_RANK="${SLURM_NODEID:-${NODE_RANK:-0}}"
+if [[ "${NNODES}" -gt 1 ]]; then
+    # An explicit MASTER_ADDR wins; otherwise use the first host of the Slurm node list.
+    if [[ -z "${MASTER_ADDR}" && -n "${SLURM_NODELIST}" ]]; then
+        MASTER_ADDR=$(scontrol show hostname "${SLURM_NODELIST}" 2>/dev/null | head -n 1)
+    fi
+    # scontrol errors are silenced above, so MASTER_ADDR can still be empty
+    # here. An empty --master-addr would leave the ranks without a rendezvous
+    # endpoint, so stop before launching.
+    if [[ -z "${MASTER_ADDR}" ]]; then
+        echo "[ERROR] MASTER_ADDR is not set and could not be derived from SLURM_NODELIST (NNODES=${NNODES})" >&2
+        exit 1
+    fi
+    MASTER_PORT="${MASTER_PORT:-29500}"
+    RDZV_ARGS=("--nnodes=${NNODES}" "--node-rank=${NODE_RANK}" \
+               "--master-addr=${MASTER_ADDR}" "--master-port=${MASTER_PORT}")
+else
+    RDZV_ARGS=("--standalone" "--nnodes=1")
+fi
+
+# Create the parent output directory (and any missing ancestors) up front.
+mkdir -p "${OUTPUT_DIR}"
+
+# Optional arguments forwarded to quantize.py. Boolean switches are added
+# only when their environment variable is exactly "true"; the prompts are
+# added only when PROMPTS is non-empty (passed as a single argument).
+EXTRA_FLAGS=()
+if [[ "${COMPRESS:-false}" == "true" ]]; then
+    EXTRA_FLAGS+=("--compress")
+fi
+if [[ "${WEIGHT_ONLY:-false}" == "true" ]]; then
+    EXTRA_FLAGS+=("--weight-only")
+fi
+if [[ "${EXPORT_KV_CACHE_QUANT:-false}" == "true" ]]; then
+    EXTRA_FLAGS+=("--export-kv-cache-quant")
+fi
+if [[ "${TRUST_REMOTE_CODE:-false}" == "true" ]]; then
+    EXTRA_FLAGS+=("--trust-remote-code")
+fi
+if [[ "${DISABLE_HF_DATASETS_FILE_LOCK:-false}" == "true" ]]; then
+    EXTRA_FLAGS+=("--disable-hf-datasets-file-lock")
+fi
+if [[ -n "${PROMPTS}" ]]; then
+    EXTRA_FLAGS+=("--prompts" "${PROMPTS}")
+fi
+
+# Workaround for upstream Megatron-Bridge using the deprecated dataset id
+# `cnn_dailymail` (no namespace). Newer huggingface_hub requires `namespace/name`
+# and rejects the bare form with HfUriError. Rewrite to `abisee/cnn_dailymail`,
+# which is the canonical id and is cached under /hf-local/abisee/cnn_dailymail.
+#
+# The rewrite is applied in place and only when the bare id is still present,
+# so repeated runs are no-ops. A missing or unreadable file is skipped
+# silently; a file that needs the rewrite but is not writable produces a
+# warning, because the run is then expected to hit the HfUriError above.
+_UPSTREAM_QUANT=/opt/Megatron-Bridge/examples/quantization/quantize.py
+if grep -q 'load_dataset("cnn_dailymail"' "${_UPSTREAM_QUANT}" 2>/dev/null; then
+    if [[ -w "${_UPSTREAM_QUANT}" ]]; then
+        sed -i 's|load_dataset("cnn_dailymail"|load_dataset("abisee/cnn_dailymail"|g' "${_UPSTREAM_QUANT}"
+    else
+        echo "[WARN] ${_UPSTREAM_QUANT} still uses the bare 'cnn_dailymail' dataset id but is not writable; dataset loading may fail with HfUriError" >&2
+    fi
+fi
+
+# quantize.py imports `quantize_utils` as a sibling module, so it has to be
+# launched from the directory that holds both files.
+cd /opt/Megatron-Bridge/examples/quantization
+
+# Banner summarising the run configuration.
+printf '%s\n' \
+    "=== Quantizing ${HF_MODEL_ID} with ${EXPORT_QUANT_CFG} (TP=${TP} PP=${PP} EP=${EP} ETP=${ETP}, ${NPROC_PER_NODE} GPUs) ===" \
+    "    save -> ${MEGATRON_SAVE_PATH}"
+
+# Assemble the torchrun invocation as an array so every argument stays a
+# single word; caller-supplied positional args go last.
+launch_cmd=(
+    python -m torch.distributed.run
+    --nproc_per_node "${NPROC_PER_NODE}"
+    "${RDZV_ARGS[@]}"
+    quantize.py
+    --hf-model-id "${HF_MODEL_ID}"
+    --export-quant-cfg "${EXPORT_QUANT_CFG}"
+    --megatron-save-path "${MEGATRON_SAVE_PATH}"
+    --tp "${TP}"
+    --pp "${PP}"
+    --ep "${EP}"
+    --etp "${ETP}"
+    --calib-size "${CALIB_SIZE}"
+    "${EXTRA_FLAGS[@]}"
+    "$@"
+)
+# Replace this shell with the launcher so its exit status is the script's.
+exec "${launch_cmd[@]}"
@@ -0,0 +1,43 @@
+# Megatron-Bridge HF export for nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 (NVFP4).
+#
+# Converts the quantized MCore ckpt (from megatron_bridge_ptq.yaml) to a deployable
+# HuggingFace checkpoint. The HF exporter does not gather TP-sharded weights — TP=1
+# is required, PP is used to shard the model across GPUs.
+#
+# Cluster shape: 2 nodes x 4 GPUs = 8 ranks (PP=8, EP=1, TP=1).
+#
+# Usage:
+#   source .env-slurm
+#   cd tools/launcher
+#   uv run launch.py --yaml examples/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/megatron_bridge_export.yaml --yes
+
+job_name: Nemotron-3-Nano_megatron_bridge_export
+pipeline:
+  skip: false
+  allow_to_fail: false
+  note: "Megatron-Bridge HF export for Nano-30B-A3B NVFP4: 2 nodes x 4 GPUs, TP=1 PP=8"
+
+  global_vars:
+    output_dir: /cicd/megatron-bridge
+
+  task_0:
+    script: common/megatron_bridge/export/export.sh
+    environment:
+      - HF_MODEL_ID: /hf-local/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
+      - MEGATRON_LOAD_PATH: <<global_vars.output_dir>>/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4-MLM
+      - EXPORT_DIR: /cicd/export/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4-HF
+      # TP stays at 1 because the HF exporter does not gather TP-sharded
+      # weights; PP=8 spreads the model over the 2 x 4 GPUs instead.
+      - TP: "1"
+      - PP: "8"
+      - EP: "1"
+      - ETP: "1"
+      - DTYPE: bfloat16
+      - TRUST_REMOTE_CODE: "true"
+    slurm_config:
+      _factory_: "slurm_factory"
+      container: nvcr.io/nvidia/nemo:26.04
+      modelopt_install_path: /opt/venv/lib/python3.12/site-packages/modelopt
+      partition: batch
+      nodes: 2
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      time: "02:00:00"
@@ -0,0 +1,44 @@
+# Megatron-Bridge generation on quantized nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.
+#
+# Mirrors the user-provided reference command:
+#   torchrun --nproc_per_node 8 examples/quantization/ptq_generate.py \
+#     --megatron-load-path /models/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4-MLM \
+#     --hf-model-id /models/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \
+#     --trust-remote-code --tp 8 --ep 8
+#
+# Cluster shape: 2 nodes x 4 GPUs = 8 ranks.
+# Loads the MCore ckpt written by megatron_bridge_ptq.yaml (same OUTPUT_DIR).
+#
+# Usage:
+#   source .env-slurm
+#   cd tools/launcher
+#   uv run launch.py --yaml examples/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/megatron_bridge_generate.yaml --yes
+
+job_name: Nemotron-3-Nano_megatron_bridge_generate
+pipeline:
+  skip: false
+  allow_to_fail: false
+  note: "Megatron-Bridge PTQ generate on Nano-30B-A3B (nvfp4): 2 nodes x 4 GPUs, TP=8 EP=8"
+
+  global_vars:
+    # Same output_dir as the PTQ job, so the checkpoint path below resolves to
+    # the MCore checkpoint that job wrote.
+    output_dir: /cicd/megatron-bridge
+
+  task_0:
+    script: common/megatron_bridge/generate/generate.sh
+    # Environment variables read by generate.sh.
+    environment:
+      - HF_MODEL_ID: /hf-local/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
+      - MEGATRON_LOAD_PATH: <<global_vars.output_dir>>/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4-MLM
+      # TP=8 / EP=8 mirror the reference ptq_generate.py command (--tp 8 --ep 8).
+      - TP: "8"
+      - PP: "1"
+      - EP: "8"
+      - ETP: "1"
+      - TRUST_REMOTE_CODE: "true"
+    slurm_config:
+      _factory_: "slurm_factory"
+      container: nvcr.io/nvidia/nemo:26.04
+      modelopt_install_path: /opt/venv/lib/python3.12/site-packages/modelopt
+      partition: batch
+      # 2 nodes x 4 GPUs = 8 ranks.
+      nodes: 2
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      time: "01:00:00"