From 3069868758668a20f3cbf9787dfd3fb618981a9a Mon Sep 17 00:00:00 2001 From: weimingc <17592131+meenchen@users.noreply.github.com> Date: Fri, 19 Dec 2025 11:43:42 -0800 Subject: [PATCH 01/16] qad, dataset, scripts Signed-off-by: Wei-Ming Chen Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com> --- examples/llm_qad/configs/README.md | 114 +++ .../qwen3-30b-a3b-instruct-2507-moe.conf | 56 ++ .../llm_qad/configs/qwen3-30b-a3b-moe.conf | 55 ++ .../qwen3-30b-a3b-thinking-2507-moe.conf | 56 ++ .../llm_qad/configs/qwen3-8b-default.conf | 53 + .../llm_qad/configs/qwen3-8b-nemotron.conf | 53 + examples/llm_qad/configs/template.conf | 124 +++ .../data_utils/download_nemotron_v1.py | 536 ++++++++++ .../data_utils/download_nemotron_v2.py | 544 ++++++++++ .../data_utils/download_openscience.py | 137 +++ .../data_utils/process_all_datasets.sh | 70 ++ .../data_utils/process_nemotron_qwen3-8B.sh | 105 ++ .../process_nemotron_v1_qwen3-8B.sh | 127 +++ .../process_nemotron_v2_qwen3-8B.sh | 137 +++ .../process_openscience_qwen3-8B.sh | 117 +++ .../data_utils/process_slimorca_qwen3-8B.sh | 67 ++ examples/llm_qad/qwen_qad.sh | 932 ++++++++++++++++++ examples/llm_qad/sbatch_qwen_qad.sh | 276 ++++++ 18 files changed, 3559 insertions(+) create mode 100644 examples/llm_qad/configs/README.md create mode 100644 examples/llm_qad/configs/qwen3-30b-a3b-instruct-2507-moe.conf create mode 100644 examples/llm_qad/configs/qwen3-30b-a3b-moe.conf create mode 100644 examples/llm_qad/configs/qwen3-30b-a3b-thinking-2507-moe.conf create mode 100644 examples/llm_qad/configs/qwen3-8b-default.conf create mode 100644 examples/llm_qad/configs/qwen3-8b-nemotron.conf create mode 100644 examples/llm_qad/configs/template.conf create mode 100644 examples/llm_qad/data_utils/download_nemotron_v1.py create mode 100644 examples/llm_qad/data_utils/download_nemotron_v2.py create mode 100644 examples/llm_qad/data_utils/download_openscience.py create mode 100644 examples/llm_qad/data_utils/process_all_datasets.sh create mode 100644 examples/llm_qad/data_utils/process_nemotron_qwen3-8B.sh create mode 100644 examples/llm_qad/data_utils/process_nemotron_v1_qwen3-8B.sh create mode 100644 examples/llm_qad/data_utils/process_nemotron_v2_qwen3-8B.sh create mode 100644 examples/llm_qad/data_utils/process_openscience_qwen3-8B.sh create mode 100644 examples/llm_qad/data_utils/process_slimorca_qwen3-8B.sh create mode 100644 examples/llm_qad/qwen_qad.sh create mode 100755 examples/llm_qad/sbatch_qwen_qad.sh diff --git a/examples/llm_qad/configs/README.md b/examples/llm_qad/configs/README.md new file mode 100644 index 000000000..6a70a69ad --- /dev/null +++ b/examples/llm_qad/configs/README.md @@ -0,0 +1,114 @@ +# QAD Training Configuration Files + +Configuration files for QAD (Quantization-Aware Distillation) training. +Works with both `sbatch_qwen_qad.sh` (SLURM) and `qwen_qad.sh` (Docker/Interactive). 
+ +## Quick Start + +### SLURM Batch Mode +```bash +sbatch --nodes=4 -t 4:00:00 sbatch_qwen_qad.sh --config configs/qwen3-8b-default.conf +sbatch --nodes=4 -t 8:00:00 sbatch_qwen_qad.sh --config configs/qwen3-8b-nemotron.conf +sbatch --nodes=8 -t 8:00:00 sbatch_qwen_qad.sh --config configs/qwen3-30b-a3b-moe.conf +``` + +### Docker/Interactive Mode +```bash +bash qwen_qad.sh --config configs/qwen3-8b-default.conf +bash qwen_qad.sh --config configs/qwen3-8b-nemotron.conf + +# Override config values +LR=1e-5 bash qwen_qad.sh --config configs/qwen3-8b-default.conf +``` + +## Available Configs + +| Config | Model | Dataset | Recommended SLURM | +|--------|-------|---------|-------------------| +| `qwen3-8b-default.conf` | Qwen3-8B | openscience | `--nodes=4 -t 4:00:00` | +| `qwen3-8b-nemotron.conf` | Qwen3-8B | nemotron | `--nodes=4 -t 8:00:00` | +| `qwen3-30b-a3b-moe.conf` | Qwen3-30B-A3B | nemotron | `--nodes=8 -t 8:00:00` | + +## Creating Custom Configs + +```bash +cp configs/template.conf configs/my-experiment.conf +# Edit my-experiment.conf (set STUDENT_CKPT and TEACHER_CKPT) +sbatch --nodes=4 -t 4:00:00 sbatch_qwen_qad.sh --config configs/my-experiment.conf +``` + +## Configuration Variables + +### Model +| Variable | Description | Required | +|----------|-------------|----------| +| `STUDENT_MODEL` | Student model architecture | Yes | +| `TEACHER_MODEL` | Teacher model architecture | Yes | + +### Checkpoints (REQUIRED) +| Variable | Description | Required | +|----------|-------------|----------| +| `STUDENT_CKPT` | Path to student checkpoint (FP4 for QAD) | **Yes** | +| `TEACHER_CKPT` | Path to teacher checkpoint (BF16) | **Yes (QAD)** | +| `TEACHER_MODEL_CONFIG` | Path to teacher config YAML | No (auto) | + +### Training +| Variable | Description | Default | +|----------|-------------|---------| +| `LR` | Learning rate | 1e-6 | +| `DATASET_NAME` | Dataset to use | openscience | +| `KD_CFG_PATH` | Custom KD config YAML | (empty) | +| `TRAIN_SAMPLES` | Override sample count | (auto) | + +### Parallelism +| Variable | Description | Default | +|----------|-------------|---------| +| `TP_SIZE` | Tensor parallelism | 8 | +| `PP_SIZE` | Pipeline parallelism | 1 | +| `EP_SIZE` | Expert parallelism (MoE) | 1 | +| `MBS` | Micro-batch size | 16 | +| `NUM_GPUS` | GPUs per node | 8 | +| `MASTER_PORT` | Distributed port | 29500 | + +### Paths +| Variable | Description | +|----------|-------------| +| `MLM_DIR` | Megatron-LM directory | +| `MODELOPT_DIR` | ModelOpt directory | +| `MODELS_ROOT` | Model checkpoints root | +| `QAD_CHECKPOINT_ROOT` | Output root | +| `DATACACHE_DIR` | Data cache | + +### Container +| Variable | Description | +|----------|-------------| +| `CONTAINER_IMAGE` | Container squashfs | +| `CONTAINER_MOUNTS` | Mount points | +| `CONTAINER_WORKDIR` | Working directory | + +## Output Directory Naming + +Output directories are named using the checkpoint directory names: +``` +{QAD_CHECKPOINT_ROOT}/{STUDENT_CKPT_NAME}-Teacher-{TEACHER_CKPT_NAME}-Data-{DATASET}-lr{LR}/ +``` + +Example: +``` +/checkpoints/Qwen3-8B-NVFP4-TP8-MLM-Teacher-Qwen3-8B-TP8-MLM-Data-nemotron-lr1e-6/ +``` + +## SLURM Options + +SLURM parameters should be passed via `sbatch` command: + +```bash +sbatch --nodes=4 -t 4:00:00 sbatch_qwen_qad.sh --config ... +sbatch --nodes=8 -t 8:00:00 -p batch -A myaccount sbatch_qwen_qad.sh --config ... 
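+# Environment variables take precedence over config-file values (see "Variable
+# Priority" below), so one-off overrides can be passed at submission time.
+# Illustrative sketch only; it assumes sbatch's default --export=ALL so the
+# variables reach the job environment:
+LR=5e-6 DATASET_NAME=nemotron sbatch --nodes=4 -t 4:00:00 sbatch_qwen_qad.sh --config configs/qwen3-8b-default.conf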
+``` + +## Variable Priority + +``` +Script defaults < Config file < Environment variables < Command line args +``` diff --git a/examples/llm_qad/configs/qwen3-30b-a3b-instruct-2507-moe.conf b/examples/llm_qad/configs/qwen3-30b-a3b-instruct-2507-moe.conf new file mode 100644 index 000000000..d5449d431 --- /dev/null +++ b/examples/llm_qad/configs/qwen3-30b-a3b-instruct-2507-moe.conf @@ -0,0 +1,56 @@ +#!/bin/bash +######################################################## +# QAD Configuration: Qwen3-30B-A3B Instruct (MoE) +# Mixture of Experts - requires more resources +# +# Usage: +# sbatch sbatch_qwen_qad.sh --config configs/qwen3-30b-a3b-instruct-2507-moe.conf +######################################################## + +######################################################## +# MODEL +######################################################## +export STUDENT_MODEL="Qwen3-30B-A3B" +export TEACHER_MODEL="Qwen3-30B-A3B" + +######################################################## +# CHECKPOINTS (REQUIRED) +######################################################## +# export STUDENT_CKPT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-30B-A3B-Instruct-2507-NVFP4-TP4-MLM" +export STUDENT_CKPT="/home/scratch.weimingc_sw/models/modelopt_artifacts/Qwen3-30B-A3B-Instruct-2507-NVFP4-cnn_nemotron_calib-TP4-MLM" +export TEACHER_CKPT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-30B-A3B-Instruct-2507-TP4-MLM" +export TEACHER_MODEL_CONFIG="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-30B-A3B-Instruct-2507-teacher.yaml" + +######################################################## +# TRAINING +######################################################## +export LR="1e-6" +export DATASET_NAME="nemotron_v2_code" #"nemotron_v2_code" +# export KD_CFG_PATH="" # Optional: path to custom KD config YAML + +######################################################## +# PARALLELISM (MoE specific) +# Note: QAD loads both student + teacher models, requires more memory +######################################################## +export TP_SIZE=4 +export PP_SIZE=1 +export EP_SIZE=4 # Expert parallelism for 128 experts +export MBS=4 +export NUM_GPUS=8 +export MASTER_PORT=29500 + +######################################################## +# PATHS +######################################################## +export MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" +export MODELOPT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer" +export MODELS_ROOT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models" +export QAD_CHECKPOINT_ROOT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/checkpoints" +export DATACACHE_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/data_cache" + +######################################################## +# CONTAINER +######################################################## +export CONTAINER_IMAGE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/containers/pytorch_25.06-py3.sqsh" +export CONTAINER_MOUNTS="/lustre/fsw:/lustre/fsw" +export CONTAINER_WORKDIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer/examples/llm_qad" diff --git a/examples/llm_qad/configs/qwen3-30b-a3b-moe.conf b/examples/llm_qad/configs/qwen3-30b-a3b-moe.conf new file mode 100644 index 000000000..7b8941ce2 --- /dev/null +++ b/examples/llm_qad/configs/qwen3-30b-a3b-moe.conf @@ -0,0 +1,55 @@ +#!/bin/bash +######################################################## +# QAD Configuration: Qwen3-30B-A3B (MoE) +# Mixture of Experts - requires 
more resources +# +# Usage: +# sbatch --nodes=8 -t 8:00:00 sbatch_qwen_qad.sh --config configs/qwen3-30b-a3b-moe.conf +######################################################## + +######################################################## +# MODEL +######################################################## +export STUDENT_MODEL="Qwen3-30B-A3B" +export TEACHER_MODEL="Qwen3-30B-A3B" + +######################################################## +# CHECKPOINTS (REQUIRED) +######################################################## +export STUDENT_CKPT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-30B-A3B-NVFP4-TP4-MLM" +export TEACHER_CKPT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-30B-A3B-TP4-MLM" +export TEACHER_MODEL_CONFIG="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-30B-A3B-teacher.yaml" + +######################################################## +# TRAINING +######################################################## +export LR="1e-6" +export DATASET_NAME="nemotron" + +######################################################## +# PARALLELISM (MoE specific) +# Qwen3-30B-A3B: TP=4 max (num-query-groups = 4) +# With 8 GPUs: TP=4 × EP=2 = 8 GPUs total +######################################################## +export TP_SIZE=4 +export PP_SIZE=1 +export EP_SIZE=2 # Expert parallelism: TP×EP = 4×2 = 8 GPUs +export MBS=16 # Smaller batch for larger model +export NUM_GPUS=8 +export MASTER_PORT=29500 + +######################################################## +# PATHS +######################################################## +export MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" +export MODELOPT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer" +export MODELS_ROOT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models" +export QAD_CHECKPOINT_ROOT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/checkpoints" +export DATACACHE_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/data_cache" + +######################################################## +# CONTAINER +######################################################## +export CONTAINER_IMAGE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/containers/pytorch_25.06-py3.sqsh" +export CONTAINER_MOUNTS="/lustre/fsw:/lustre/fsw" +export CONTAINER_WORKDIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer/examples/llm_qad" diff --git a/examples/llm_qad/configs/qwen3-30b-a3b-thinking-2507-moe.conf b/examples/llm_qad/configs/qwen3-30b-a3b-thinking-2507-moe.conf new file mode 100644 index 000000000..0f76697be --- /dev/null +++ b/examples/llm_qad/configs/qwen3-30b-a3b-thinking-2507-moe.conf @@ -0,0 +1,56 @@ +#!/bin/bash +######################################################## +# QAD Configuration: Qwen3-30B-A3B Instruct (MoE) +# Mixture of Experts - requires more resources +# +# Usage: +# sbatch sbatch_qwen_qad.sh --config configs/qwen3-30b-a3b-instruct-2507-moe.conf +######################################################## + +######################################################## +# MODEL +######################################################## +export STUDENT_MODEL="Qwen3-30B-A3B" +export TEACHER_MODEL="Qwen3-30B-A3B" + +######################################################## +# CHECKPOINTS (REQUIRED) +######################################################## +export STUDENT_CKPT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-30B-A3B-Thinking-2507-NVFP4-TP4-MLM" +export 
TEACHER_CKPT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-30B-A3B-Thinking-2507-TP4-MLM" +export TEACHER_MODEL_CONFIG="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-30B-A3B-Thinking-2507-teacher.yaml" + +######################################################## +# TRAINING +######################################################## +export LR="1e-6" +export DATASET_NAME="combined_v2_cot_chat" +# export KD_CFG_PATH="" # Optional: path to custom KD config YAML + +######################################################## +# PARALLELISM (MoE specific) +# Note: QAD loads both student + teacher models, requires more memory +######################################################## +export TP_SIZE=4 +export PP_SIZE=1 +export EP_SIZE=4 # Expert parallelism for 128 experts +export MBS=4 # MBS=4 for H100x8 +export NUM_GPUS=8 +export MASTER_PORT=29500 + +######################################################## +# PATHS +######################################################## +export MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" +export MODELOPT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer" +export MODELS_ROOT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models" +export QAD_CHECKPOINT_ROOT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/checkpoints" +export DATACACHE_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/data_cache" + +######################################################## +# CONTAINER +######################################################## +export CONTAINER_IMAGE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/containers/pytorch_25.06-py3.sqsh" +export CONTAINER_MOUNTS="/lustre/fsw:/lustre/fsw" +export CONTAINER_WORKDIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer/examples/llm_qad" + diff --git a/examples/llm_qad/configs/qwen3-8b-default.conf b/examples/llm_qad/configs/qwen3-8b-default.conf new file mode 100644 index 000000000..eedf0f00d --- /dev/null +++ b/examples/llm_qad/configs/qwen3-8b-default.conf @@ -0,0 +1,53 @@ +#!/bin/bash +######################################################## +# QAD Configuration: Qwen3-8B Default (OpenScience) +# +# Usage: +# sbatch --nodes=4 -t 4:00:00 sbatch_qwen_qad.sh --config configs/qwen3-8b-default.conf +######################################################## + +######################################################## +# MODEL +######################################################## +export STUDENT_MODEL="Qwen3-8B" +export TEACHER_MODEL="Qwen3-8B" + +######################################################## +# CHECKPOINTS (REQUIRED) +######################################################## +export STUDENT_CKPT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-8B-NVFP4-TP8-MLM" +export TEACHER_CKPT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-8B-TP8-MLM" +export TEACHER_MODEL_CONFIG="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-8B-teacher.yaml" + +######################################################## +# TRAINING +######################################################## +export LR="1e-6" +export DATASET_NAME="openscience" +# export KD_CFG_PATH="" # Optional: path to custom KD config YAML + +######################################################## +# PARALLELISM +######################################################## +export TP_SIZE=8 +export PP_SIZE=1 +export EP_SIZE=1 +export MBS=16 +export NUM_GPUS=8 +export MASTER_PORT=29500 + +######################################################## +# PATHS 
+######################################################## +export MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" +export MODELOPT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer" +export MODELS_ROOT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models" +export QAD_CHECKPOINT_ROOT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/checkpoints" +export DATACACHE_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/data_cache" + +######################################################## +# CONTAINER +######################################################## +export CONTAINER_IMAGE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/containers/pytorch_25.06-py3.sqsh" +export CONTAINER_MOUNTS="/lustre/fsw:/lustre/fsw" +export CONTAINER_WORKDIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer/examples/llm_qad" diff --git a/examples/llm_qad/configs/qwen3-8b-nemotron.conf b/examples/llm_qad/configs/qwen3-8b-nemotron.conf new file mode 100644 index 000000000..052c6fe7d --- /dev/null +++ b/examples/llm_qad/configs/qwen3-8b-nemotron.conf @@ -0,0 +1,53 @@ +#!/bin/bash +######################################################## +# QAD Configuration: Qwen3-8B with Nemotron-v1 Dataset +# Best for MMLU accuracy improvement +# +# Usage: +# sbatch --nodes=4 -t 8:00:00 sbatch_qwen_qad.sh --config configs/qwen3-8b-nemotron.conf +######################################################## + +######################################################## +# MODEL +######################################################## +export STUDENT_MODEL="Qwen3-8B" +export TEACHER_MODEL="Qwen3-8B" + +######################################################## +# CHECKPOINTS (REQUIRED) +######################################################## +export STUDENT_CKPT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-8B-NVFP4-TP8-MLM" +export TEACHER_CKPT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-8B-TP8-MLM" +export TEACHER_MODEL_CONFIG="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models/Qwen3-8B-teacher.yaml" + +######################################################## +# TRAINING +######################################################## +export LR="1e-6" +export DATASET_NAME="nemotron" # Nemotron-v1 @ 30% (~7.5M samples) + +######################################################## +# PARALLELISM +######################################################## +export TP_SIZE=8 +export PP_SIZE=1 +export EP_SIZE=1 +export MBS=16 +export NUM_GPUS=8 +export MASTER_PORT=29500 + +######################################################## +# PATHS +######################################################## +export MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" +export MODELOPT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer" +export MODELS_ROOT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models" +export QAD_CHECKPOINT_ROOT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/checkpoints" +export DATACACHE_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/data_cache" + +######################################################## +# CONTAINER +######################################################## +export CONTAINER_IMAGE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/containers/pytorch_25.06-py3.sqsh" +export CONTAINER_MOUNTS="/lustre/fsw:/lustre/fsw" +export CONTAINER_WORKDIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer/examples/llm_qad" diff --git 
a/examples/llm_qad/configs/template.conf b/examples/llm_qad/configs/template.conf new file mode 100644 index 000000000..814a9de02 --- /dev/null +++ b/examples/llm_qad/configs/template.conf @@ -0,0 +1,124 @@ +#!/bin/bash +######################################################## +# QAD Training Configuration Template +######################################################## +# Copy this file and modify for your experiment: +# cp template.conf my-experiment.conf +# +# Then run: +# sbatch --nodes=4 -t 4:00:00 sbatch_qwen_qad.sh --config configs/my-experiment.conf +######################################################## + +######################################################## +# MODEL CONFIGURATION +######################################################## + +# Student model architecture name +export STUDENT_MODEL="Qwen3-8B" + +# Teacher model architecture name +export TEACHER_MODEL="Qwen3-8B" + +# Model architecture config file (optional - auto-detected from STUDENT_MODEL) +# export STUDENT_CONFIG_FILE="/path/to/Megatron-LM/.../conf/Qwen/Qwen3-8B.sh" + +######################################################## +# CHECKPOINTS (REQUIRED) +######################################################## + +# Student checkpoint path (REQUIRED) +# This is the FP4 quantized checkpoint for QAD training +export STUDENT_CKPT="/path/to/student/checkpoint" + +# Teacher checkpoint path (REQUIRED for QAD mode) +# This is the BF16 teacher model for knowledge distillation +export TEACHER_CKPT="/path/to/teacher/checkpoint" + +# Teacher model config YAML (REQUIRED) +# Contains: num_layers, hidden_size, num_attention_heads, ffn_hidden_size +export TEACHER_MODEL_CONFIG="/path/to/teacher.yaml" + +######################################################## +# TRAINING CONFIGURATION +######################################################## + +# Learning rate +export LR="1e-6" + +# Dataset name (selects from predefined datablends) +# Options: openscience, nemotron, nemotron_v2, combined, slimorca +export DATASET_NAME="openscience" + +# Training samples (leave empty to use dataset default) +# export TRAIN_SAMPLES="" + +# KD config file path (optional - for custom distillation settings) +# export KD_CFG_PATH="/path/to/kd_config.yaml" + +######################################################## +# PARALLELISM CONFIGURATION +######################################################## + +# Tensor Parallelism (must match checkpoint TP) +export TP_SIZE=8 + +# Pipeline Parallelism (increase for larger models) +export PP_SIZE=1 + +# Expert Parallelism (for MoE models) +export EP_SIZE=1 + +# Micro-batch size per GPU +export MBS=16 + +# Number of GPUs per node +export NUM_GPUS=8 + +# Master port for distributed training +export MASTER_PORT=29500 + +######################################################## +# ROOT PATHS +######################################################## + +# Megatron-LM source directory +export MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" + +# ModelOpt source directory +export MODELOPT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer" + +# Root directory containing model checkpoints +export MODELS_ROOT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models" + +# Root directory for training outputs +export QAD_CHECKPOINT_ROOT="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/checkpoints" + +# Data cache directory +export DATACACHE_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/data_cache" + +######################################################## +# DATASET 
PATHS (optional) +######################################################## + +# Custom datablend JSON path (overrides DATASET_NAME) +# export BLEND_PATH="/path/to/datablend.json" + +######################################################## +# CONTAINER CONFIGURATION +######################################################## + +export CONTAINER_IMAGE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/containers/pytorch_25.06-py3.sqsh" +export CONTAINER_MOUNTS="/lustre/fsw:/lustre/fsw" +export CONTAINER_WORKDIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer/examples/llm_qad" + +######################################################## +# ADVANCED OPTIONS +######################################################## + +# HuggingFace token for accessing gated models (avoids rate limiting) +# Recommended: pass via --hf-token arg to avoid logging +# Example: sbatch sbatch_qwen_qad.sh --hf-token hf_xxx --config ... +# Or set via environment: export HF_TOKEN="hf_xxx" + +# Iterations to skip (comma-separated) +# export ITERATIONS_TO_SKIP="" diff --git a/examples/llm_qad/data_utils/download_nemotron_v1.py b/examples/llm_qad/data_utils/download_nemotron_v1.py new file mode 100644 index 000000000..ae77ae4bf --- /dev/null +++ b/examples/llm_qad/data_utils/download_nemotron_v1.py @@ -0,0 +1,536 @@ +#!/usr/bin/env python3 +""" +Download and preprocess NVIDIA Nemotron-Post-Training-Dataset-v1 for QAD training. + +This dataset contains high-quality reasoning data generated by DeepSeek-R1 and Qwen3-235B, +which is excellent for improving MMLU and reasoning capabilities. + +Splits available: +- stem: 20.6M samples (science, reasoning, humanities) - BEST for MMLU +- math: 2.0M samples (step-by-step math solutions) +- code: 1.9M samples (programming challenges) +- chat: 746K samples (conversational tuning) +- tool_calling: 310K samples (function calling) + +Usage: + # Download all splits to separate folders (recommended) + python download_nemotron_v1.py --sample-percent 30 --tokenizer Qwen/Qwen3-8B --include-reasoning + + # Download specific splits + python download_nemotron_v1.py --splits stem,math --sample-percent 30 --include-reasoning + + # Combined mode (legacy - all splits in one file) + python download_nemotron_v1.py --sample-percent 30 --combined + +Output structure (split mode - default): + nemotron_v1/ + ├── stem/ + │ ├── stem_30pct_cot_chat_train.jsonl + │ ├── stem_30pct_cot_chat_validation.jsonl + │ └── stem_30pct_cot_chat_test.jsonl + ├── math/ + │ └── ... + └── metadata.json + +Output structure (combined mode): + nemotron_v1/ + ├── nemotron_all_30pct_cot_chat_train.jsonl + └── ... 
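+
+Datablend format (sketch):
+    Each *_train.jsonl line holds a single {"text": "..."} record. After Megatron
+    preprocessing, create_datablend_configs() emits blend JSONs that pair weights
+    with index prefixes ([w1, path1, w2, path2, ...] for the combined blend).
+    Per-split example, with illustrative paths:
+        {"train": [1.0, ".../stem/stem_30pct_cot_chat_train_text_document"],
+         "valid": [1.0, ".../stem/stem_30pct_cot_chat_validation_text_document"],
+         "test": [1.0, ".../stem/stem_30pct_cot_chat_test_text_document"]}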
+""" + +import argparse +import json +import os +import random +from datasets import load_dataset +from tqdm import tqdm + +DEFAULT_OUTPUT_DIR = "/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v1" +DATABLEND_DIR = "/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets" + +# Available splits and their sizes +AVAILABLE_SPLITS = { + "stem": 20662167, # Best for MMLU - science, reasoning, humanities + "math": 2044407, # Math reasoning + "code": 1896395, # Code challenges + "chat": 746622, # Conversational + "tool_calling": 310051 # Function calling +} + +# Train/valid/test split ratios +TRAIN_RATIO = 0.95 +VALID_RATIO = 0.025 +TEST_RATIO = 0.025 +RANDOM_SEED = 42 + +# Global tokenizer for chat template (initialized if --tokenizer is provided) +_TOKENIZER = None + + +def init_tokenizer(tokenizer_name: str): + """Initialize tokenizer for chat template formatting.""" + global _TOKENIZER + if tokenizer_name: + from transformers import AutoTokenizer + print(f"📝 Loading tokenizer for chat template: {tokenizer_name}") + _TOKENIZER = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True) + + # Show example + example = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}] + formatted = _TOKENIZER.apply_chat_template(example, tokenize=False) + print(f" Example format:\n {formatted[:200]}...") + + +def format_messages_to_text(messages: list, reasoning: str = None) -> str: + """Convert messages format to text for QAD training. + + If a tokenizer is initialized, uses its chat template. + Otherwise, uses simple role-based formatting. + """ + global _TOKENIZER + + # Optionally prepend reasoning/chain-of-thought + if reasoning and reasoning.strip(): + # Insert thinking block before last assistant message + messages_with_cot = [] + for i, msg in enumerate(messages): + if msg.get("role") == "assistant" and i == len(messages) - 1: + thinking_content = f"\n{reasoning}\n\n{msg.get('content', '')}" + messages_with_cot.append({"role": "assistant", "content": thinking_content}) + else: + messages_with_cot.append(msg) + messages = messages_with_cot + + # Use chat template if tokenizer is available + if _TOKENIZER is not None: + try: + return _TOKENIZER.apply_chat_template(messages, tokenize=False) + except Exception as e: + print(f"Warning: Chat template failed, using simple format: {e}") + + # Fallback: simple role-based format + text_parts = [] + for msg in messages: + role = msg.get("role", "") + content = msg.get("content", "") + + if role == "system": + text_parts.append(f"System: {content}") + elif role == "user": + text_parts.append(f"User: {content}") + elif role == "assistant": + text_parts.append(f"Assistant: {content}") + + return "\n\n".join(text_parts) + + +def download_split(split_name: str, max_samples: int, output_dir: str, + suffix: str, include_reasoning: bool = False) -> dict: + """Download a single split and save to its own folder. + + Returns dict with sample counts for each partition. 
+ """ + print(f"\n📥 Downloading split: {split_name} (target: {max_samples:,} samples)") + + # Create split-specific directory + split_dir = os.path.join(output_dir, split_name) + os.makedirs(split_dir, exist_ok=True) + + try: + # Load the specific split with streaming + dataset = load_dataset( + "nvidia/Nemotron-Post-Training-Dataset-v1", + split=split_name, + streaming=True + ) + + all_examples = [] + count = 0 + + for example in tqdm(dataset, desc=f"Processing {split_name}", total=max_samples): + if count >= max_samples: + break + + messages = example.get("messages", []) + reasoning = example.get("reasoning", "") if include_reasoning else "" + + # Convert to text format + text = format_messages_to_text(messages, reasoning) + + if text.strip(): + all_examples.append({ + "text": text, + "category": example.get("category", split_name), + }) + count += 1 + + print(f"✓ Collected {count:,} examples from {split_name}") + + if not all_examples: + print(f"Warning: No examples collected for {split_name}") + return {"train": 0, "validation": 0, "test": 0} + + # Shuffle and split into train/valid/test + random.seed(RANDOM_SEED) + random.shuffle(all_examples) + + total_size = len(all_examples) + train_end = int(total_size * TRAIN_RATIO) + valid_end = train_end + int(total_size * VALID_RATIO) + + partitions = { + 'train': all_examples[:train_end], + 'validation': all_examples[train_end:valid_end], + 'test': all_examples[valid_end:] + } + + # Save each partition + counts = {} + for part_name, part_data in partitions.items(): + output_file = os.path.join(split_dir, f"{split_name}_{suffix}_{part_name}.jsonl") + + with open(output_file, 'w', encoding='utf-8') as f: + for example in part_data: + json_line = json.dumps({"text": example["text"]}, ensure_ascii=False) + f.write(json_line + '\n') + + counts[part_name] = len(part_data) + print(f" ✓ {part_name}: {len(part_data):,} samples → {output_file}") + + return counts + + except Exception as e: + print(f"Error loading {split_name}: {e}") + return {"train": 0, "validation": 0, "test": 0} + + +def create_datablend_configs(output_dir: str, splits_downloaded: list, suffix: str, + sample_counts: dict): + """Create datablend JSON configs for each split and combined.""" + preprocessed_dir = output_dir.replace("nemotron_v1", "nemotron_v1_preprocessed") + + # Create individual datablend for each split + for split_name in splits_downloaded: + blend_file = os.path.join(DATABLEND_DIR, f"datablend_nemotron_v1_{split_name}_{suffix}.json") + blend_config = { + "train": [1.0, f"{preprocessed_dir}/{split_name}/{split_name}_{suffix}_train_text_document"], + "valid": [1.0, f"{preprocessed_dir}/{split_name}/{split_name}_{suffix}_validation_text_document"], + "test": [1.0, f"{preprocessed_dir}/{split_name}/{split_name}_{suffix}_test_text_document"] + } + + with open(blend_file, 'w') as f: + json.dump(blend_config, f, indent=2) + print(f"📝 Created: {blend_file}") + + # Create combined datablend (all English splits) + if len(splits_downloaded) > 1: + english_splits = [s for s in splits_downloaded if s in ["stem", "math", "code", "chat"]] + if english_splits: + # Calculate weights based on sample counts + total_samples = sum(sample_counts.get(s, {}).get("train", 0) for s in english_splits) + + blend_file = os.path.join(DATABLEND_DIR, f"datablend_nemotron_v1_all_en_{suffix}.json") + + train_entries = [] + valid_entries = [] + test_entries = [] + + for split_name in english_splits: + split_count = sample_counts.get(split_name, {}).get("train", 0) + weight = split_count / 
total_samples if total_samples > 0 else 1.0 / len(english_splits) + + train_entries.extend([ + weight, + f"{preprocessed_dir}/{split_name}/{split_name}_{suffix}_train_text_document" + ]) + valid_entries.extend([ + weight, + f"{preprocessed_dir}/{split_name}/{split_name}_{suffix}_validation_text_document" + ]) + test_entries.extend([ + weight, + f"{preprocessed_dir}/{split_name}/{split_name}_{suffix}_test_text_document" + ]) + + blend_config = { + "train": train_entries, + "valid": valid_entries, + "test": test_entries + } + + with open(blend_file, 'w') as f: + json.dump(blend_config, f, indent=2) + print(f"📝 Created combined: {blend_file}") + + +def main(): + parser = argparse.ArgumentParser(description="Download Nemotron-v1 for QAD") + parser.add_argument("--output-dir", type=str, default=DEFAULT_OUTPUT_DIR, + help="Output directory for JSONL files") + parser.add_argument("--splits", type=str, default="stem,math,code,chat", + help="Comma-separated list of splits to download (stem,math,code,chat,tool_calling)") + parser.add_argument("--sample-percent", type=float, default=30.0, + help="Percentage of each split to use (1-100). E.g., 10 = 10%% of each split") + parser.add_argument("--max-samples", type=int, default=None, + help="Maximum samples per split (absolute cap, used if --sample-percent not set)") + parser.add_argument("--include-reasoning", action="store_true", + help="Include chain-of-thought reasoning in output") + parser.add_argument("--tokenizer", type=str, default=None, + help="HuggingFace tokenizer to use for chat template (e.g., Qwen/Qwen3-8B)") + parser.add_argument("--combined", action="store_true", + help="Legacy mode: combine all splits into single files instead of separate folders") + args = parser.parse_args() + + # Default to 30% if neither option is specified + if args.sample_percent is None and args.max_samples is None: + args.sample_percent = 30.0 + + # Initialize tokenizer if specified + if args.tokenizer: + init_tokenizer(args.tokenizer) + + output_dir = args.output_dir + os.makedirs(output_dir, exist_ok=True) + + splits_to_download = [s.strip() for s in args.splits.split(",")] + + # Build suffix string + if args.sample_percent is not None: + pct_str = f"{int(args.sample_percent)}pct" + else: + pct_str = "" + + cot_str = "_cot" if args.include_reasoning else "" + chat_str = "_chat" if args.tokenizer else "" + suffix = f"{pct_str}{cot_str}{chat_str}" + + print("=" * 70) + print("Downloading NVIDIA Nemotron-Post-Training-Dataset-v1") + print("=" * 70) + print(f"Mode: {'Combined (legacy)' if args.combined else 'Split (fine-grained)'}") + print(f"Splits: {splits_to_download}") + print(f"Sample percent: {args.sample_percent}%") + print(f"Include reasoning: {args.include_reasoning}") + print(f"Chat template: {args.tokenizer or 'None (simple format)'}") + print(f"Suffix: {suffix}") + print(f"Output directory: {output_dir}") + print("=" * 70) + + # Calculate samples per split (with 500K cap per split) + MAX_SAMPLES_PER_SPLIT = 500000 # Cap at 500K per split for manageable dataset size + samples_per_split = {} + for split_name in splits_to_download: + if split_name not in AVAILABLE_SPLITS: + continue + available = AVAILABLE_SPLITS[split_name] + if args.sample_percent is not None: + calculated = int(available * args.sample_percent / 100) + samples_per_split[split_name] = min(calculated, MAX_SAMPLES_PER_SPLIT) + else: + samples_per_split[split_name] = min(available, args.max_samples, MAX_SAMPLES_PER_SPLIT) + + print(f"\nExpected samples per split (capped at 
{MAX_SAMPLES_PER_SPLIT:,}):") + total_expected = 0 + for split_name, count in samples_per_split.items(): + available = AVAILABLE_SPLITS[split_name] + pct = count / available * 100 + capped = " (CAPPED)" if count == MAX_SAMPLES_PER_SPLIT else "" + print(f" {split_name}: {count:,} / {available:,} ({pct:.1f}%){capped}") + total_expected += count + print(f" Total expected: {total_expected:,}") + + if args.combined: + # Legacy combined mode + download_combined_mode(args, splits_to_download, samples_per_split, suffix) + else: + # New split mode (default) + sample_counts = {} + + for split_name in splits_to_download: + if split_name not in AVAILABLE_SPLITS: + print(f"Warning: Unknown split '{split_name}', skipping...") + continue + + max_samples = samples_per_split[split_name] + counts = download_split( + split_name=split_name, + max_samples=max_samples, + output_dir=output_dir, + suffix=suffix, + include_reasoning=args.include_reasoning + ) + sample_counts[split_name] = counts + + # Save metadata + metadata = { + "sample_percent": args.sample_percent, + "include_reasoning": args.include_reasoning, + "tokenizer": args.tokenizer, + "suffix": suffix, + "splits": {} + } + for split_name, counts in sample_counts.items(): + metadata["splits"][split_name] = { + "train": counts.get("train", 0), + "validation": counts.get("validation", 0), + "test": counts.get("test", 0), + "total": sum(counts.values()) + } + + metadata_file = os.path.join(output_dir, f"metadata_{suffix}.json") + with open(metadata_file, 'w') as f: + json.dump(metadata, f, indent=2) + print(f"\n📝 Saved metadata: {metadata_file}") + + # Create datablend configs + print("\n" + "=" * 70) + print("Creating datablend configs...") + create_datablend_configs(output_dir, list(sample_counts.keys()), suffix, sample_counts) + + # Print summary + print("\n" + "=" * 70) + print("✓ Nemotron-v1 download complete!") + print("=" * 70) + print(f"\nOutput structure:") + print(f" {output_dir}/") + for split_name in sample_counts.keys(): + print(f" ├── {split_name}/") + print(f" │ ├── {split_name}_{suffix}_train.jsonl") + print(f" │ ├── {split_name}_{suffix}_validation.jsonl") + print(f" │ └── {split_name}_{suffix}_test.jsonl") + print(f" └── metadata_{suffix}.json") + + print(f"\nSample counts:") + total_train = 0 + for split_name, counts in sample_counts.items(): + train_count = counts.get("train", 0) + total_train += train_count + print(f" {split_name}: {train_count:,} train samples") + print(f" Total: {total_train:,} train samples") + + print(f"\nNext steps:") + print(f"1. Preprocess each split:") + for split_name in sample_counts.keys(): + print(f" bash process_nemotron_v1_qwen3-8B.sh {split_name} {suffix}") + print(f"\n2. Use individual splits:") + print(f" DATASET_NAME=nemotron_v1_stem_{suffix} bash qwen_qad.sh ...") + print(f"\n3. 
Or use combined datablend:") + print(f" BLEND_PATH=datablend_nemotron_v1_all_en_{suffix}.json bash qwen_qad.sh ...") + print("=" * 70) + + +def download_combined_mode(args, splits_to_download, samples_per_split, suffix): + """Legacy combined mode - all splits in single files.""" + output_dir = args.output_dir + + all_examples = [] + + for split_name in splits_to_download: + if split_name not in AVAILABLE_SPLITS: + print(f"Warning: Unknown split '{split_name}', skipping...") + continue + + max_for_split = samples_per_split[split_name] + print(f"\n📥 Loading split: {split_name} (target: {max_for_split:,} samples)") + + try: + dataset = load_dataset( + "nvidia/Nemotron-Post-Training-Dataset-v1", + split=split_name, + streaming=True + ) + + count = 0 + for example in tqdm(dataset, desc=f"Processing {split_name}", total=max_for_split): + if count >= max_for_split: + break + + messages = example.get("messages", []) + reasoning = example.get("reasoning", "") if args.include_reasoning else "" + + text = format_messages_to_text(messages, reasoning) + + if text.strip(): + all_examples.append({ + "text": text, + "category": example.get("category", split_name), + "source": "nemotron_v1" + }) + count += 1 + + print(f"✓ Collected {count:,} examples from {split_name}") + + except Exception as e: + print(f"Error loading {split_name}: {e}") + continue + + if not all_examples: + print("Error: No examples collected!") + return + + print(f"\n📊 Total examples collected: {len(all_examples):,}") + + # Shuffle and split + random.seed(RANDOM_SEED) + random.shuffle(all_examples) + + total_size = len(all_examples) + train_end = int(total_size * TRAIN_RATIO) + valid_end = train_end + int(total_size * VALID_RATIO) + + splits = { + 'train': all_examples[:train_end], + 'validation': all_examples[train_end:valid_end], + 'test': all_examples[valid_end:] + } + + # Generate output filename + if set(splits_to_download) >= {"stem", "math", "code", "chat"}: + split_suffix = "all" + else: + split_suffix = "_".join(splits_to_download) + + full_suffix = f"_{suffix}" if suffix else "" + + # Save each split as JSONL + for split_name, split_data in splits.items(): + output_file = os.path.join(output_dir, f"nemotron_{split_suffix}{full_suffix}_{split_name}.jsonl") + print(f"\nWriting {output_file}...") + + with open(output_file, 'w', encoding='utf-8') as f: + for example in tqdm(split_data, desc=split_name): + json_line = json.dumps({"text": example["text"]}, ensure_ascii=False) + f.write(json_line + '\n') + + print(f"✓ Saved {split_name}") + + # Create datablend config + blend_file = os.path.join(DATABLEND_DIR, f"datablend_nemotron_{split_suffix}{full_suffix}.json") + + preprocessed_dir = output_dir.replace("nemotron_v1", "nemotron_v1_preprocessed") + blend_config = { + "train": [1.0, f"{preprocessed_dir}/nemotron_{split_suffix}{full_suffix}_train_text_document"], + "valid": [1.0, f"{preprocessed_dir}/nemotron_{split_suffix}{full_suffix}_validation_text_document"], + "test": [1.0, f"{preprocessed_dir}/nemotron_{split_suffix}{full_suffix}_test_text_document"] + } + + with open(blend_file, 'w') as f: + json.dump(blend_config, f, indent=2) + print(f"\n📝 Saved datablend config: {blend_file}") + + print("\n" + "=" * 70) + print("✓ Nemotron-v1 download complete (combined mode)!") + print(f"Output directory: {output_dir}") + print(f"\nDataset summary:") + print(f" Total samples: {len(all_examples):,}") + print(f" Train: {len(splits['train']):,}") + print(f" Validation: {len(splits['validation']):,}") + print(f" Test: 
{len(splits['test']):,}") + print("\nNext steps:") + print(f"1. Preprocess: bash process_nemotron_qwen3-8B.sh {split_suffix}{full_suffix}") + print(f"2. Run QAD: DATASET_NAME=nemotron_{split_suffix}{full_suffix} bash qwen_qad.sh ...") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/examples/llm_qad/data_utils/download_nemotron_v2.py b/examples/llm_qad/data_utils/download_nemotron_v2.py new file mode 100644 index 000000000..0699dd98b --- /dev/null +++ b/examples/llm_qad/data_utils/download_nemotron_v2.py @@ -0,0 +1,544 @@ +#!/usr/bin/env python3 +""" +Download and preprocess NVIDIA Nemotron-Post-Training-Dataset-v2 for QAD training. + +Each split is saved to its own folder for fine-grained control over datablends. + +Splits available: +- stem: Science, reasoning, humanities (English) +- math: Step-by-step math solutions (English) +- code: Programming challenges (English) +- chat: Conversational tuning (English) +- multilingual_ja: Japanese +- multilingual_de: German +- multilingual_it: Italian +- multilingual_es: Spanish +- multilingual_fr: French + +NOTE: This dataset is GATED. You need to: +1. Go to https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2 +2. Request access and wait for approval +3. Login with: huggingface-cli login + +Usage: + # Download all English splits (each to separate folder) + python download_nemotron_v2.py --sample-percent 30 + + # Download specific splits + python download_nemotron_v2.py --splits stem,math --sample-percent 50 + + # Include multilingual + python download_nemotron_v2.py --sample-percent 30 --include-multilingual + +Output structure: + nemotron_v2/ + ├── stem/ + │ ├── stem_30pct_train.jsonl + │ ├── stem_30pct_validation.jsonl + │ └── stem_30pct_test.jsonl + ├── math/ + │ ├── math_30pct_train.jsonl + │ └── ... + └── ... 
+ +Datablend configs: + datasets/ + ├── datablend_nemotron_v2_stem_30pct.json # Per-split configs + ├── datablend_nemotron_v2_math_30pct.json + └── datablend_nemotron_v2_all_en_30pct.json # Combined config +""" + +import argparse +import json +import os +from datasets import load_dataset, get_dataset_config_names, load_dataset_builder +from tqdm import tqdm + +DEFAULT_OUTPUT_DIR = "/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v2" +DATABLEND_DIR = "/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets" +DATASET_NAME = "nvidia/Nemotron-Post-Training-Dataset-v2" + +# Known splits (actual sizes will be fetched from HuggingFace) +ENGLISH_SPLITS = ["stem", "math", "code", "chat"] +MULTILINGUAL_SPLITS = ["multilingual_ja", "multilingual_de", "multilingual_it", + "multilingual_es", "multilingual_fr"] +ALL_SPLIT_NAMES = ENGLISH_SPLITS + MULTILINGUAL_SPLITS + + +def get_split_sizes(splits_to_check: list) -> dict: + """Fetch actual split sizes from HuggingFace dataset info.""" + print("\n📊 Fetching actual dataset sizes from HuggingFace...") + + split_sizes = {} + + for split_name in splits_to_check: + try: + # Try to get dataset info without downloading + builder = load_dataset_builder(DATASET_NAME, split_name) + info = builder.info + + # Get the split info + if info.splits and split_name in info.splits: + split_sizes[split_name] = info.splits[split_name].num_examples + print(f" ✓ {split_name}: {split_sizes[split_name]:,} samples") + else: + # If split info not available, try loading a small sample to estimate + print(f" ⚠ {split_name}: size not in metadata, will count during download") + split_sizes[split_name] = None + + except Exception as e: + if "gated" in str(e).lower() or "access" in str(e).lower(): + print(f"\n❌ ACCESS DENIED - Please request access at:") + print(f" https://huggingface.co/datasets/{DATASET_NAME}") + print(" Then login with: huggingface-cli login") + raise + else: + print(f" ⚠ {split_name}: could not fetch size ({e})") + split_sizes[split_name] = None + + return split_sizes + +# Train/valid/test split ratios +TRAIN_RATIO = 0.95 +VALID_RATIO = 0.025 +TEST_RATIO = 0.025 +RANDOM_SEED = 42 + +# Global tokenizer for chat template (initialized if --tokenizer is provided) +_TOKENIZER = None + + +def init_tokenizer(tokenizer_name: str): + """Initialize tokenizer for chat template formatting.""" + global _TOKENIZER + if tokenizer_name: + from transformers import AutoTokenizer + print(f"📝 Loading tokenizer for chat template: {tokenizer_name}") + _TOKENIZER = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True) + + # Show example + example = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}] + formatted = _TOKENIZER.apply_chat_template(example, tokenize=False) + print(f" Example format:\n {formatted[:200]}...") + + +def format_messages_to_text(messages: list, reasoning: str = None) -> str: + """Convert messages format to text for QAD training. + + If a tokenizer is initialized, uses its chat template. + Otherwise, uses simple role-based formatting. 
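+
+    Fallback (no tokenizer) output sketch for a two-turn exchange; the example
+    turns are invented:
+        User: What is 2 + 2?
+
+        Assistant: 4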
+ """ + global _TOKENIZER + + # Optionally prepend reasoning/chain-of-thought as thinking block + if reasoning and reasoning.strip(): + # Insert thinking block before last assistant message + messages_with_cot = [] + for i, msg in enumerate(messages): + if msg.get("role") == "assistant" and i == len(messages) - 1: + # Add thinking before final assistant response + thinking_content = f"\n{reasoning}\n\n{msg.get('content', '')}" + messages_with_cot.append({"role": "assistant", "content": thinking_content}) + else: + messages_with_cot.append(msg) + messages = messages_with_cot + + # Use chat template if tokenizer is available + if _TOKENIZER is not None: + try: + return _TOKENIZER.apply_chat_template(messages, tokenize=False) + except Exception as e: + print(f"Warning: Chat template failed, using simple format: {e}") + + # Fallback: simple role-based format + text_parts = [] + for msg in messages: + role = msg.get("role", "") + content = msg.get("content", "") + + if role == "system": + text_parts.append(f"System: {content}") + elif role == "user": + text_parts.append(f"User: {content}") + elif role == "assistant": + text_parts.append(f"Assistant: {content}") + + return "\n\n".join(text_parts) + + +def download_split(split_name: str, max_samples: int, output_dir: str, + pct_str: str, include_reasoning: bool = False, + sample_percent: float = None) -> dict: + """Download a single split and save to its own folder. + + Args: + split_name: Name of the split to download + max_samples: Maximum samples to download (None = download all, then sample) + output_dir: Output directory + pct_str: Percentage string for filenames + include_reasoning: Include chain-of-thought reasoning + sample_percent: If max_samples is None, use this percentage after counting + """ + + split_dir = os.path.join(output_dir, split_name) + os.makedirs(split_dir, exist_ok=True) + + if max_samples is not None: + print(f"\n📥 Loading split: {split_name} (target: {max_samples:,} samples)") + else: + print(f"\n📥 Loading split: {split_name} (downloading all, will sample {sample_percent}%)") + + examples = [] + + try: + # Load the specific split + dataset = load_dataset( + DATASET_NAME, + split=split_name, + streaming=True # Use streaming for large datasets + ) + + count = 0 + for example in tqdm(dataset, desc=f"Processing {split_name}", total=max_samples): + if max_samples is not None and count >= max_samples: + break + + messages = example.get("messages", []) + reasoning = example.get("reasoning", "") if include_reasoning else "" + + # Convert to text format + text = format_messages_to_text(messages, reasoning) + + if text.strip(): + examples.append({ + "text": text, + "category": example.get("category", split_name), + "source": "nemotron_v2", + "split": split_name, + "language": "multilingual" if "multilingual" in split_name else "en" + }) + count += 1 + + print(f"✓ Collected {count:,} examples from {split_name}") + + # If we downloaded all and need to sample + if max_samples is None and sample_percent is not None: + import random + random.seed(RANDOM_SEED) + target_samples = int(len(examples) * sample_percent / 100) + if target_samples < len(examples): + examples = random.sample(examples, target_samples) + print(f" Sampled {len(examples):,} examples ({sample_percent}% of {count:,})") + + except Exception as e: + if "gated" in str(e).lower() or "access" in str(e).lower(): + print(f"\n❌ ACCESS DENIED for {split_name}") + print(" Please request access at:") + print(" 
https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2") + print(" Then login with: huggingface-cli login") + return None + else: + print(f"Error loading {split_name}: {e}") + return None + + if not examples: + print(f"Warning: No examples collected from {split_name}") + return None + + # Shuffle and split into train/valid/test + import random + random.seed(RANDOM_SEED) + random.shuffle(examples) + + total_size = len(examples) + train_end = int(total_size * TRAIN_RATIO) + valid_end = train_end + int(total_size * VALID_RATIO) + + splits = { + 'train': examples[:train_end], + 'validation': examples[train_end:valid_end], + 'test': examples[valid_end:] + } + + # Save each split + saved_files = {} + for data_split, data in splits.items(): + output_file = os.path.join(split_dir, f"{split_name}{pct_str}_{data_split}.jsonl") + + with open(output_file, 'w', encoding='utf-8') as f: + for ex in data: + json_line = json.dumps({"text": ex["text"]}, ensure_ascii=False) + f.write(json_line + '\n') + + saved_files[data_split] = output_file + print(f" Saved {data_split}: {len(data):,} examples -> {output_file}") + + return { + 'split_name': split_name, + 'total': len(examples), + 'train': len(splits['train']), + 'validation': len(splits['validation']), + 'test': len(splits['test']), + 'files': saved_files + } + + +def create_datablend_config(split_info: dict, output_dir: str, pct_str: str) -> str: + """Create datablend config for a single split.""" + split_name = split_info['split_name'] + + # Preprocessed path pattern + preprocessed_dir = output_dir.replace("nemotron_v2", "nemotron_v2_preprocessed") + split_preprocessed_dir = os.path.join(preprocessed_dir, split_name) + + blend_config = { + "train": [1.0, f"{split_preprocessed_dir}/{split_name}{pct_str}_train_text_document"], + "valid": [1.0, f"{split_preprocessed_dir}/{split_name}{pct_str}_validation_text_document"], + "test": [1.0, f"{split_preprocessed_dir}/{split_name}{pct_str}_test_text_document"] + } + + blend_file = os.path.join(DATABLEND_DIR, f"datablend_nemotron_v2_{split_name}{pct_str}.json") + with open(blend_file, 'w') as f: + json.dump(blend_config, f, indent=2) + + return blend_file + + +def create_combined_datablend(all_split_infos: list, output_dir: str, pct_str: str, + suffix: str = "all_en") -> str: + """Create combined datablend config for multiple splits with equal weighting.""" + + preprocessed_dir = output_dir.replace("nemotron_v2", "nemotron_v2_preprocessed") + + # Calculate total samples for weighting + total_train = sum(info['train'] for info in all_split_infos) + + train_blend = [] + valid_blend = [] + test_blend = [] + + for info in all_split_infos: + split_name = info['split_name'] + split_preprocessed_dir = os.path.join(preprocessed_dir, split_name) + + # Weight proportional to sample count + weight = info['train'] / total_train if total_train > 0 else 1.0 / len(all_split_infos) + + train_blend.extend([weight, f"{split_preprocessed_dir}/{split_name}{pct_str}_train_text_document"]) + valid_blend.extend([weight, f"{split_preprocessed_dir}/{split_name}{pct_str}_validation_text_document"]) + test_blend.extend([weight, f"{split_preprocessed_dir}/{split_name}{pct_str}_test_text_document"]) + + blend_config = { + "train": train_blend, + "valid": valid_blend, + "test": test_blend + } + + blend_file = os.path.join(DATABLEND_DIR, f"datablend_nemotron_v2_{suffix}{pct_str}.json") + with open(blend_file, 'w') as f: + json.dump(blend_config, f, indent=2) + + return blend_file + + +def main(): + parser = 
argparse.ArgumentParser(description="Download Nemotron-v2 for QAD (per-split folders)") + parser.add_argument("--output-dir", type=str, default=DEFAULT_OUTPUT_DIR, + help="Output directory for JSONL files") + parser.add_argument("--splits", type=str, default="stem,math,code,chat", + help="Comma-separated list of English splits to download") + parser.add_argument("--include-multilingual", action="store_true", + help="Include all multilingual splits (ja, de, it, es, fr)") + parser.add_argument("--sample-percent", type=float, default=30.0, + help="Percentage of each split to use (1-100). Default: 30%%") + parser.add_argument("--max-samples", type=int, default=None, + help="Maximum samples per split (absolute cap)") + parser.add_argument("--include-reasoning", action="store_true", default=True, + help="Include chain-of-thought reasoning in output (default: True)") + parser.add_argument("--no-reasoning", action="store_true", + help="Exclude chain-of-thought reasoning from output") + parser.add_argument("--tokenizer", type=str, default=None, + help="HuggingFace tokenizer to use for chat template (e.g., Qwen/Qwen3-8B). " + "If not specified, uses simple role-based formatting.") + args = parser.parse_args() + + # Handle reasoning flag (--no-reasoning overrides default) + include_reasoning = args.include_reasoning and not args.no_reasoning + + # Initialize tokenizer if specified + if args.tokenizer: + init_tokenizer(args.tokenizer) + + output_dir = args.output_dir + os.makedirs(output_dir, exist_ok=True) + + # Build list of splits to download + splits_to_download = [s.strip() for s in args.splits.split(",")] + if args.include_multilingual: + splits_to_download.extend(MULTILINGUAL_SPLITS.keys()) + + # Remove duplicates while preserving order + splits_to_download = list(dict.fromkeys(splits_to_download)) + + pct_str = f"_{int(args.sample_percent)}pct" + reasoning_str = "_cot" if include_reasoning else "" # chain-of-thought suffix + chat_str = "_chat" if args.tokenizer else "" # chat template suffix + + print("=" * 70) + print("Downloading NVIDIA Nemotron-Post-Training-Dataset-v2") + print("=" * 70) + print("⚠️ NOTE: This dataset requires HuggingFace access approval!") + print(" If you get an access error, visit:") + print(" https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2") + print("=" * 70) + print(f"Splits: {splits_to_download}") + print(f"Sample percent: {args.sample_percent}%") + print(f"Include reasoning: {include_reasoning}") + print(f"Chat template: {args.tokenizer or 'Simple role-based format'}") + print(f"Output directory: {output_dir}") + print(f"Each split saved to: {output_dir}//") + print("=" * 70) + + # Get actual split sizes from HuggingFace + try: + actual_sizes = get_split_sizes(splits_to_download) + except Exception as e: + print(f"\n❌ Failed to fetch dataset info: {e}") + return + + # Calculate samples per split based on actual sizes + print(f"\nTarget samples per split:") + samples_per_split = {} + for split_name in splits_to_download: + available = actual_sizes.get(split_name) + if available is None: + print(f" ⚠ {split_name}: size unknown, will download all and sample") + # If size unknown, set a large number and we'll sample during download + samples_per_split[split_name] = None # Download all, then sample + continue + + if args.max_samples is not None: + samples_per_split[split_name] = min(available, args.max_samples) + else: + samples_per_split[split_name] = int(available * args.sample_percent / 100) + pct = samples_per_split[split_name] / available * 
100 + print(f" {split_name}: {samples_per_split[split_name]:,} ({pct:.1f}% of {available:,})") + + print("=" * 70) + + # Download each split to its own folder + all_split_infos = [] + + for split_name in splits_to_download: + if split_name not in samples_per_split: + continue + + max_for_split = samples_per_split.get(split_name) + split_info = download_split( + split_name=split_name, + max_samples=max_for_split, + output_dir=output_dir, + pct_str=pct_str + reasoning_str + chat_str, # Include reasoning and chat template suffix + include_reasoning=include_reasoning, + sample_percent=args.sample_percent if max_for_split is None else None + ) + + if split_info: + all_split_infos.append(split_info) + + # Create per-split datablend config + blend_file = create_datablend_config(split_info, output_dir, pct_str + reasoning_str + chat_str) + print(f" 📝 Datablend config: {blend_file}") + + if not all_split_infos: + print("\n❌ Error: No splits were successfully downloaded!") + return + + # Create combined datablend config + print("\n" + "=" * 70) + print("Creating combined datablend configs...") + + # English-only combined + full_suffix = pct_str + reasoning_str + chat_str + en_splits = [info for info in all_split_infos if "multilingual" not in info['split_name']] + if en_splits: + combined_file = create_combined_datablend(en_splits, output_dir, full_suffix, "all_en") + print(f"📝 Combined English datablend: {combined_file}") + + # All splits combined (if multilingual included) + if len(all_split_infos) > len(en_splits): + combined_all_file = create_combined_datablend(all_split_infos, output_dir, full_suffix, "all_multilingual") + print(f"📝 Combined all datablend: {combined_all_file}") + + # Save metadata JSON with sample counts + total_samples = sum(info['total'] for info in all_split_infos) + total_train = sum(info['train'] for info in all_split_infos) + total_valid = sum(info['validation'] for info in all_split_infos) + total_test = sum(info['test'] for info in all_split_infos) + + metadata = { + "dataset": DATASET_NAME, + "sample_percent": args.sample_percent, + "include_reasoning": include_reasoning, + "chat_template": args.tokenizer or "none (simple role format)", + "download_date": __import__('datetime').datetime.now().isoformat(), + "total_samples": total_samples, + "total_train": total_train, + "total_validation": total_valid, + "total_test": total_test, + "splits": {} + } + + for info in all_split_infos: + split_name = info['split_name'] + metadata["splits"][split_name] = { + "available_in_dataset": actual_sizes.get(split_name), # Actual HF count + "downloaded": info['total'], + "train": info['train'], + "validation": info['validation'], + "test": info['test'], + "files": info['files'] + } + + metadata_file = os.path.join(output_dir, f"metadata{full_suffix}.json") + with open(metadata_file, 'w') as f: + json.dump(metadata, f, indent=2) + print(f"📊 Metadata saved: {metadata_file}") + + # Summary + print("\n" + "=" * 70) + print("✓ Download complete!") + print("=" * 70) + + print(f"\nSummary:") + print(f" Total splits downloaded: {len(all_split_infos)}") + print(f" Total samples: {total_samples:,}") + print(f" Total train samples: {total_train:,}") + + print(f"\nPer-split breakdown:") + for info in all_split_infos: + print(f" {info['split_name']}:") + print(f" Total: {info['total']:,} | Train: {info['train']:,} | Valid: {info['validation']:,} | Test: {info['test']:,}") + + print(f"\nOutput structure:") + print(f" {output_dir}/") + for info in all_split_infos: + print(f" └── 
{info['split_name']}/") + print(f" ├── {info['split_name']}{full_suffix}_train.jsonl") + print(f" ├── {info['split_name']}{full_suffix}_validation.jsonl") + print(f" └── {info['split_name']}{full_suffix}_test.jsonl") + + print(f"\nNext steps:") + print(f"1. Preprocess each split:") + for info in all_split_infos: + print(f" bash process_nemotron_v2_qwen3-8B.sh {info['split_name']} {full_suffix.replace('_', '')}") + print(f"\n2. Or use individual datablend configs:") + for info in all_split_infos: + print(f" DATASET_NAME=nemotron_v2_{info['split_name']}{full_suffix}") + print(f"\n3. Or use combined config:") + print(f" DATASET_NAME=nemotron_v2_all_en{full_suffix}") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/examples/llm_qad/data_utils/download_openscience.py b/examples/llm_qad/data_utils/download_openscience.py new file mode 100644 index 000000000..8dabe36e8 --- /dev/null +++ b/examples/llm_qad/data_utils/download_openscience.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +""" +Download and preprocess NVIDIA OpenScience dataset for QAD training. + +Usage: + # Simple format (default) + python download_openscience.py + + # With chat template (Qwen format) + python download_openscience.py --tokenizer Qwen/Qwen3-8B +""" + +import argparse +from datasets import load_dataset +import json +import os +from tqdm import tqdm + +DEFAULT_OUTPUT_DIR = "/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/openscience_splits" + +# Split configuration +TRAIN_RATIO = 0.95 +VALID_RATIO = 0.025 +TEST_RATIO = 0.025 +RANDOM_SEED = 42 + +# Global tokenizer for chat template +_TOKENIZER = None + + +def init_tokenizer(tokenizer_name: str): + """Initialize tokenizer for chat template formatting.""" + global _TOKENIZER + if tokenizer_name: + from transformers import AutoTokenizer + print(f"📝 Loading tokenizer for chat template: {tokenizer_name}") + _TOKENIZER = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True) + + +def format_example(example: dict) -> str: + """Format a single example to text.""" + global _TOKENIZER + + # OpenScience has input/output format + input_text = example.get("input", "") + output_text = example.get("output", "") + + if _TOKENIZER is not None: + # Use chat template + messages = [ + {"role": "user", "content": input_text}, + {"role": "assistant", "content": output_text} + ] + try: + return _TOKENIZER.apply_chat_template(messages, tokenize=False) + except Exception as e: + print(f"Warning: Chat template failed: {e}") + + # Simple format + return f"User: {input_text}\n\nAssistant: {output_text}" + + +def main(): + parser = argparse.ArgumentParser(description="Download OpenScience dataset") + parser.add_argument("--output-dir", type=str, default=DEFAULT_OUTPUT_DIR, + help="Output directory") + parser.add_argument("--tokenizer", type=str, default=None, + help="HuggingFace tokenizer for chat template (e.g., Qwen/Qwen3-8B)") + args = parser.parse_args() + + OUTPUT_DIR = args.output_dir + chat_suffix = "_chat" if args.tokenizer else "" + + if args.tokenizer: + init_tokenizer(args.tokenizer) + + print("Loading NVIDIA/OpenScience dataset...") + try: + dataset = load_dataset("nvidia/OpenScience", "OS-Q3-235B-4") + + # create output directory if it doesn't exist + os.makedirs(OUTPUT_DIR, exist_ok=True) + + # Handle different dataset structures + if 'train' in dataset: + full_data = dataset['train'] + else: + # If no 'train' key, use the first available split + split_name = list(dataset.keys())[0] + print(f"No 'train' split found, using '{split_name}' split") 
+ full_data = dataset[split_name] + + print(f"Shuffling {len(full_data)} examples with seed {RANDOM_SEED}...") + shuffled_data = full_data.shuffle(seed=RANDOM_SEED) + + total_size = len(shuffled_data) + train_end = int(total_size * TRAIN_RATIO) + valid_end = train_end + int(total_size * VALID_RATIO) + + splits_config = { + 'train': shuffled_data.select(range(0, train_end)), + 'validation': shuffled_data.select(range(train_end, valid_end)), + 'test': shuffled_data.select(range(valid_end, total_size)) + } + + print(f"\nCreated splits:") + for name, data in splits_config.items(): + print(f" {name}: {len(data)} examples ({len(data)/total_size*100:.2f}%)") + + print(f"\nFormat: {'Chat template' if args.tokenizer else 'Simple role format'}") + + # Save splits to JSONL + for split_name, split_data in splits_config.items(): + output_file = os.path.join(OUTPUT_DIR, f"openscience{chat_suffix}_{split_name}.jsonl") + print(f"\nWriting {output_file}...") + + with open(output_file, 'w', encoding='utf-8') as f: + for example in tqdm(split_data, desc=split_name): + # Format using chat template or simple format + text = format_example(example) + + json_line = json.dumps({"text": text}, ensure_ascii=False) + f.write(json_line + '\n') + + print(f"✓ Saved {len(split_data)} examples") + + print("\n✓ Dataset splitting complete!") + print(f"\nOutput files: openscience{chat_suffix}_*.jsonl") + + except Exception as e: + print(f"Error loading dataset: {e}") + print("\nTrying alternative loading method...") + raise + + +if __name__ == "__main__": + main() diff --git a/examples/llm_qad/data_utils/process_all_datasets.sh b/examples/llm_qad/data_utils/process_all_datasets.sh new file mode 100644 index 000000000..40247eafb --- /dev/null +++ b/examples/llm_qad/data_utils/process_all_datasets.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# Download and process all datasets with Qwen3-30B-A3B-Thinking-2507 chat template +# All datasets are split into individual folders for fine-grained control + +set -e + +cd /lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM/examples/post_training/modelopt + +TOKENIZER="Qwen/Qwen3-30B-A3B-Thinking-2507" +SUFFIX="30pct_cot_chat" + +echo "==========================================" +echo "Downloading and Processing All Datasets" +echo "Tokenizer: ${TOKENIZER}" +echo "Suffix: ${SUFFIX}" +echo "==========================================" + +# 1. Download datasets (all in split mode for fine-grained control) +echo "" +echo "=== Step 1: Downloading Datasets ===" + +# echo ">>> Downloading OpenScience..." +# python download_openscience.py --tokenizer $TOKENIZER + +# echo ">>> Downloading Nemotron-v1 @ 30% (split mode)..." +# python download_nemotron_v1.py --sample-percent 30 --include-reasoning --tokenizer $TOKENIZER + +# echo ">>> Downloading Nemotron-v2 @ 30%..." +# python download_nemotron_v2.py --sample-percent 30 --tokenizer $TOKENIZER + +# 2. Process datasets +echo "" +echo "=== Step 2: Processing Datasets ===" + +# echo ">>> Processing OpenScience..." +# bash process_openscience_qwen3-8B.sh chat ${TOKENIZER} + +echo ">>> Processing Nemotron-v1 splits..." +for split in stem math code chat; do + echo " Processing nemotron_v1/${split}..." + bash process_nemotron_v1_qwen3-8B.sh $split ${SUFFIX} ${TOKENIZER} +done + +# echo ">>> Processing Nemotron-v2 splits..." +# for split in stem math code chat; do +# echo " Processing nemotron_v2/${split}..." 
+# bash process_nemotron_v2_qwen3-8B.sh $split ${SUFFIX} ${TOKENIZER} +# done + +echo "" +echo "==========================================" +echo "✓ All datasets downloaded and processed!" +echo "==========================================" +echo "" +echo "Available datasets:" +echo " - openscience_chat" +echo " - nemotron_v1_stem_${SUFFIX}" +echo " - nemotron_v1_math_${SUFFIX}" +echo " - nemotron_v1_code_${SUFFIX}" +echo " - nemotron_v1_chat_${SUFFIX}" +echo " - nemotron_v2_stem_${SUFFIX}" +echo " - nemotron_v2_math_${SUFFIX}" +echo " - nemotron_v2_code_${SUFFIX}" +echo " - nemotron_v2_chat_${SUFFIX}" +echo " - combined_cot_chat (uses all above with weights)" +echo "" +echo "Usage:" +echo " DATASET_NAME=combined_cot_chat sbatch sbatch_qwen_qad.sh --config configs/your-config.conf" +echo " DATASET_NAME=nemotron_v1_stem_${SUFFIX} sbatch sbatch_qwen_qad.sh --config ..." +echo "==========================================" diff --git a/examples/llm_qad/data_utils/process_nemotron_qwen3-8B.sh b/examples/llm_qad/data_utils/process_nemotron_qwen3-8B.sh new file mode 100644 index 000000000..a28ceab6e --- /dev/null +++ b/examples/llm_qad/data_utils/process_nemotron_qwen3-8B.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# Preprocess Nemotron-v1 dataset for Qwen3-8B QAD training + +set -e + +# Default to ALL splits at 30% for best general improvement +# Options: all_30pct (default), all_10pct, all_50pct, all_100pct, stem, math, etc. +# Add _chat suffix for chat template formatted data +# Examples: +# bash process_nemotron_qwen3-8B.sh all_30pct # 30% of all splits (simple format) +# bash process_nemotron_qwen3-8B.sh all_30pct_chat # 30% of all splits (chat template) +# bash process_nemotron_qwen3-8B.sh all_10pct # 10% of all splits (~2.5M samples) +# bash process_nemotron_qwen3-8B.sh all_50pct # 50% of all splits (~12.5M samples) +SPLIT_NAME="${1:-all_30pct}" + +# Paths +MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" +INPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v1" +OUTPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v1_preprocessed" + +mkdir -p ${OUTPUT_DIR} + +# Install required dependencies +echo "Installing dependencies..." 
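+# NOTE: the `|| true` below is a sketch-level safeguard: it lets the script keep going
+# under `set -e` on offline/air-gapped nodes where pip cannot reach an index, in which
+# case transformers/tokenizers are assumed to already be present in the container image.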
+pip install -q transformers tokenizers || true + +# Tokenizer settings for Qwen3-8B +TOKENIZER_TYPE="HuggingFaceTokenizer" +TOKENIZER_MODEL="Qwen/Qwen3-8B" + +# Number of workers for parallel processing +WORKERS=32 + +echo "==========================================" +echo "Preprocessing Nemotron-v1 Dataset (${SPLIT_NAME}) for Qwen3-8B" +echo "==========================================" + +# Process training split +TRAIN_FILE="${INPUT_DIR}/nemotron_${SPLIT_NAME}_train.jsonl" +if [ -f "${TRAIN_FILE}" ]; then + echo "Processing training split: ${TRAIN_FILE}" + python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${TRAIN_FILE} \ + --output-prefix ${OUTPUT_DIR}/nemotron_${SPLIT_NAME}_train \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +else + echo "Warning: Training file not found: ${TRAIN_FILE}" +fi + +# Process validation split +VALID_FILE="${INPUT_DIR}/nemotron_${SPLIT_NAME}_validation.jsonl" +if [ -f "${VALID_FILE}" ]; then + echo "Processing validation split: ${VALID_FILE}" + python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${VALID_FILE} \ + --output-prefix ${OUTPUT_DIR}/nemotron_${SPLIT_NAME}_validation \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +else + echo "Warning: Validation file not found: ${VALID_FILE}" +fi + +# Process test split +TEST_FILE="${INPUT_DIR}/nemotron_${SPLIT_NAME}_test.jsonl" +if [ -f "${TEST_FILE}" ]; then + echo "Processing test split: ${TEST_FILE}" + python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${TEST_FILE} \ + --output-prefix ${OUTPUT_DIR}/nemotron_${SPLIT_NAME}_test \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +else + echo "Warning: Test file not found: ${TEST_FILE}" +fi + +# Create datablend config +BLEND_FILE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_${SPLIT_NAME}.json" +echo "Creating datablend config: ${BLEND_FILE}" +cat > ${BLEND_FILE} << EOF +{ + "train": [1.0, "${OUTPUT_DIR}/nemotron_${SPLIT_NAME}_train_text_document"], + "valid": [1.0, "${OUTPUT_DIR}/nemotron_${SPLIT_NAME}_validation_text_document"], + "test": [1.0, "${OUTPUT_DIR}/nemotron_${SPLIT_NAME}_test_text_document"] +} +EOF + +echo "==========================================" +echo "✓ Nemotron-v1 (${SPLIT_NAME}) preprocessing complete!" +echo "Output directory: ${OUTPUT_DIR}" +echo "Datablend config: ${BLEND_FILE}" +echo "" +echo "To run QAD training:" +echo " bash qwen_qad.sh 1e-5 Qwen3-8B False nemotron_${SPLIT_NAME}" +echo "==========================================" + diff --git a/examples/llm_qad/data_utils/process_nemotron_v1_qwen3-8B.sh b/examples/llm_qad/data_utils/process_nemotron_v1_qwen3-8B.sh new file mode 100644 index 000000000..82bd5fbbf --- /dev/null +++ b/examples/llm_qad/data_utils/process_nemotron_v1_qwen3-8B.sh @@ -0,0 +1,127 @@ +#!/bin/bash +# Preprocess Nemotron-v1 dataset (split mode) for Qwen3 QAD training +# +# New folder structure from download_nemotron_v1.py: +# nemotron_v1/ +# ├── stem/ +# │ ├── stem_30pct_cot_chat_train.jsonl +# │ └── ... +# ├── math/ +# │ └── ... 
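+#
+# This script expects per-split JSONL files named
+#   <INPUT_DIR>/<split>/<split>_<suffix>_{train,validation,test}.jsonl
+# and writes Megatron .bin/.idx files for that split plus a datablend JSON
+# (see the end of this script).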
+
+# Usage:
+#   bash process_nemotron_v1_qwen3-8B.sh <split> <suffix> [tokenizer]
+#
+# Examples:
+#   bash process_nemotron_v1_qwen3-8B.sh stem 30pct_cot_chat    # Default: Qwen3-8B
+#   bash process_nemotron_v1_qwen3-8B.sh stem 30pct_cot_chat Qwen/Qwen3-30B-A3B-Thinking-2507  # Thinking model
+
+set -e
+
+# Ensure transformers is installed for tokenizer
+pip install -q transformers tokenizers
+
+# Arguments
+SPLIT="${1:-stem}"                      # stem, math, code, chat
+SUFFIX="${2:-30pct_cot_chat}"
+TOKENIZER_MODEL="${3:-Qwen/Qwen3-8B}"   # Can override with any HuggingFace tokenizer
+
+# Paths
+MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM"
+INPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v1"
+OUTPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v1_preprocessed"
+
+mkdir -p ${OUTPUT_DIR}/${SPLIT}
+
+# Tokenizer settings
+TOKENIZER_TYPE="HuggingFaceTokenizer"
+
+# Number of workers for parallel processing
+WORKERS=32
+
+# Full name for output files
+FULL_NAME="${SPLIT}_${SUFFIX}"
+
+echo "=========================================="
+echo "Preprocessing Nemotron-v1 Dataset"
+echo "=========================================="
+echo "Split: ${SPLIT}"
+echo "Suffix: ${SUFFIX}"
+echo "Tokenizer: ${TOKENIZER_MODEL}"
+echo "Input dir: ${INPUT_DIR}/${SPLIT}/"
+echo "Output dir: ${OUTPUT_DIR}/${SPLIT}/"
+echo "=========================================="
+
+# Process training split
+TRAIN_FILE="${INPUT_DIR}/${SPLIT}/${FULL_NAME}_train.jsonl"
+if [ -f "${TRAIN_FILE}" ]; then
+    echo "Processing training split: ${TRAIN_FILE}"
+    python ${MLM_DIR}/tools/preprocess_data.py \
+        --input ${TRAIN_FILE} \
+        --output-prefix ${OUTPUT_DIR}/${SPLIT}/${FULL_NAME}_train \
+        --tokenizer-type ${TOKENIZER_TYPE} \
+        --tokenizer-model ${TOKENIZER_MODEL} \
+        --append-eod \
+        --workers ${WORKERS} \
+        --json-keys text
+else
+    echo "❌ Error: Training file not found: ${TRAIN_FILE}"
+    echo "   Check if download was successful."
+ echo " Expected file pattern: ${INPUT_DIR}/${SPLIT}/${FULL_NAME}_train.jsonl" + ls -la ${INPUT_DIR}/${SPLIT}/ 2>/dev/null || echo " Directory doesn't exist: ${INPUT_DIR}/${SPLIT}/" + exit 1 +fi + +# Process validation split +VALID_FILE="${INPUT_DIR}/${SPLIT}/${FULL_NAME}_validation.jsonl" +if [ -f "${VALID_FILE}" ]; then + echo "Processing validation split: ${VALID_FILE}" + python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${VALID_FILE} \ + --output-prefix ${OUTPUT_DIR}/${SPLIT}/${FULL_NAME}_validation \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +else + echo "Warning: Validation file not found: ${VALID_FILE}" +fi + +# Process test split +TEST_FILE="${INPUT_DIR}/${SPLIT}/${FULL_NAME}_test.jsonl" +if [ -f "${TEST_FILE}" ]; then + echo "Processing test split: ${TEST_FILE}" + python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${TEST_FILE} \ + --output-prefix ${OUTPUT_DIR}/${SPLIT}/${FULL_NAME}_test \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +else + echo "Warning: Test file not found: ${TEST_FILE}" +fi + +# Create datablend config +BLEND_FILE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v1_${FULL_NAME}.json" +echo "Creating datablend config: ${BLEND_FILE}" +cat > ${BLEND_FILE} << EOF +{ + "train": [1.0, "${OUTPUT_DIR}/${SPLIT}/${FULL_NAME}_train_text_document"], + "valid": [1.0, "${OUTPUT_DIR}/${SPLIT}/${FULL_NAME}_validation_text_document"], + "test": [1.0, "${OUTPUT_DIR}/${SPLIT}/${FULL_NAME}_test_text_document"] +} +EOF + +echo "==========================================" +echo "✓ Nemotron-v1 (${FULL_NAME}) preprocessing complete!" +echo "==========================================" +echo "Output directory: ${OUTPUT_DIR}/${SPLIT}/" +echo "Datablend config: ${BLEND_FILE}" +echo "" +echo "To run QAD training:" +echo " DATASET_NAME=nemotron_v1_${SPLIT}_${SUFFIX} bash qwen_qad.sh --config configs/your-config.conf" +echo "==========================================" + diff --git a/examples/llm_qad/data_utils/process_nemotron_v2_qwen3-8B.sh b/examples/llm_qad/data_utils/process_nemotron_v2_qwen3-8B.sh new file mode 100644 index 000000000..2194d5848 --- /dev/null +++ b/examples/llm_qad/data_utils/process_nemotron_v2_qwen3-8B.sh @@ -0,0 +1,137 @@ +#!/bin/bash +# Preprocess Nemotron-v2 dataset for Qwen3 QAD training +# +# New folder structure from download_nemotron_v2.py: +# nemotron_v2/ +# ├── stem/ +# │ ├── stem_30pct_cot_train.jsonl +# │ └── ... +# ├── math/ +# │ └── ... 
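+#
+# The <suffix> must match what download_nemotron_v2.py produced for the split
+# (e.g. 30pct_cot by default, or 30pct_cot_chat when --tokenizer was used);
+# files are expected at <INPUT_DIR>/<split>/<split>_<suffix>_{train,validation,test}.jsonl.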
+
+# Usage:
+#   bash process_nemotron_v2_qwen3-8B.sh <split> [suffix] [tokenizer]
+#
+# Examples:
+#   bash process_nemotron_v2_qwen3-8B.sh stem 30pct_cot_chat    # Default: Qwen3-8B
+#   bash process_nemotron_v2_qwen3-8B.sh stem 30pct_cot_chat Qwen/Qwen3-30B-A3B-Thinking-2507  # Thinking model
+
+set -e
+
+# Ensure transformers is installed for tokenizer
+pip install -q transformers tokenizers
+
+# Arguments
+SPLIT="${1:-stem}"                      # stem, math, code, chat
+SUFFIX="${2:-30pct_cot}"                # e.g., 30pct, 30pct_cot, 50pct_cot
+TOKENIZER_MODEL="${3:-Qwen/Qwen3-8B}"   # Can override with any HuggingFace tokenizer
+
+# Normalize suffix (handle both 30pctcot and 30pct_cot)
+SUFFIX=$(echo "$SUFFIX" | sed 's/pctcot/pct_cot/g')
+
+# Paths
+MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM"
+INPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v2"
+OUTPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v2_preprocessed"
+
+mkdir -p ${OUTPUT_DIR}/${SPLIT}
+
+# Tokenizer settings
+TOKENIZER_TYPE="HuggingFaceTokenizer"
+
+# Number of workers for parallel processing
+WORKERS=32
+
+# Full name for output files
+FULL_NAME="${SPLIT}_${SUFFIX}"
+
+echo "=========================================="
+echo "Preprocessing Nemotron-v2 Dataset"
+echo "=========================================="
+echo "Split: ${SPLIT}"
+echo "Suffix: ${SUFFIX}"
+echo "Tokenizer: ${TOKENIZER_MODEL}"
+echo "Input dir: ${INPUT_DIR}/${SPLIT}/"
+echo "Output dir: ${OUTPUT_DIR}/${SPLIT}/"
+echo "=========================================="
+
+# Process training split
+# File pattern: nemotron_v2/<split>/<split>_<suffix>_train.jsonl
+TRAIN_FILE="${INPUT_DIR}/${SPLIT}/${FULL_NAME}_train.jsonl"
+if [ -f "${TRAIN_FILE}" ]; then
+    echo "Processing training split: ${TRAIN_FILE}"
+    python ${MLM_DIR}/tools/preprocess_data.py \
+        --input ${TRAIN_FILE} \
+        --output-prefix ${OUTPUT_DIR}/${SPLIT}/${FULL_NAME}_train \
+        --tokenizer-type ${TOKENIZER_TYPE} \
+        --tokenizer-model ${TOKENIZER_MODEL} \
+        --append-eod \
+        --workers ${WORKERS} \
+        --json-keys text
+else
+    echo "❌ Error: Training file not found: ${TRAIN_FILE}"
+    echo "   Check if download was successful."
+ echo " Expected file pattern: ${INPUT_DIR}/${SPLIT}/${FULL_NAME}_train.jsonl" + ls -la ${INPUT_DIR}/${SPLIT}/ 2>/dev/null || echo " Directory doesn't exist: ${INPUT_DIR}/${SPLIT}/" + exit 1 +fi + +# Process validation split +VALID_FILE="${INPUT_DIR}/${SPLIT}/${FULL_NAME}_validation.jsonl" +if [ -f "${VALID_FILE}" ]; then + echo "Processing validation split: ${VALID_FILE}" + python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${VALID_FILE} \ + --output-prefix ${OUTPUT_DIR}/${SPLIT}/${FULL_NAME}_validation \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +else + echo "Warning: Validation file not found: ${VALID_FILE}" +fi + +# Process test split +TEST_FILE="${INPUT_DIR}/${SPLIT}/${FULL_NAME}_test.jsonl" +if [ -f "${TEST_FILE}" ]; then + echo "Processing test split: ${TEST_FILE}" + python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${TEST_FILE} \ + --output-prefix ${OUTPUT_DIR}/${SPLIT}/${FULL_NAME}_test \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +else + echo "Warning: Test file not found: ${TEST_FILE}" +fi + +# Create datablend config +# This matches what qwen_qad.sh expects +BLEND_FILE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_${FULL_NAME}.json" +echo "Creating datablend config: ${BLEND_FILE}" +cat > ${BLEND_FILE} << EOF +{ + "train": [1.0, "${OUTPUT_DIR}/${SPLIT}/${FULL_NAME}_train_text_document"], + "valid": [1.0, "${OUTPUT_DIR}/${SPLIT}/${FULL_NAME}_validation_text_document"], + "test": [1.0, "${OUTPUT_DIR}/${SPLIT}/${FULL_NAME}_test_text_document"] +} +EOF + +echo "==========================================" +echo "✓ Nemotron-v2 (${FULL_NAME}) preprocessing complete!" 
+echo "==========================================" +echo "Output directory: ${OUTPUT_DIR}/${SPLIT}/" +echo "Datablend config: ${BLEND_FILE}" +echo "" +echo "To run QAD training:" +echo " DATASET_NAME=nemotron_v2_${SPLIT}_${SUFFIX} bash qwen_qad.sh --config configs/your-config.conf" +echo "" +echo "Or set in config file:" +echo " export DATASET_NAME=\"nemotron_v2_${SPLIT}\"" +if [[ "$SUFFIX" == *"cot"* ]]; then + echo " # With chain-of-thought reasoning" + echo " export DATASET_NAME=\"nemotron_v2_${SPLIT}_cot\"" +fi +echo "==========================================" diff --git a/examples/llm_qad/data_utils/process_openscience_qwen3-8B.sh b/examples/llm_qad/data_utils/process_openscience_qwen3-8B.sh new file mode 100644 index 000000000..bec8eb394 --- /dev/null +++ b/examples/llm_qad/data_utils/process_openscience_qwen3-8B.sh @@ -0,0 +1,117 @@ +#!/bin/bash +# Preprocess OpenScience dataset for Qwen3 QAD training +# +# Usage: +# bash process_openscience_qwen3-8B.sh [suffix] [tokenizer] +# +# Examples: +# bash process_openscience_qwen3-8B.sh # Simple format, Qwen3-8B +# bash process_openscience_qwen3-8B.sh chat # Chat template, Qwen3-8B +# bash process_openscience_qwen3-8B.sh chat Qwen/Qwen3-30B-A3B-Thinking-2507 # Chat template, Thinking model + +set -e + +# Arguments +SUFFIX="${1:-}" # empty for simple format, "chat" for chat template +TOKENIZER_MODEL="${2:-Qwen/Qwen3-8B}" # Can override with any HuggingFace tokenizer + +# Normalize suffix +if [ -n "$SUFFIX" ]; then + FILE_SUFFIX="_${SUFFIX}" +else + FILE_SUFFIX="" +fi + +# Paths +MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" +INPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/openscience_splits" +OUTPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/openscience_splits_preprocessed" + +mkdir -p ${OUTPUT_DIR} + +# Tokenizer settings +TOKENIZER_TYPE="HuggingFaceTokenizer" + +# Number of workers for parallel processing +WORKERS=32 + +echo "==========================================" +echo "Preprocessing OpenScience Dataset" +echo "Format suffix: ${FILE_SUFFIX:-none (simple format)}" +echo "Tokenizer: ${TOKENIZER_MODEL}" +echo "==========================================" + +# Process training split +TRAIN_FILE="${INPUT_DIR}/openscience${FILE_SUFFIX}_train.jsonl" +if [ -f "${TRAIN_FILE}" ]; then + echo "Processing training split: ${TRAIN_FILE}" + python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${TRAIN_FILE} \ + --output-prefix ${OUTPUT_DIR}/openscience${FILE_SUFFIX}_train \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +else + echo "❌ Training file not found: ${TRAIN_FILE}" + exit 1 +fi + +# Process validation split +VALID_FILE="${INPUT_DIR}/openscience${FILE_SUFFIX}_validation.jsonl" +if [ -f "${VALID_FILE}" ]; then + echo "Processing validation split: ${VALID_FILE}" + python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${VALID_FILE} \ + --output-prefix ${OUTPUT_DIR}/openscience${FILE_SUFFIX}_validation \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +else + echo "Warning: Validation file not found: ${VALID_FILE}" +fi + +# Process test split (if exists) +TEST_FILE="${INPUT_DIR}/openscience${FILE_SUFFIX}_test.jsonl" +if [ -f "${TEST_FILE}" ]; then + echo "Processing test split: ${TEST_FILE}" + python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${TEST_FILE} \ + --output-prefix 
${OUTPUT_DIR}/openscience${FILE_SUFFIX}_test \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +else + echo "Warning: Test file not found: ${TEST_FILE}" +fi + +# Create datablend config +BLEND_FILE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_openscience${FILE_SUFFIX}.json" +echo "Creating datablend config: ${BLEND_FILE}" +cat > ${BLEND_FILE} << EOF +{ + "train": [1.0, "${OUTPUT_DIR}/openscience${FILE_SUFFIX}_train_text_document"], + "valid": [1.0, "${OUTPUT_DIR}/openscience${FILE_SUFFIX}_validation_text_document"], + "test": [1.0, "${OUTPUT_DIR}/openscience${FILE_SUFFIX}_test_text_document"] +} +EOF + +echo "==========================================" +echo "✓ Preprocessing complete!" +echo "Output files are in: ${OUTPUT_DIR}" +echo "Datablend config: ${BLEND_FILE}" +echo "==========================================" + +# List generated files +echo "Generated files:" +ls -lh ${OUTPUT_DIR}/openscience${FILE_SUFFIX}*.bin 2>/dev/null || echo "No .bin files found" +ls -lh ${OUTPUT_DIR}/openscience${FILE_SUFFIX}*.idx 2>/dev/null || echo "No .idx files found" + +echo "" +echo "To use in QAD training:" +echo " DATASET_NAME=openscience${FILE_SUFFIX} bash qwen_qad.sh --config configs/your-config.conf" diff --git a/examples/llm_qad/data_utils/process_slimorca_qwen3-8B.sh b/examples/llm_qad/data_utils/process_slimorca_qwen3-8B.sh new file mode 100644 index 000000000..98b237805 --- /dev/null +++ b/examples/llm_qad/data_utils/process_slimorca_qwen3-8B.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Preprocess SlimOrca dataset for Qwen3-8B QAD training + +set -e + +# Paths +MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" +INPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/slimorca" +OUTPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/slimorca_preprocessed" + +mkdir -p ${OUTPUT_DIR} + +# Tokenizer settings for Qwen3-8B +TOKENIZER_TYPE="HuggingFaceTokenizer" +TOKENIZER_MODEL="Qwen/Qwen3-8B" + +# Number of workers for parallel processing +WORKERS=32 + +echo "==========================================" +echo "Preprocessing SlimOrca Dataset for Qwen3-8B" +echo "==========================================" + +# Process training split +if [ -f "${INPUT_DIR}/slimorca_train.jsonl" ]; then + echo "Processing training split..." + python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${INPUT_DIR}/slimorca_train.jsonl \ + --output-prefix ${OUTPUT_DIR}/slimorca_train \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +fi + +# Process validation split +if [ -f "${INPUT_DIR}/slimorca_validation.jsonl" ]; then + echo "Processing validation split..." + python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${INPUT_DIR}/slimorca_validation.jsonl \ + --output-prefix ${OUTPUT_DIR}/slimorca_validation \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +fi + +# Process test split +if [ -f "${INPUT_DIR}/slimorca_test.jsonl" ]; then + echo "Processing test split..." 
+ python ${MLM_DIR}/tools/preprocess_data.py \ + --input ${INPUT_DIR}/slimorca_test.jsonl \ + --output-prefix ${OUTPUT_DIR}/slimorca_test \ + --tokenizer-type ${TOKENIZER_TYPE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --append-eod \ + --workers ${WORKERS} \ + --json-keys text +fi + +echo "==========================================" +echo "✓ SlimOrca preprocessing complete!" +echo "Output directory: ${OUTPUT_DIR}" +echo "==========================================" + diff --git a/examples/llm_qad/qwen_qad.sh b/examples/llm_qad/qwen_qad.sh new file mode 100644 index 000000000..05707e99f --- /dev/null +++ b/examples/llm_qad/qwen_qad.sh @@ -0,0 +1,932 @@ +#!/bin/bash +# Generic QAD training script for Qwen models - Docker/Interactive Version +# Supports: Qwen3-8B, Qwen3-30B-A3B (MoE), and other Qwen variants +# +# Usage: +# # With config file (recommended) +# bash qwen_qad.sh --config configs/qwen3-8b-default.conf +# bash qwen_qad.sh --config configs/qwen3-8b-nemotron.conf +# +# # With HuggingFace token (secure, not logged) +# bash qwen_qad.sh --hf-token hf_xxx --config configs/qwen3-8b-default.conf +# +# # With command line args +# bash qwen_qad.sh [LR] [TEACHER_MODEL] [DATASET_NAME] [STUDENT_MODEL] [KD_CFG_PATH] +# bash qwen_qad.sh 1e-6 Qwen3-8B nemotron Qwen3-8B +# bash qwen_qad.sh 1e-6 Qwen3-8B nemotron Qwen3-8B /path/to/kd_config.yaml +# +# # Config + override +# LR=1e-5 bash qwen_qad.sh --config configs/qwen3-8b-default.conf +# +# Get interactive node: +# srun -A coreai_dlalgo_modelopt --nodes=1 -p batch --mpi=pmix \ +# -J qwen-qad:dev \ +# --container-image=/lustre/.../pytorch_25.06-py3.sqsh \ +# --container-mounts="/lustre/fsw:/lustre/fsw" \ +# --container-workdir="/lustre/.../workspace" \ +# -t 4:0:0 --pty bash + +set -e # Exit on error + +export NCCL_IB_SL=1 +export NCCL_IB_TIMEOUT=19 +export UB_TIMEOUT=720 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_FWD_LAYERNORM_SM_MARGIN=16 +export NVTE_BWD_LAYERNORM_SM_MARGIN=16 +export NCCL_P2P_NET_CHUNKSIZE=2097152 +export NCCL_DEBUG=WARN +export NCCL_SHM_DISABLE=1 +export NCCL_NVLS_ENABLE=0 +export GLOO_SOCKET_IFNAME=ibp26s0 +# Disable torch inductor subprocess compilation to avoid CUDA fork issues +export TORCHINDUCTOR_COMPILE_THREADS=1 +# Disable PyTorch compilation to avoid Triton/cubin errors during training +export TORCH_COMPILE_DISABLE=1 +# Workaround for B300 autograd issues with quantization +export PYTORCH_NO_CUDA_MEMORY_CACHING=0 +export TORCH_DISTRIBUTED_DEBUG=OFF +# Force fallback for missing autograd kernels +export PYTORCH_JIT=0 +export TORCH_USE_CUDA_DSA=0 + +# HuggingFace token for accessing gated models (avoids rate limiting) +# Set via: export HF_TOKEN=hf_xxx or in config file +if [ -n "${HF_TOKEN:-}" ]; then + export HF_TOKEN="${HF_TOKEN}" + export HUGGING_FACE_HUB_TOKEN="${HF_TOKEN}" # Legacy variable name + echo "🔑 HuggingFace token configured" +fi + +######################################################## +#### CONFIG FILE LOADING #### +######################################################## + +SCRIPT_PATH=$(realpath "$0") +SCRIPT_DIR=$(dirname ${SCRIPT_PATH}) +CONFIG_FILE="" +HF_TOKEN_ARG="" + +# Parse arguments +POSITIONAL_ARGS=() +while [[ $# -gt 0 ]]; do + case $1 in + --config|-c) + CONFIG_FILE="$2" + shift 2 + ;; + --hf-token) + HF_TOKEN_ARG="$2" + shift 2 + ;; + *) + POSITIONAL_ARGS+=("$1") + shift + ;; + esac +done + +# Set HF_TOKEN from arg (takes precedence, doesn't appear in logs) +if [ -n "$HF_TOKEN_ARG" ]; then + export HF_TOKEN="$HF_TOKEN_ARG" +fi +# Restore positional args (handle empty array 
for set -u) +if [ ${#POSITIONAL_ARGS[@]} -gt 0 ]; then + set -- "${POSITIONAL_ARGS[@]}" +else + set -- +fi + +# Load config file if specified +if [ -n "$CONFIG_FILE" ]; then + # Handle relative paths + if [[ ! "$CONFIG_FILE" = /* ]]; then + CONFIG_FILE="${SCRIPT_DIR}/${CONFIG_FILE}" + fi + + if [ -f "$CONFIG_FILE" ]; then + echo "📄 Loading config from: ${CONFIG_FILE}" + source "$CONFIG_FILE" + else + echo "❌ ERROR: Config file not found: ${CONFIG_FILE}" + echo "Available configs:" + ls -1 "${SCRIPT_DIR}/configs/"*.conf 2>/dev/null || echo " (none found)" + exit 1 + fi +fi + +######################################################## +#### CONFIGURATION PARAMETERS #### +######################################################## + +CURRENT_DIR=$(pwd) + +# Command line args override config/env +# Order: LR, TEACHER_MODEL, DATASET_NAME, STUDENT_MODEL, KD_CFG_PATH +LR="${1:-${LR:-1e-6}}" +TEACHER_MODEL="${2:-${TEACHER_MODEL:-Qwen3-8B}}" +DATASET_NAME="${3:-${DATASET_NAME:-openscience}}" +STUDENT_MODEL="${4:-${STUDENT_MODEL:-Qwen3-8B}}" +KD_CFG_PATH="${5:-${KD_CFG_PATH:-}}" + +# Allow environment variable override (takes precedence) +STUDENT_MODEL="${STUDENT_MODEL_ENV:-$STUDENT_MODEL}" +TEACHER_MODEL="${TEACHER_MODEL_ENV:-$TEACHER_MODEL}" + +######################################################## +#### PATH CONFIGURATION #### +######################################################## + +MLM_DIR="${MLM_DIR:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM}" +MODELOPT_DIR="${MODELOPT_DIR:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer}" +MODEL_CONF_DIR="${MLM_DIR}/examples/post_training/modelopt/conf/Qwen" +MODELS_ROOT="${MODELS_ROOT:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models}" +QAD_CHECKPOINT_ROOT="${QAD_CHECKPOINT_ROOT:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/checkpoints}" +DATACACHE_DIR="${DATACACHE_DIR:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/data_cache}" + +######################################################## +#### MODEL CONFIGURATION - Source from conf file #### +######################################################## + +# Load student model architecture config (MODEL_ARGS) +# Config file path can be set explicitly or auto-detected +STUDENT_CONFIG_FILE="${STUDENT_CONFIG_FILE:-${MODEL_CONF_DIR}/${STUDENT_MODEL}.sh}" + +if [ ! 
-f "${STUDENT_CONFIG_FILE}" ]; then + echo "❌ ERROR: Student model config not found: ${STUDENT_CONFIG_FILE}" + echo "Available model configs:" + ls -1 "${MODEL_CONF_DIR}/"*.sh 2>/dev/null | xargs -n1 basename | sed 's/.sh$//' || echo " (none)" + exit 1 +fi + +echo "📄 Loading student model config from: ${STUDENT_CONFIG_FILE}" +source "${STUDENT_CONFIG_FILE}" +STUDENT_MODEL_ARGS="${MODEL_ARGS}" + +# Parallelism settings (from config file, required) +TP_SIZE="${TP_SIZE:?ERROR: TP_SIZE must be set in config}" +EP_SIZE="${EP_SIZE:-1}" +MBS="${MBS:?ERROR: MBS must be set in config}" + +# Detect MoE from EP_SIZE +if [ "${EP_SIZE}" -gt 1 ]; then + IS_MOE=true +else + IS_MOE=false +fi + +# Disable log-params-norm for MoE models by default (causes OOM due to FP32 conversion) +# Can be overridden with LOG_PARAMS_NORM=1 in config +if [ "${LOG_PARAMS_NORM:-}" = "1" ]; then + LOG_PARAMS_NORM_ARG="--log-params-norm" +elif [ "$IS_MOE" = "true" ]; then + LOG_PARAMS_NORM_ARG="" # Disabled for MoE to save memory + echo "⚠️ log-params-norm disabled for MoE model (saves ~2GB memory)" +else + LOG_PARAMS_NORM_ARG="--log-params-norm" +fi + +echo "🔧 Model: ${STUDENT_MODEL}" +echo " TP=${TP_SIZE}, EP=${EP_SIZE}, MBS=${MBS}, MoE=${IS_MOE}" + +######################################################## +#### CHECKPOINT PATHS (REQUIRED) #### +######################################################## + +# STUDENT_CKPT: Path to student checkpoint (REQUIRED) +# TEACHER_CKPT: Path to teacher checkpoint (REQUIRED) +# These must be set in config file or environment + +if [ -z "${STUDENT_CKPT:-}" ]; then + echo "❌ ERROR: STUDENT_CKPT is required. Set it in config or environment." + exit 1 +fi +if [ -z "${TEACHER_CKPT:-}" ]; then + echo "❌ ERROR: TEACHER_CKPT is required. Set it in config or environment." + exit 1 +fi + +BASE_STUDENT_CKPT="${STUDENT_CKPT}" +TEACHER_CKPT_DIR="${TEACHER_CKPT}" + +# TEACHER_MODEL_CONFIG is required +if [ -z "${TEACHER_MODEL_CONFIG:-}" ]; then + echo "❌ ERROR: TEACHER_MODEL_CONFIG is required. Set it in config or environment." + exit 1 +fi + +if [ ! 
-f "${TEACHER_MODEL_CONFIG}" ]; then + echo "❌ ERROR: Teacher model config file not found: ${TEACHER_MODEL_CONFIG}" + exit 1 +fi + +echo "📚 Student checkpoint: ${STUDENT_CKPT}" +echo "🎓 Teacher checkpoint: ${TEACHER_CKPT}" + +######################################################## +#### OUTPUT PATHS #### +######################################################## + +DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S') + +# Use checkpoint directory name for output path +STUDENT_CKPT_NAME=$(basename "${STUDENT_CKPT}") +TEACHER_CKPT_NAME=$(basename "${TEACHER_CKPT}") +OUTPUT_ROOT="${QAD_CHECKPOINT_ROOT}/${STUDENT_CKPT_NAME}-Teacher-${TEACHER_CKPT_NAME}-Data-${DATASET_NAME}-lr${LR}" +NAME="${STUDENT_CKPT_NAME}" + +RUN_DIR="${OUTPUT_ROOT}" +LOGS_DIR="${RUN_DIR}/logs" +CHECKPOINT_DIR="${RUN_DIR}/checkpoints/${NAME}" +TENSORBOARD_DIR="${RUN_DIR}/tensorboard/${NAME}" +ENV_LOG_FILENAME=${NAME}_${DATETIME}.env.log + +######################################################## +#### KD CONFIG #### +######################################################## + +# KD_CFG_PATH: Path to custom KD config YAML (optional) +# If set, uses custom distillation configuration +if [ -n "${KD_CFG_PATH}" ]; then + if [ -f "${KD_CFG_PATH}" ]; then + KD_CFG_ARGS="--export-kd-cfg ${KD_CFG_PATH}" + echo "🎓 Using KD config: ${KD_CFG_PATH}" + else + echo "⚠️ Warning: KD config not found: ${KD_CFG_PATH}, using default KD settings" + KD_CFG_ARGS="" + fi +else + KD_CFG_ARGS="" +fi + +######################################################## +#### DATASET SELECTION #### +######################################################## + +# Select Datablend based on argument +# Naming convention: +# - Plain text: datablend_.json +# - With COT (chain-of-thought): datablend__cot.json +# - With chat template: datablend__chat.json +# - With both COT and chat: datablend__cot_chat.json +case "$DATASET_NAME" in + # ==================== + # Nemotron-v1 options (plain text) + # ==================== + nemotron_10pct|nemotron_all_10pct) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_10pct.json}" + DEFAULT_TRAIN_SAMPLES=2500000 + echo "📊 Using Nemotron-v1 ALL Subjects @ 10% (~2.5M samples)" + ;; + nemotron|nemotron_30pct|nemotron_all_30pct) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_30pct.json}" + DEFAULT_TRAIN_SAMPLES=7500000 + echo "📊 Using Nemotron-v1 ALL Subjects @ 30% (~7.5M samples)" + ;; + nemotron_50pct|nemotron_all_50pct) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_50pct.json}" + DEFAULT_TRAIN_SAMPLES=12500000 + echo "📊 Using Nemotron-v1 ALL Subjects @ 50% (~12.5M samples)" + ;; + nemotron_100pct|nemotron_all_100pct|nemotron_full) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_100pct.json}" + DEFAULT_TRAIN_SAMPLES=25000000 + echo "📊 Using Nemotron-v1 ALL Subjects @ 100% (~25M samples)" + ;; + nemotron_stem) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_stem.json}" + DEFAULT_TRAIN_SAMPLES=5000000 + echo "📊 Using Nemotron-v1 STEM Dataset (Best for MMLU)" + ;; + nemotron_math) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_math.json}" + DEFAULT_TRAIN_SAMPLES=2000000 + echo "📊 Using Nemotron-v1 Math Dataset" + ;; + + # ==================== + # Nemotron-v1 with COT (chain-of-thought reasoning) + # ==================== 
+ nemotron_10pct_cot|nemotron_all_10pct_cot) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_10pct_cot.json}" + DEFAULT_TRAIN_SAMPLES=2500000 + echo "📊 Using Nemotron-v1 ALL @ 10% + COT (~2.5M samples)" + ;; + nemotron_30pct_cot|nemotron_all_30pct_cot) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_30pct_cot.json}" + DEFAULT_TRAIN_SAMPLES=7500000 + echo "📊 Using Nemotron-v1 ALL @ 30% + COT (~7.5M samples)" + ;; + nemotron_50pct_cot|nemotron_all_50pct_cot) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_50pct_cot.json}" + DEFAULT_TRAIN_SAMPLES=12500000 + echo "📊 Using Nemotron-v1 ALL @ 50% + COT (~12.5M samples)" + ;; + nemotron_stem_cot) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_stem_cot.json}" + DEFAULT_TRAIN_SAMPLES=5000000 + echo "📊 Using Nemotron-v1 STEM + COT" + ;; + + # ==================== + # Nemotron-v1 with chat template (no COT) + # ==================== + nemotron_10pct_chat|nemotron_all_10pct_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_10pct_chat.json}" + DEFAULT_TRAIN_SAMPLES=2500000 + echo "📊 Using Nemotron-v1 ALL @ 10% + Chat Template (~2.5M samples)" + ;; + nemotron_chat|nemotron_30pct_chat|nemotron_all_30pct_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_30pct_chat.json}" + DEFAULT_TRAIN_SAMPLES=7500000 + echo "📊 Using Nemotron-v1 ALL @ 30% + Chat Template (~7.5M samples)" + ;; + nemotron_50pct_chat|nemotron_all_50pct_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_50pct_chat.json}" + DEFAULT_TRAIN_SAMPLES=12500000 + echo "📊 Using Nemotron-v1 ALL @ 50% + Chat Template (~12.5M samples)" + ;; + nemotron_stem_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_stem_chat.json}" + DEFAULT_TRAIN_SAMPLES=5000000 + echo "📊 Using Nemotron-v1 STEM + Chat Template" + ;; + + # ==================== + # Nemotron-v1 with COT + chat template + # ==================== + nemotron_10pct_cot_chat|nemotron_all_10pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_10pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=2500000 + echo "📊 Using Nemotron-v1 ALL @ 10% + COT + Chat Template (~2.5M samples)" + ;; + nemotron_cot_chat|nemotron_30pct_cot_chat|nemotron_all_30pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_30pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=7500000 + echo "📊 Using Nemotron-v1 ALL @ 30% + COT + Chat Template (~7.5M samples)" + ;; + nemotron_50pct_cot_chat|nemotron_all_50pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_all_50pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=12500000 + echo "📊 Using Nemotron-v1 ALL @ 50% + COT + Chat Template (~12.5M samples)" + ;; + nemotron_stem_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_stem_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=5000000 + echo "📊 Using Nemotron-v1 STEM + COT + Chat Template" + ;; + + # ==================== + # Nemotron-v1 individual splits (fine-grained control) + # Format: nemotron_v1__pct_cot_chat + # 
==================== + nemotron_v1_stem|nemotron_v1_stem_30pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v1_stem_30pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=5886717 # ~6M (30% of 20.6M * 0.95 train split) + echo "📊 Using Nemotron-v1 STEM @ 30% + COT + Chat (~5.9M samples)" + ;; + nemotron_v1_math|nemotron_v1_math_30pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v1_math_30pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=582654 # ~583K (30% of 2M * 0.95) + echo "📊 Using Nemotron-v1 Math @ 30% + COT + Chat (~583K samples)" + ;; + nemotron_v1_code|nemotron_v1_code_30pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v1_code_30pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=540472 # ~540K (30% of 1.9M * 0.95) + echo "📊 Using Nemotron-v1 Code @ 30% + COT + Chat (~540K samples)" + ;; + nemotron_v1_chat|nemotron_v1_chat_30pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v1_chat_30pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=212786 # ~213K (30% of 746K * 0.95) + echo "📊 Using Nemotron-v1 Chat @ 30% + COT + Chat (~213K samples)" + ;; + nemotron_v1_all|nemotron_v1_all_en_30pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v1_all_en_30pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=7222629 # Sum of all splits + echo "📊 Using Nemotron-v1 ALL splits @ 30% + COT + Chat (~7.2M samples)" + ;; + + # ==================== + # Nemotron-v2 combined options (plain text) + # Total @ 30%: stem(101K) + math(68K) + code(50K) + chat(179K) = ~398K + # ==================== + nemotron_v2|nemotron_v2_30pct|nemotron_v2_all_en_30pct) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_all_en_30pct.json}" + DEFAULT_TRAIN_SAMPLES=398198 + echo "📊 Using Nemotron-v2 English @ 30% (~398K samples)" + ;; + nemotron_v2_50pct|nemotron_v2_all_en_50pct) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_all_en_50pct.json}" + DEFAULT_TRAIN_SAMPLES=663663 + echo "📊 Using Nemotron-v2 English @ 50% (~664K samples)" + ;; + nemotron_v2_multilingual|nemotron_v2_all_multilingual_30pct) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_all_multilingual_30pct.json}" + DEFAULT_TRAIN_SAMPLES=600000 + echo "📊 Using Nemotron-v2 ALL Languages @ 30% (~600K samples)" + ;; + + # ==================== + # Nemotron-v2 combined with chat template + # ==================== + nemotron_v2_chat_tmpl|nemotron_v2_30pct_chat|nemotron_v2_all_en_30pct_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_all_en_30pct_chat.json}" + DEFAULT_TRAIN_SAMPLES=398198 + echo "📊 Using Nemotron-v2 English @ 30% + Chat Template (~398K samples)" + ;; + nemotron_v2_50pct_chat|nemotron_v2_all_en_50pct_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_all_en_50pct_chat.json}" + DEFAULT_TRAIN_SAMPLES=663663 + echo "📊 Using Nemotron-v2 English @ 50% + Chat Template (~664K samples)" + ;; + + # ==================== + # Nemotron-v2 combined with COT (chain-of-thought reasoning) + # ==================== + nemotron_v2_cot|nemotron_v2_30pct_cot|nemotron_v2_all_en_30pct_cot) + 
BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_all_en_30pct_cot.json}" + DEFAULT_TRAIN_SAMPLES=398198 + echo "📊 Using Nemotron-v2 English @ 30% + COT (~398K samples)" + ;; + nemotron_v2_50pct_cot|nemotron_v2_all_en_50pct_cot) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_all_en_50pct_cot.json}" + DEFAULT_TRAIN_SAMPLES=663663 + echo "📊 Using Nemotron-v2 English @ 50% + COT (~664K samples)" + ;; + + # ==================== + # Nemotron-v2 combined with COT + chat template + # ==================== + nemotron_v2_cot_chat|nemotron_v2_30pct_cot_chat|nemotron_v2_all_en_30pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_all_en_30pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=398198 + echo "📊 Using Nemotron-v2 English @ 30% + COT + Chat Template (~398K samples)" + ;; + nemotron_v2_50pct_cot_chat|nemotron_v2_all_en_50pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_all_en_50pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=663663 + echo "📊 Using Nemotron-v2 English @ 50% + COT + Chat Template (~664K samples)" + ;; + + # ==================== + # Nemotron-v2 individual splits (plain text) + # ==================== + nemotron_v2_stem|nemotron_v2_stem_30pct) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_stem_30pct.json}" + DEFAULT_TRAIN_SAMPLES=101175 + echo "📊 Using Nemotron-v2 STEM split @ 30% (~101K samples)" + ;; + nemotron_v2_math|nemotron_v2_math_30pct) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_math_30pct.json}" + DEFAULT_TRAIN_SAMPLES=68248 + echo "📊 Using Nemotron-v2 Math split @ 30% (~68K samples)" + ;; + nemotron_v2_code|nemotron_v2_code_30pct) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_code_30pct.json}" + DEFAULT_TRAIN_SAMPLES=99750 + echo "📊 Using Nemotron-v2 Code split @ 30% (~50K x2 epochs)" + ;; + nemotron_v2_chat|nemotron_v2_chat_30pct) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_chat_30pct.json}" + DEFAULT_TRAIN_SAMPLES=178900 + echo "📊 Using Nemotron-v2 Chat split @ 30% (~179K samples)" + ;; + + # ==================== + # Nemotron-v2 individual splits with chat template + # ==================== + nemotron_v2_stem_chat|nemotron_v2_stem_30pct_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_stem_30pct_chat.json}" + DEFAULT_TRAIN_SAMPLES=101175 + echo "📊 Using Nemotron-v2 STEM @ 30% + Chat Template (~101K samples)" + ;; + nemotron_v2_math_chat|nemotron_v2_math_30pct_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_math_30pct_chat.json}" + DEFAULT_TRAIN_SAMPLES=68248 + echo "📊 Using Nemotron-v2 Math @ 30% + Chat Template (~68K samples)" + ;; + nemotron_v2_code_chat|nemotron_v2_code_30pct_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_code_30pct_chat.json}" + DEFAULT_TRAIN_SAMPLES=99750 + echo "📊 Using Nemotron-v2 Code @ 30% + Chat Template (~50K x2 epochs)" + ;; + nemotron_v2_chat_chat|nemotron_v2_chat_30pct_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_chat_30pct_chat.json}" + 
DEFAULT_TRAIN_SAMPLES=178900 + echo "📊 Using Nemotron-v2 Chat @ 30% + Chat Template (~179K samples)" + ;; + + # ==================== + # Nemotron-v2 individual splits with COT (chain-of-thought reasoning) + # ==================== + nemotron_v2_stem_cot|nemotron_v2_stem_30pct_cot) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_stem_30pct_cot.json}" + DEFAULT_TRAIN_SAMPLES=101175 + echo "📊 Using Nemotron-v2 STEM split @ 30% + COT (~101K samples)" + ;; + nemotron_v2_math_cot|nemotron_v2_math_30pct_cot) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_math_30pct_cot.json}" + DEFAULT_TRAIN_SAMPLES=68248 + echo "📊 Using Nemotron-v2 Math split @ 30% + COT (~68K samples)" + ;; + nemotron_v2_code_cot|nemotron_v2_code_30pct_cot) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_code_30pct_cot.json}" + DEFAULT_TRAIN_SAMPLES=49875 + echo "📊 Using Nemotron-v2 Code split @ 30% + COT (~50K samples)" + ;; + nemotron_v2_chat_cot|nemotron_v2_chat_30pct_cot) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_chat_30pct_cot.json}" + DEFAULT_TRAIN_SAMPLES=178900 + echo "📊 Using Nemotron-v2 Chat split @ 30% + COT (~179K samples)" + ;; + + # ==================== + # Nemotron-v2 individual splits with COT + chat template + # ==================== + nemotron_v2_stem_cot_chat|nemotron_v2_stem_30pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_stem_30pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=101175 + echo "📊 Using Nemotron-v2 STEM @ 30% + COT + Chat Template (~101K samples)" + ;; + nemotron_v2_math_cot_chat|nemotron_v2_math_30pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_math_30pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=68248 + echo "📊 Using Nemotron-v2 Math @ 30% + COT + Chat Template (~68K samples)" + ;; + nemotron_v2_code_cot_chat|nemotron_v2_code_30pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_code_30pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=49875 + echo "📊 Using Nemotron-v2 Code @ 30% + COT + Chat Template (~50K samples)" + ;; + nemotron_v2_chat_cot_chat|nemotron_v2_chat_30pct_cot_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_chat_30pct_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=178900 + echo "📊 Using Nemotron-v2 Chat @ 30% + COT + Chat Template (~179K samples)" + ;; + + # ==================== + # OpenScience datasets + # ==================== + openscience) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_openscience.json}" + DEFAULT_TRAIN_SAMPLES=299800 + echo "📊 Using OpenScience Dataset (plain text)" + ;; + openscience_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_openscience_chat.json}" + DEFAULT_TRAIN_SAMPLES=299800 + echo "📊 Using OpenScience Dataset + Chat Template" + ;; + + # ==================== + # Combined datasets + # ==================== + combined|combined_v1_v2_openscience) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_combined_v1_v2_openscience.json}" + DEFAULT_TRAIN_SAMPLES=10000000 + echo "📊 Using Combined Dataset: 50% Nemotron-v1 + 30% Nemotron-v2 + 20% OpenScience (~10M 
samples)" + ;; + combined_chat|combined_v1_v2_openscience_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_combined_v1_v2_openscience_chat.json}" + DEFAULT_TRAIN_SAMPLES=10000000 + echo "📊 Using Combined Dataset + Chat Template (~10M samples)" + ;; + combined_cot_chat|combined_all_cot_chat) + # Combined: 20% OpenScience + 50% Nemotron-v1 + 30% Nemotron-v2 (all splits) + # All with COT reasoning + Qwen3 chat template + # Nemotron-v2 breakdown: 7.5% stem + 7.5% math + 5% code + 10% chat = 30% + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_combined_cot_chat.json}" + # Total samples: ~300K OpenScience + ~7.5M Nemotron-v1 + ~398K Nemotron-v2 ≈ 8.2M + DEFAULT_TRAIN_SAMPLES=8200000 + echo "📊 Using Combined Dataset + COT + Chat Template (~8.2M samples)" + echo " - 20% OpenScience (chat)" + echo " - 50% Nemotron-v1 @ 30% (cot+chat)" + echo " - 30% Nemotron-v2 @ 30% (stem+math+code+chat, cot+chat)" + ;; + combined_v2|combined_v2_cot_chat) + # Combined V2: Code & Math focused + # All with COT reasoning + Qwen3 chat template + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_combined_v2_cot_chat.json}" + DEFAULT_TRAIN_SAMPLES=1024000 # manually set to 1M samples + echo "📊 Using Combined V2 (Code & Math focused) + COT + Chat (~8.2M samples)" + echo " - 20% OpenScience" + echo " - 40% Nemotron-v1 (10% stem, 10% math, 15% code, 5% chat)" + echo " - 40% Nemotron-v2 (5% stem, 10% math, 15% code, 10% chat)" + ;; + + # ==================== + # Other datasets + # ==================== + slimorca) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_slimorca.json}" + DEFAULT_TRAIN_SAMPLES=500000 + echo "📊 Using SlimOrca Dataset" + ;; + slimorca_chat) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_slimorca_chat.json}" + DEFAULT_TRAIN_SAMPLES=500000 + echo "📊 Using SlimOrca Dataset + Chat Template" + ;; + + # ==================== + # Default fallback + # ==================== + *) + BLEND_PATH="${BLEND_PATH:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_openscience.json}" + DEFAULT_TRAIN_SAMPLES=299800 + echo "📊 Using OpenScience Dataset (Default)" + ;; +esac + +# Allow override via environment variable +TRAIN_SAMPLES=${TRAIN_SAMPLES:-$DEFAULT_TRAIN_SAMPLES} +LR_DECAY_SAMPLES=$(python3 -c "print(int(${TRAIN_SAMPLES} * 0.99))") +LR_WARMUP_SAMPLES=$(python3 -c "print(int(${TRAIN_SAMPLES} * 0.01))") + +echo "📈 Training samples configuration:" +echo " Train samples: ${TRAIN_SAMPLES}" +echo " LR decay samples: ${LR_DECAY_SAMPLES}" +echo " LR warmup samples: ${LR_WARMUP_SAMPLES}" + +######################################################## +#### RESUME LOGIC #### +######################################################## + +if [ -f "${CHECKPOINT_DIR}/latest_checkpointed_iteration.txt" ]; then + echo "🔄 Found existing checkpoint at ${CHECKPOINT_DIR}" + echo " Resuming training from there..." + LOAD_CHECKPOINT_DIR="${CHECKPOINT_DIR}" + FINETUNE_FLAG="" + LOAD_OPTIM_ARGS="" + CKPT_PARALLEL_LOAD_ARG="--ckpt-fully-parallel-load" +else + echo "🆕 No existing checkpoint found. Starting fresh from base student." 
+ LOAD_CHECKPOINT_DIR="${BASE_STUDENT_CKPT}" + FINETUNE_FLAG="--finetune" + LOAD_OPTIM_ARGS="--no-load-optim --no-load-rng" + CKPT_PARALLEL_LOAD_ARG="" +fi + +######################################################## +#### CREATE DIRECTORIES #### +######################################################## + +mkdir -p ${LOGS_DIR} +mkdir -p ${CHECKPOINT_DIR} +mkdir -p ${DATACACHE_DIR} +mkdir -p ${TENSORBOARD_DIR} + +######################################################## +#### LOG ENVIRONMENT #### +######################################################## + +echo "========================================" | tee ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "Starting ${STUDENT_MODEL} NVFP4 QAD Training" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "Time: ${DATETIME}" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "========================================" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} + +echo "<< MODEL CONFIG >>" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "STUDENT_MODEL=${STUDENT_MODEL}" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "TEACHER_MODEL=${TEACHER_MODEL}" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "CONFIG_FILE=${STUDENT_CONFIG_FILE}" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "IS_MOE=${IS_MOE}" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "<< END MODEL CONFIG >>" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo -e "\n" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} + +echo "<< START PATHS >>" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "SCRIPT_DIR=${SCRIPT_DIR}" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "MLM_DIR=${MLM_DIR}" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "RUN_DIR=${RUN_DIR}" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "LOGS_DIR=${LOGS_DIR}" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "CHECKPOINT_DIR=${CHECKPOINT_DIR}" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "DATACACHE_DIR=${DATACACHE_DIR}" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "TENSORBOARD_DIR=${TENSORBOARD_DIR}" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "LOAD_CHECKPOINT_DIR=${LOAD_CHECKPOINT_DIR}" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "<< END PATHS >>" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo -e "\n\n" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} + +echo "<< START GIT >>" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "GIT LOG" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +git -C ${MLM_DIR} log --oneline -1 |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo -e "\n\n" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "GIT STATUS" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +git -C ${MLM_DIR} status --porcelain --branch |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo -e "\n\n" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "GIT DIFF" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +git -C ${MLM_DIR} diff |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "<< END GIT >>" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo -e "\n\n" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} + +echo "<< START ENV >>" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +env |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "<< END ENV >>" |& tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} + +######################################################## +#### TRAINING ARGUMENTS #### +######################################################## + +# Iterations to skip (if any) +ITERATIONS_TO_SKIP="${ITERATIONS_TO_SKIP:-}" + +# Number of GPUs to use +NUM_GPUS=${NUM_GPUS:-8} +MASTER_ADDR=${MASTER_ADDR:-localhost} +MASTER_PORT=${MASTER_PORT:-29500} + +# Checkpoint and 
Model Loading +CHECKPOINT_ARGS=" \ + --auto-detect-ckpt-format \ + --export-te-mcore-model \ + --dist-ckpt-strictness log_unexpected \ + ${FINETUNE_FLAG} \ + ${LOAD_OPTIM_ARGS} \ + --load ${LOAD_CHECKPOINT_DIR}" + +# Add KD teacher args (always enabled - TEACHER_CKPT and TEACHER_MODEL_CONFIG are required) +CHECKPOINT_ARGS="${CHECKPOINT_ARGS} \ + --export-quant-cfg nvfp4 \ + --export-kd-teacher-load ${TEACHER_CKPT_DIR} \ + --teacher-model-config ${TEACHER_MODEL_CONFIG} \ + ${KD_CFG_ARGS}" + +# Tokenizer Settings (from sourced config or default) +TOKENIZER_MODEL="${TOKENIZER_MODEL:-${HF_MODEL_CKPT:-Qwen/${STUDENT_MODEL}}}" +TOKENIZER_ARGS=" \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL}" + +# Data Settings +DATA_ARGS=" \ + --per-split-data-args-path ${BLEND_PATH} \ + --data-cache-path ${DATACACHE_DIR} \ + --no-mmap-bin-files \ + --num-dataset-builder-threads 16 \ + --no-create-attention-mask-in-dataloader" + +# Training Hyperparameters +TRAINING_ARGS=" \ + --micro-batch-size ${MBS} \ + --global-batch-size 256 \ + --train-samples ${TRAIN_SAMPLES} \ + --lr-decay-samples ${LR_DECAY_SAMPLES} \ + --lr-warmup-samples ${LR_WARMUP_SAMPLES} \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --bf16 \ + --no-masked-softmax-fusion" + +# Optimizer Settings +OPTIMIZER_ARGS=" \ + --lr ${LR} \ + --min-lr 0.0 \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --lr-decay-style cosine \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --use-distributed-optimizer \ + --overlap-grad-reduce \ + --overlap-param-gather" + +# Parallelism Settings +# Build parallel args based on model type +PARALLEL_ARGS=" \ + --tensor-model-parallel-size ${TP_SIZE} \ + --pipeline-model-parallel-size ${PP_SIZE:-1} \ + --distributed-timeout-minutes 360 \ + --disable-gloo-process-groups \ + --ddp-num-buckets 7" + +# Add expert parallelism for MoE models +if [ "$IS_MOE" = "true" ] && [ "$EP_SIZE" -gt 1 ]; then + PARALLEL_ARGS="${PARALLEL_ARGS} \ + --expert-model-parallel-size ${EP_SIZE}" + echo "🔧 MoE Expert Parallelism: EP=${EP_SIZE}" +fi + +# Add sequence parallel if supported (check if it's in MODEL_ARGS) +if echo "$STUDENT_MODEL_ARGS" | grep -q "sequence-parallel"; then + echo "🔧 Sequence Parallel: enabled (from model config)" +else + PARALLEL_ARGS="${PARALLEL_ARGS} --sequence-parallel" +fi + +# Memory Optimization +MEMORY_ARGS=" \ + --recompute-granularity full \ + --recompute-method uniform \ + --recompute-num-layers 1 \ + --no-gradient-accumulation-fusion" + +# Checkpoint Saving +SAVE_ARGS=" \ + --save ${CHECKPOINT_DIR} \ + --save-interval 200 \ + --save-retain-interval 200 \ + --ckpt-format torch_dist \ + --ckpt-fully-parallel-save \ + --ckpt-assume-constant-structure \ + --exit-duration-in-mins 230 \ + ${CKPT_PARALLEL_LOAD_ARG}" + +# Logging and Monitoring +LOGGING_ARGS=" \ + --log-interval 10 \ + --eval-iters 20 \ + --eval-interval 200 \ + --log-progress \ + --timing-log-option minmax \ + ${LOG_PARAMS_NORM_ARG:-} \ + --log-num-zeros-in-grad \ + --log-throughput \ + --log-straggler \ + --disable-straggler-on-startup \ + --straggler-minmax-count 16 \ + --tensorboard-dir ${TENSORBOARD_DIR}" + +# Runtime Settings +RUNTIME_ARGS=" \ + --exit-duration-in-mins 1200 \ + --num-workers 8 \ + --no-check-for-nan-in-loss-and-grad" + +# Combine all arguments +# NOTE: Argument order matters! Later args override earlier ones (argparse behavior) +# +# Order explanation: +# 1. CHECKPOINT_ARGS - Loading/saving config +# 2. 
STUDENT_MODEL_ARGS - From conf file (may contain --micro-batch-size, --bf16, --save-interval, etc.) +# 3. TOKENIZER_ARGS - Overrides --tokenizer-type from conf file +# 4. DATA_ARGS - Dataset configuration +# 5. TRAINING_ARGS - Overrides --micro-batch-size, --bf16 from conf file +# 6. OPTIMIZER_ARGS - Learning rate, optimizer settings +# 7. PARALLEL_ARGS - TP/PP/EP settings +# 8. MEMORY_ARGS - Recompute settings +# 9. SAVE_ARGS - Overrides --save-interval from conf file +# 10. LOGGING_ARGS - Logging configuration +# 11. RUNTIME_ARGS - Runtime settings +# +# This allows conf files to set defaults that QAD script can override +ALL_ARGS=" \ + ${CHECKPOINT_ARGS} \ + ${STUDENT_MODEL_ARGS} \ + ${TOKENIZER_ARGS} \ + ${DATA_ARGS} \ + ${TRAINING_ARGS} \ + ${OPTIMIZER_ARGS} \ + ${PARALLEL_ARGS} \ + ${MEMORY_ARGS} \ + ${SAVE_ARGS} \ + ${LOGGING_ARGS} \ + ${RUNTIME_ARGS}" + +if [ -n "${ITERATIONS_TO_SKIP}" ]; then + ALL_ARGS="${ALL_ARGS} --iterations-to-skip ${ITERATIONS_TO_SKIP}" +fi + +# Update PYTHONPATH +export PYTHONPATH="${MODELOPT_DIR}:${MLM_DIR}:${PYTHONPATH:-}" + +######################################################## +#### LAUNCH TRAINING #### +######################################################## + +echo "========================================" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "Running training command..." | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "========================================" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} + +LOG_FILE="${LOGS_DIR}/${MODEL_SHORT_NAME}_qad_${DATETIME}.log" + +echo "Output will be written to: ${LOG_FILE}" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} + +# Multi-node configuration +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} + +echo "<< DISTRIBUTED CONFIG >>" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo " NNODES: ${NNODES}" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo " NODE_RANK: ${NODE_RANK}" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo " NUM_GPUS per node: ${NUM_GPUS}" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo " MASTER_ADDR: ${MASTER_ADDR}" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo " MASTER_PORT: ${MASTER_PORT}" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo " TP: ${TP_SIZE}, PP: ${PP_SIZE:-1}, EP: ${EP_SIZE}" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo " MBS: ${MBS}" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo " Total GPUs: $((NNODES * NUM_GPUS))" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "<< END DISTRIBUTED CONFIG >>" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} + +# Launch training +torchrun \ + --nproc_per_node=${NUM_GPUS} \ + --nnodes=${NNODES} \ + --node_rank=${NODE_RANK} \ + --master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + ${MLM_DIR}/pretrain_gpt.py ${ALL_ARGS} 2>&1 | tee ${LOG_FILE} + +echo "" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "========================================" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "Training completed or exited" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "Check logs at: ${LOG_FILE}" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} +echo "========================================" | tee -a ${LOGS_DIR}/${ENV_LOG_FILENAME} diff --git a/examples/llm_qad/sbatch_qwen_qad.sh b/examples/llm_qad/sbatch_qwen_qad.sh new file mode 100755 index 000000000..1d86a959c --- /dev/null +++ b/examples/llm_qad/sbatch_qwen_qad.sh @@ -0,0 +1,276 @@ +#!/bin/bash + +#SBATCH -p batch +#SBATCH --account=coreai_dlalgo_modelopt +#SBATCH --nodes=4 +#SBATCH -t 4:00:00 +#SBATCH --exclusive +#SBATCH --mem=0 +#SBATCH 
--ntasks-per-node=1 +#SBATCH --job-name=coreai_dlalgo_modelopt-qwen.qad + +# Usage: +# sbatch sbatch_qwen_qad.sh --config configs/qwen3-8b-default.conf +# sbatch sbatch_qwen_qad.sh --config configs/qwen3-8b-nemotron.conf +# sbatch sbatch_qwen_qad.sh --config configs/qwen3-30b-a3b-moe.conf +# +# With HuggingFace token: +# sbatch sbatch_qwen_qad.sh --hf-token hf_xxx --config configs/qwen3-8b-default.conf +# +# Override config values: +# LR=1e-5 sbatch sbatch_qwen_qad.sh --config configs/qwen3-8b-default.conf +# STUDENT_FP4_CKPT=/path/to/ckpt sbatch sbatch_qwen_qad.sh --config ... +# +# Command line usage: +# sbatch sbatch_qwen_qad.sh [LR] [TEACHER_MODEL] [DATASET_NAME] [STUDENT_MODEL] [KD_CFG_PATH] +# sbatch sbatch_qwen_qad.sh 1e-6 Qwen3-8B nemotron Qwen3-8B +# sbatch sbatch_qwen_qad.sh 1e-6 Qwen3-8B nemotron Qwen3-8B /path/to/kd_config.yaml + +set -x -e + +######################################################## +# Parse Arguments +######################################################## + +# Use SLURM_SUBMIT_DIR if available (SLURM copies script to temp location) +# Otherwise use the script's directory +if [ -n "${SLURM_SUBMIT_DIR:-}" ]; then + SCRIPT_DIR="${SLURM_SUBMIT_DIR}" +else + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +fi +CONFIG_FILE="" +HF_TOKEN_ARG="" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --config|-c) + CONFIG_FILE="$2" + shift 2 + ;; + --hf-token) + HF_TOKEN_ARG="$2" + shift 2 + ;; + *) + break + ;; + esac +done + +# Set HF_TOKEN from arg (takes precedence, doesn't appear in logs) +if [ -n "$HF_TOKEN_ARG" ]; then + export HF_TOKEN="$HF_TOKEN_ARG" +fi + +######################################################## +# Load Config File +######################################################## + +if [ -n "$CONFIG_FILE" ]; then + # Handle relative paths + if [[ ! 
"$CONFIG_FILE" = /* ]]; then + CONFIG_FILE="${SCRIPT_DIR}/${CONFIG_FILE}" + fi + + if [ -f "$CONFIG_FILE" ]; then + echo "📄 Loading config from: ${CONFIG_FILE}" + source "$CONFIG_FILE" + else + echo "❌ ERROR: Config file not found: ${CONFIG_FILE}" + echo "Available configs:" + ls -1 "${SCRIPT_DIR}/configs/"*.conf 2>/dev/null || echo " (none found)" + exit 1 + fi +fi + +######################################################## +# Default Values (only if not set by config/env) +######################################################## + +# Training args (command line can override) +# Order: LR, TEACHER_MODEL, DATASET_NAME, STUDENT_MODEL, KD_CFG_PATH +LR="${1:-${LR:-1e-6}}" +TEACHER_MODEL="${2:-${TEACHER_MODEL:-Qwen3-8B}}" +DATASET_NAME="${3:-${DATASET_NAME:-openscience}}" +STUDENT_MODEL="${4:-${STUDENT_MODEL:-Qwen3-8B}}" +KD_CFG_PATH="${5:-${KD_CFG_PATH:-}}" + +# Paths +MLM_DIR="${MLM_DIR:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM}" +MODELOPT_DIR="${MODELOPT_DIR:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer}" +MODELS_ROOT="${MODELS_ROOT:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/models}" +QAD_CHECKPOINT_ROOT="${QAD_CHECKPOINT_ROOT:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/checkpoints}" +DATACACHE_DIR="${DATACACHE_DIR:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/data_cache}" +LOG_DIR="${LOG_DIR:-${QAD_CHECKPOINT_ROOT}/logs_slurm}" + +# Container +CONTAINER_IMAGE="${CONTAINER_IMAGE:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/containers/pytorch_25.06-py3.sqsh}" +CONTAINER_MOUNTS="${CONTAINER_MOUNTS:-/lustre/fsw:/lustre/fsw}" +CONTAINER_WORKDIR="${CONTAINER_WORKDIR:-/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/TensorRT-Model-Optimizer/examples/llm_qad}" + +# Parallelism settings (from config, required) +TP_SIZE="${TP_SIZE:?ERROR: TP_SIZE must be set in config}" +PP_SIZE="${PP_SIZE:-1}" +EP_SIZE="${EP_SIZE:-1}" +MBS="${MBS:?ERROR: MBS must be set in config}" + +# Other settings +NUM_GPUS="${NUM_GPUS:-8}" +MASTER_PORT="${MASTER_PORT:-29500}" + +# Multi-node config from SLURM (passed via sbatch --nodes=N) +NNODES="${SLURM_NNODES:-4}" +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) + +# Create directories +mkdir -p ${LOG_DIR} + +DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S') + +######################################################## +# Display Configuration +######################################################## + +echo "========================================" +echo "QAD Training Configuration" +echo "========================================" +if [ -n "$CONFIG_FILE" ]; then + echo "CONFIG_FILE: ${CONFIG_FILE}" +fi +echo "" +echo "Model:" +echo " STUDENT_MODEL: ${STUDENT_MODEL}" +echo " TEACHER_MODEL: ${TEACHER_MODEL}" +echo "" +echo "Training:" +echo " LR: ${LR}" +echo " DATASET: ${DATASET_NAME}" +echo " KD_CFG_PATH: ${KD_CFG_PATH:-none}" +echo "" +echo "Parallelism:" +echo " TP: ${TP_SIZE}, PP: ${PP_SIZE}, EP: ${EP_SIZE}" +echo " MBS: ${MBS}" +echo " NNODES: ${NNODES}" +echo " NUM_GPUS/node: ${NUM_GPUS}" +echo " Total GPUs: $((NNODES * NUM_GPUS))" +echo "" +echo "Distributed:" +echo " MASTER_ADDR: ${MASTER_ADDR}" +echo " MASTER_PORT: ${MASTER_PORT}" +echo " SLURM_NODELIST: ${SLURM_JOB_NODELIST}" +echo "" +echo "Paths:" +echo " MLM_DIR: ${MLM_DIR}" +echo " MODELOPT_DIR: ${MODELOPT_DIR}" +echo " MODELS_ROOT: ${MODELS_ROOT}" +echo " QAD_CHECKPOINT_ROOT: ${QAD_CHECKPOINT_ROOT}" +echo " DATACACHE_DIR: ${DATACACHE_DIR}" +echo " LOG_DIR: ${LOG_DIR}" +echo "" +echo "Container:" +echo " IMAGE: 
${CONTAINER_IMAGE}" +echo " WORKDIR: ${CONTAINER_WORKDIR}" + +# Show checkpoint paths +echo "" +echo "Checkpoints:" +echo " STUDENT_CKPT: ${STUDENT_CKPT:-NOT SET}" +echo " TEACHER_CKPT: ${TEACHER_CKPT:-NOT SET}" +if [ -n "${TEACHER_MODEL_CONFIG:-}" ]; then + echo " TEACHER_MODEL_CONFIG: ${TEACHER_MODEL_CONFIG}" +fi +if [ -n "${BLEND_PATH:-}" ]; then + echo " BLEND_PATH: ${BLEND_PATH}" +fi +echo "========================================" + +# Validate required checkpoints +if [ -z "${STUDENT_CKPT:-}" ]; then + echo "❌ ERROR: STUDENT_CKPT is required. Set it in config file." + exit 1 +fi +if [ -z "${TEACHER_CKPT:-}" ]; then + echo "❌ ERROR: TEACHER_CKPT is required. Set it in config file." + exit 1 +fi + +######################################################## +# Build Container Environment Exports +######################################################## + +# Core exports +EXPORTS="export NODE_RANK=\${SLURM_PROCID} && \ +export NNODES=${NNODES} && \ +export NUM_GPUS=${NUM_GPUS} && \ +export TP_SIZE=${TP_SIZE} && \ +export PP_SIZE=${PP_SIZE} && \ +export EP_SIZE=${EP_SIZE} && \ +export MBS=${MBS} && \ +export MASTER_ADDR=${MASTER_ADDR} && \ +export MASTER_PORT=${MASTER_PORT} && \ +export MLM_DIR=${MLM_DIR} && \ +export MODELOPT_DIR=${MODELOPT_DIR} && \ +export MODELS_ROOT=${MODELS_ROOT} && \ +export QAD_CHECKPOINT_ROOT=${QAD_CHECKPOINT_ROOT} && \ +export DATACACHE_DIR=${DATACACHE_DIR}" + +# Checkpoint exports (required) +EXPORTS="${EXPORTS} && export STUDENT_CKPT=${STUDENT_CKPT}" +EXPORTS="${EXPORTS} && export TEACHER_CKPT=${TEACHER_CKPT}" +if [ -n "${TEACHER_MODEL_CONFIG:-}" ]; then + EXPORTS="${EXPORTS} && export TEACHER_MODEL_CONFIG=${TEACHER_MODEL_CONFIG}" +fi + +# Optional dataset exports +if [ -n "${BLEND_PATH:-}" ]; then + EXPORTS="${EXPORTS} && export BLEND_PATH=${BLEND_PATH}" +fi +if [ -n "${TRAIN_SAMPLES:-}" ]; then + EXPORTS="${EXPORTS} && export TRAIN_SAMPLES=${TRAIN_SAMPLES}" +fi + +# HuggingFace token (avoid rate limiting) +if [ -n "${HF_TOKEN:-}" ]; then + EXPORTS="${EXPORTS} && export HF_TOKEN=${HF_TOKEN}" + EXPORTS="${EXPORTS} && export HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}" +fi +if [ -n "${ITERATIONS_TO_SKIP:-}" ]; then + EXPORTS="${EXPORTS} && export ITERATIONS_TO_SKIP=${ITERATIONS_TO_SKIP}" +fi + +# Optional KD config +if [ -n "${DISTILL_CONFIG_PATH:-}" ]; then + EXPORTS="${EXPORTS} && export DISTILL_CONFIG_PATH=${DISTILL_CONFIG_PATH}" +fi + +######################################################## +# Launch Training +######################################################## + +SCRIPT_NAME="qwen_qad.sh" + +run_cmd="pip install transformers==4.54 && \ +${EXPORTS} && \ +cd ${CONTAINER_WORKDIR} && \ +bash ${SCRIPT_NAME} ${LR} ${TEACHER_MODEL} ${DATASET_NAME} ${STUDENT_MODEL} ${KD_CFG_PATH}" + +echo "" +echo "Running command:" +echo "${run_cmd}" +echo "" + +srun -l \ + --output=${LOG_DIR}/%x_%j_${DATETIME}.log \ + --error=${LOG_DIR}/err_%x_%j_${DATETIME}.log \ + --container-image ${CONTAINER_IMAGE} \ + --container-mounts ${CONTAINER_MOUNTS} \ + --container-workdir ${CONTAINER_WORKDIR} \ + sh -c "${run_cmd}" + +echo "" +echo "========================================" +echo "QAD Training completed at $(date)" +echo "Logs: ${LOG_DIR}/" +echo "========================================" From 14e28c9faf2db465da643bf7c08b7a758823e267 Mon Sep 17 00:00:00 2001 From: weimingc <17592131+meenchen@users.noreply.github.com> Date: Fri, 19 Dec 2025 11:43:44 -0800 Subject: [PATCH 02/16] fix precommit Signed-off-by: Wei-Ming Chen Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com> 
--- examples/llm_qad/README.md | 360 +++++++++++++++++++++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 examples/llm_qad/README.md diff --git a/examples/llm_qad/README.md b/examples/llm_qad/README.md new file mode 100644 index 000000000..a05ad72cc --- /dev/null +++ b/examples/llm_qad/README.md @@ -0,0 +1,360 @@ +# Qwen QAD Training Scripts + +Quantization-Aware Distillation (QAD) training scripts for Qwen models using Megatron-LM. These scripts enable training quantized (NVFP4) student models with knowledge distillation from full-precision teacher models. + +## Overview + +| Script | Purpose | +|--------|---------| +| `qwen_qad.sh` | Main training script (interactive/Docker) | +| `sbatch_qwen_qad.sh` | SLURM batch submission wrapper | +| `configs/*.conf` | Model-specific configuration files | + +## Quick Start + +### SLURM Batch Submission (Recommended) for H100 x 8 + +```bash +# With HuggingFace token (for gated models) +sbatch sbatch_qwen_qad.sh --hf-token $HF_TOKEN --config configs/qwen3-30b-a3b-thinking-2507-moe.conf +``` + +### Interactive Mode + +```bash +# Get interactive node first +srun -A coreai_dlalgo_modelopt --nodes=1 -p batch --mpi=pmix \ + -J qwen-qad:dev \ + --container-image=/lustre/.../pytorch_25.06-py3.sqsh \ + --container-mounts="/lustre/fsw:/lustre/fsw" \ + -t 4:0:0 --pty bash + +# Run training +bash qwen_qad.sh --config configs/qwen3-8b-default.conf +``` + +## Configuration Files + +Configuration files in `configs/` define model architecture, parallelism, and checkpoint paths. + +### Required Config Variables + +| Variable | Description | Example | +|----------|-------------|---------| +| `STUDENT_MODEL` | Student model name | `Qwen3-8B` | +| `TEACHER_MODEL` | Teacher model name | `Qwen3-8B` | +| `STUDENT_CKPT` | Path to quantized student checkpoint | `/path/to/Qwen3-8B-NVFP4-TP1-MLM` | +| `TEACHER_CKPT` | Path to teacher checkpoint | `/path/to/Qwen3-8B-TP1-MLM` | +| `TEACHER_MODEL_CONFIG` | Teacher model YAML config | `/path/to/Qwen3-8B-teacher.yaml` | +| `TP_SIZE` | Tensor parallelism size | `1`, `4`, `8` | +| `MBS` | Micro-batch size | `1`, `2`, `4` | + +### Optional Config Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `PP_SIZE` | `1` | Pipeline parallelism size | +| `EP_SIZE` | `1` | Expert parallelism (MoE models) | +| `NUM_GPUS` | `8` | GPUs per node | +| `LR` | `1e-6` | Learning rate | +| `DATASET_NAME` | `openscience` | Training dataset | +| `TRAIN_SAMPLES` | Auto | Override training samples | +| `BLEND_PATH` | Auto | Override datablend path | + +### Example Config Structure + +```bash +# configs/qwen3-8b-default.conf +export STUDENT_MODEL="Qwen3-8B" +export TEACHER_MODEL="Qwen3-8B" +export STUDENT_CKPT="/path/to/Qwen3-8B-NVFP4-TP1-MLM" +export TEACHER_CKPT="/path/to/Qwen3-8B-TP1-MLM" +export TEACHER_MODEL_CONFIG="/path/to/Qwen3-8B-teacher.yaml" +export TP_SIZE=1 +export PP_SIZE=1 +export MBS=4 +export NUM_GPUS=8 +export DATASET_NAME="combined_v2_cot_chat" +``` + +## Dataset Options + +### Naming Convention + +Datasets follow this naming pattern: + +- **Plain text**: `datablend_.json` +- **With COT** (chain-of-thought): `datablend__cot.json` +- **With chat template**: `datablend__chat.json` +- **COT + chat**: `datablend__cot_chat.json` + +### Available Datasets + +#### Nemotron-v1 (Large scale, ~25M samples full) + +| Name | Samples | Description | +|------|---------|-------------| +| `nemotron_30pct` | ~7.5M | ALL subjects @ 30% | +| `nemotron_30pct_cot_chat` | ~7.5M | ALL @ 30% + COT + Chat 
| +| `nemotron_stem_cot_chat` | ~5M | STEM only + COT + Chat | +| `nemotron_v1_math_30pct_cot_chat` | ~583K | Math split | +| `nemotron_v1_code_30pct_cot_chat` | ~540K | Code split | + +#### Nemotron-v2 (High quality, ~400K samples @ 30%) + +| Name | Samples | Description | +|------|---------|-------------| +| `nemotron_v2_30pct` | ~398K | English @ 30% | +| `nemotron_v2_cot_chat` | ~398K | English + COT + Chat | +| `nemotron_v2_stem_30pct_cot_chat` | ~101K | STEM split | +| `nemotron_v2_math_30pct_cot_chat` | ~68K | Math split | +| `nemotron_v2_code_30pct_cot_chat` | ~50K | Code split | + +#### OpenScience + +| Name | Samples | Description | +|------|---------|-------------| +| `openscience` | ~300K | Plain text | +| `openscience_chat` | ~300K | With chat template | + +#### Combined Datasets (Recommended) + +| Name | Samples | Description | +|------|---------|-------------| +| `combined_cot_chat` | ~8.2M | 20% OpenScience + 50% v1 + 30% v2 | +| `combined_v2_cot_chat` | ~1M | Code & Math focused blend | + +## Parallelism Settings + +### Dense Models (Qwen3-8B) + +```bash +TP_SIZE=1 # Single GPU per tensor +PP_SIZE=1 # No pipeline parallelism +EP_SIZE=1 # Not MoE +MBS=4 # Can use larger micro-batch +``` + +### MoE Models (Qwen3-30B-A3B) + +```bash +TP_SIZE=4 # Tensor parallel across 4 GPUs +PP_SIZE=1 # No pipeline parallelism +EP_SIZE=8 # 128 experts / 8 = 16 experts per rank +MBS=1 # Small MBS for large vocab KD loss +``` + +**Note**: MoE models with EP=8 require 4 nodes (32 GPUs total). + +### GPU Requirements + +| Model | TP | EP | Nodes | Total GPUs | +|-------|----|----|-------|------------| +| Qwen3-8B | 1 | 1 | 1 | 8 | +| Qwen3-30B-A3B | 4 | 4 | 2 | 16 | +| Qwen3-30B-A3B | 4 | 8 | 4 | 32 | + +## Multi-Node Training + +### SLURM Multi-Node + +```bash +# Set nodes in sbatch header or command line +#SBATCH --nodes=4 + +# Or override at submission +sbatch --nodes=4 sbatch_qwen_qad.sh --config configs/qwen3-30b-a3b-thinking-2507-moe.conf +``` + +The script automatically: + +- Detects `SLURM_NNODES` and `SLURM_JOB_NODELIST` +- Sets `MASTER_ADDR` to first node +- Exports `NODE_RANK` per process + +### Manual Multi-Node (Interactive) + +On each node, set: + +```bash +export NNODES=4 +export NODE_RANK=0 # 0, 1, 2, 3 for each node +export MASTER_ADDR= +export MASTER_PORT=29500 +bash qwen_qad.sh --config configs/your-config.conf +``` + +## Resuming Training + +Training automatically resumes from checkpoints: + +1. **Fresh start**: Loads from `STUDENT_CKPT` with `--finetune` +2. **Resume**: If `CHECKPOINT_DIR/latest_checkpointed_iteration.txt` exists, loads from there + +To force fresh start, remove the checkpoint directory: + +```bash +rm -rf /path/to/checkpoints/*/latest_checkpointed_iteration.txt +``` + +## Job Dependencies + +Chain jobs to run sequentially: + +```bash +# Submit first job +JOB1=$(sbatch --parsable sbatch_qwen_qad.sh --config ...) + +# Submit dependent job (runs after JOB1 finishes, regardless of success/failure) +sbatch --dependency=afterany:$JOB1 sbatch_qwen_qad.sh --config ... +``` + +Dependency options: + +- `afterany:jobid` - Run after job finishes (success or failure) +- `afterok:jobid` - Run only if job succeeds +- `afternotok:jobid` - Run only if job fails + +## Environment Variables + +### HuggingFace Authentication + +```bash +# Via argument (recommended - not logged) +sbatch sbatch_qwen_qad.sh --hf-token $HF_TOKEN --config ... + +# Via environment +export HF_TOKEN=hf_xxx +sbatch sbatch_qwen_qad.sh --config ... 
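+# Either way, the launcher re-exports the token inside the container as both
+# HF_TOKEN and HUGGING_FACE_HUB_TOKEN, so tokenizer downloads are authenticated.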
+```
+
+### Path Overrides
+
+```bash
+export MLM_DIR=/path/to/Megatron-LM
+export MODELOPT_DIR=/path/to/TensorRT-Model-Optimizer
+export MODELS_ROOT=/path/to/models
+export QAD_CHECKPOINT_ROOT=/path/to/checkpoints
+export DATACACHE_DIR=/path/to/data_cache
+```
+
+### Training Overrides
+
+```bash
+export LR=1e-5                   # Learning rate
+export DATASET_NAME=nemotron_v2  # Dataset
+export TRAIN_SAMPLES=100000      # Override sample count
+export ITERATIONS_TO_SKIP=100    # Skip first N iterations
+```
+
+## Output Structure
+
+```bash
+$QAD_CHECKPOINT_ROOT/
+├── <student_ckpt>-Teacher-<teacher_ckpt>-Data-<dataset>-lr<lr>/
+│   ├── checkpoints/<model>/
+│   │   ├── iter_0000200/
+│   │   ├── iter_0000400/
+│   │   └── latest_checkpointed_iteration.txt
+│   ├── tensorboard/<model>/
+│   └── logs/
+│       ├── <model>_qad_<datetime>.log
+│       └── <model>_<datetime>.env.log
+└── logs_slurm/
+    ├── coreai_dlalgo_modelopt-qwen.qad_<jobid>_<datetime>.log
+    └── err_coreai_dlalgo_modelopt-qwen.qad_<jobid>_<datetime>.log
+```
+
+## Monitoring
+
+### TensorBoard
+
+```bash
+tensorboard --logdir /path/to/tensorboard/ --port 6006 --bind_all
+```
+
+### Check Job Status
+
+```bash
+squeue -u $USER                # List your jobs
+squeue -j <jobid>              # Check specific job
+sacct -j <jobid> --format=...  # Job accounting info
+```
+
+### Estimated Time
+
+```bash
+squeue -j <jobid> -o "%.18i %.9P %.30j %.8u %.2t %.10M %.10L %.6D %R"
+# %.10L shows time left
+```
+
+## Troubleshooting
+
+### OOM Errors
+
+1. **Reduce MBS**: Set `MBS=1` in config
+2. **Increase EP**: For MoE, increase `EP_SIZE` (requires more nodes)
+3. **Disable log-params-norm**: Set `LOG_PARAMS_NORM=0` in config
+
+### Rate Limiting (429 Errors)
+
+Use a HuggingFace token:
+
+```bash
+sbatch sbatch_qwen_qad.sh --hf-token $HF_TOKEN --config ...
+```
+
+### Shape Mismatch Errors
+
+Ensure the teacher model config has correct GQA settings:
+
+```yaml
+num_query_groups: 4  # For Qwen3-30B-A3B
+kv_channels: 128
+```
+
+### Gradient Norm Spikes
+
+Isolated spikes are normal with heterogeneous data. Investigate if:
+
+- Spikes are persistent (every few iterations)
+- Loss doesn't recover after a spike
+- Training diverges
+
+## Advanced Usage
+
+### Custom KD Config
+
+```bash
+bash qwen_qad.sh --config configs/... 1e-6 Qwen3-8B dataset Qwen3-8B /path/to/kd_config.yaml
+```
+
+### Skip Iterations
+
+Resume but skip specific iterations:
+
+```bash
+export ITERATIONS_TO_SKIP=100
+sbatch sbatch_qwen_qad.sh --config ...
+```
+
+### Custom Datablend
+
+```bash
+export BLEND_PATH=/path/to/custom_datablend.json
+export TRAIN_SAMPLES=500000
+sbatch sbatch_qwen_qad.sh --config ...
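+# The datablend JSON uses the same schema the download scripts emit; a minimal
+# single-dataset blend maps each split to [weight, preprocessed_path_prefix]:
+#   {"train": [1.0, "/path/to/my_train_text_document"],
+#    "valid": [1.0, "/path/to/my_validation_text_document"],
+#    "test":  [1.0, "/path/to/my_test_text_document"]}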
+``` + +## Requirements + +- **Container**: PyTorch 25.06+ with CUDA support +- **Megatron-LM**: With ModelOpt integration +- **TensorRT-Model-Optimizer**: Latest version +- **transformers**: 4.54+ + +## See Also + +- [Megatron-LM Documentation](https://github.com/NVIDIA/Megatron-LM) +- [TensorRT-Model-Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) +- [MoE Optimization Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/api-guide/moe.html) From 4abefc2a9b1fe3f69cd8f7560d387b5c4f58b3a5 Mon Sep 17 00:00:00 2001 From: weimingc <17592131+meenchen@users.noreply.github.com> Date: Fri, 19 Dec 2025 11:43:45 -0800 Subject: [PATCH 03/16] generalize datasets Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com> --- .../data_utils/download_nemotron_v1.py | 26 +-- .../data_utils/download_nemotron_v2.py | 36 ++-- .../data_utils/download_openscience.py | 24 ++- .../data_utils/process_all_datasets.sh | 187 ++++++++++++++++-- .../data_utils/process_nemotron_qwen3-8B.sh | 105 ---------- ..._v1_qwen3-8B.sh => process_nemotron_v1.sh} | 121 +++++++++--- ..._v2_qwen3-8B.sh => process_nemotron_v2.sh} | 136 +++++++++---- ...nce_qwen3-8B.sh => process_openscience.sh} | 108 ++++++++-- .../data_utils/process_slimorca_qwen3-8B.sh | 67 ------- 9 files changed, 501 insertions(+), 309 deletions(-) mode change 100644 => 100755 examples/llm_qad/data_utils/process_all_datasets.sh delete mode 100644 examples/llm_qad/data_utils/process_nemotron_qwen3-8B.sh rename examples/llm_qad/data_utils/{process_nemotron_v1_qwen3-8B.sh => process_nemotron_v1.sh} (58%) mode change 100644 => 100755 rename examples/llm_qad/data_utils/{process_nemotron_v2_qwen3-8B.sh => process_nemotron_v2.sh} (57%) mode change 100644 => 100755 rename examples/llm_qad/data_utils/{process_openscience_qwen3-8B.sh => process_openscience.sh} (57%) mode change 100644 => 100755 delete mode 100644 examples/llm_qad/data_utils/process_slimorca_qwen3-8B.sh diff --git a/examples/llm_qad/data_utils/download_nemotron_v1.py b/examples/llm_qad/data_utils/download_nemotron_v1.py index ae77ae4bf..4d35d7e9c 100644 --- a/examples/llm_qad/data_utils/download_nemotron_v1.py +++ b/examples/llm_qad/data_utils/download_nemotron_v1.py @@ -45,8 +45,8 @@ from datasets import load_dataset from tqdm import tqdm -DEFAULT_OUTPUT_DIR = "/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v1" -DATABLEND_DIR = "/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets" +DEFAULT_OUTPUT_DIR = None # Must be specified via --output-dir +DEFAULT_DATABLEND_DIR = None # Must be specified via --datablend-dir # Available splits and their sizes AVAILABLE_SPLITS = { @@ -205,13 +205,13 @@ def download_split(split_name: str, max_samples: int, output_dir: str, def create_datablend_configs(output_dir: str, splits_downloaded: list, suffix: str, - sample_counts: dict): + sample_counts: dict, datablend_dir: str): """Create datablend JSON configs for each split and combined.""" preprocessed_dir = output_dir.replace("nemotron_v1", "nemotron_v1_preprocessed") # Create individual datablend for each split for split_name in splits_downloaded: - blend_file = os.path.join(DATABLEND_DIR, f"datablend_nemotron_v1_{split_name}_{suffix}.json") + blend_file = os.path.join(datablend_dir, f"datablend_nemotron_v1_{split_name}_{suffix}.json") blend_config = { "train": [1.0, f"{preprocessed_dir}/{split_name}/{split_name}_{suffix}_train_text_document"], "valid": [1.0, f"{preprocessed_dir}/{split_name}/{split_name}_{suffix}_validation_text_document"], @@ -229,7 +229,7 @@ def 
create_datablend_configs(output_dir: str, splits_downloaded: list, suffix: s # Calculate weights based on sample counts total_samples = sum(sample_counts.get(s, {}).get("train", 0) for s in english_splits) - blend_file = os.path.join(DATABLEND_DIR, f"datablend_nemotron_v1_all_en_{suffix}.json") + blend_file = os.path.join(datablend_dir, f"datablend_nemotron_v1_all_en_{suffix}.json") train_entries = [] valid_entries = [] @@ -265,8 +265,10 @@ def create_datablend_configs(output_dir: str, splits_downloaded: list, suffix: s def main(): parser = argparse.ArgumentParser(description="Download Nemotron-v1 for QAD") - parser.add_argument("--output-dir", type=str, default=DEFAULT_OUTPUT_DIR, - help="Output directory for JSONL files") + parser.add_argument("--output-dir", type=str, required=True, + help="Output directory for JSONL files (required)") + parser.add_argument("--datablend-dir", type=str, required=True, + help="Directory for datablend config files (required)") parser.add_argument("--splits", type=str, default="stem,math,code,chat", help="Comma-separated list of splits to download (stem,math,code,chat,tool_calling)") parser.add_argument("--sample-percent", type=float, default=30.0, @@ -290,7 +292,9 @@ def main(): init_tokenizer(args.tokenizer) output_dir = args.output_dir + datablend_dir = args.datablend_dir os.makedirs(output_dir, exist_ok=True) + os.makedirs(datablend_dir, exist_ok=True) splits_to_download = [s.strip() for s in args.splits.split(",")] @@ -341,7 +345,7 @@ def main(): if args.combined: # Legacy combined mode - download_combined_mode(args, splits_to_download, samples_per_split, suffix) + download_combined_mode(args, splits_to_download, samples_per_split, suffix, datablend_dir) else: # New split mode (default) sample_counts = {} @@ -385,7 +389,7 @@ def main(): # Create datablend configs print("\n" + "=" * 70) print("Creating datablend configs...") - create_datablend_configs(output_dir, list(sample_counts.keys()), suffix, sample_counts) + create_datablend_configs(output_dir, list(sample_counts.keys()), suffix, sample_counts, datablend_dir) # Print summary print("\n" + "=" * 70) @@ -419,7 +423,7 @@ def main(): print("=" * 70) -def download_combined_mode(args, splits_to_download, samples_per_split, suffix): +def download_combined_mode(args, splits_to_download, samples_per_split, suffix, datablend_dir): """Legacy combined mode - all splits in single files.""" output_dir = args.output_dir @@ -505,7 +509,7 @@ def download_combined_mode(args, splits_to_download, samples_per_split, suffix): print(f"✓ Saved {split_name}") # Create datablend config - blend_file = os.path.join(DATABLEND_DIR, f"datablend_nemotron_{split_suffix}{full_suffix}.json") + blend_file = os.path.join(datablend_dir, f"datablend_nemotron_{split_suffix}{full_suffix}.json") preprocessed_dir = output_dir.replace("nemotron_v1", "nemotron_v1_preprocessed") blend_config = { diff --git a/examples/llm_qad/data_utils/download_nemotron_v2.py b/examples/llm_qad/data_utils/download_nemotron_v2.py index 0699dd98b..e4cd51d46 100644 --- a/examples/llm_qad/data_utils/download_nemotron_v2.py +++ b/examples/llm_qad/data_utils/download_nemotron_v2.py @@ -54,8 +54,8 @@ from datasets import load_dataset, get_dataset_config_names, load_dataset_builder from tqdm import tqdm -DEFAULT_OUTPUT_DIR = "/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v2" -DATABLEND_DIR = "/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets" +DEFAULT_OUTPUT_DIR = None # Must be specified via --output-dir +DEFAULT_DATABLEND_DIR = None # Must be 
specified via --datablend-dir DATASET_NAME = "nvidia/Nemotron-Post-Training-Dataset-v2" # Known splits (actual sizes will be fetched from HuggingFace) @@ -283,7 +283,7 @@ def download_split(split_name: str, max_samples: int, output_dir: str, } -def create_datablend_config(split_info: dict, output_dir: str, pct_str: str) -> str: +def create_datablend_config(split_info: dict, output_dir: str, pct_str: str, datablend_dir: str) -> str: """Create datablend config for a single split.""" split_name = split_info['split_name'] @@ -297,7 +297,7 @@ def create_datablend_config(split_info: dict, output_dir: str, pct_str: str) -> "test": [1.0, f"{split_preprocessed_dir}/{split_name}{pct_str}_test_text_document"] } - blend_file = os.path.join(DATABLEND_DIR, f"datablend_nemotron_v2_{split_name}{pct_str}.json") + blend_file = os.path.join(datablend_dir, f"datablend_nemotron_v2_{split_name}{pct_str}.json") with open(blend_file, 'w') as f: json.dump(blend_config, f, indent=2) @@ -305,7 +305,7 @@ def create_datablend_config(split_info: dict, output_dir: str, pct_str: str) -> def create_combined_datablend(all_split_infos: list, output_dir: str, pct_str: str, - suffix: str = "all_en") -> str: + datablend_dir: str, suffix: str = "all_en") -> str: """Create combined datablend config for multiple splits with equal weighting.""" preprocessed_dir = output_dir.replace("nemotron_v2", "nemotron_v2_preprocessed") @@ -334,7 +334,7 @@ def create_combined_datablend(all_split_infos: list, output_dir: str, pct_str: s "test": test_blend } - blend_file = os.path.join(DATABLEND_DIR, f"datablend_nemotron_v2_{suffix}{pct_str}.json") + blend_file = os.path.join(datablend_dir, f"datablend_nemotron_v2_{suffix}{pct_str}.json") with open(blend_file, 'w') as f: json.dump(blend_config, f, indent=2) @@ -343,8 +343,10 @@ def create_combined_datablend(all_split_infos: list, output_dir: str, pct_str: s def main(): parser = argparse.ArgumentParser(description="Download Nemotron-v2 for QAD (per-split folders)") - parser.add_argument("--output-dir", type=str, default=DEFAULT_OUTPUT_DIR, - help="Output directory for JSONL files") + parser.add_argument("--output-dir", type=str, required=True, + help="Output directory for JSONL files (required)") + parser.add_argument("--datablend-dir", type=str, required=True, + help="Directory for datablend config files (required)") parser.add_argument("--splits", type=str, default="stem,math,code,chat", help="Comma-separated list of English splits to download") parser.add_argument("--include-multilingual", action="store_true", @@ -353,24 +355,24 @@ def main(): help="Percentage of each split to use (1-100). Default: 30%%") parser.add_argument("--max-samples", type=int, default=None, help="Maximum samples per split (absolute cap)") - parser.add_argument("--include-reasoning", action="store_true", default=True, - help="Include chain-of-thought reasoning in output (default: True)") - parser.add_argument("--no-reasoning", action="store_true", - help="Exclude chain-of-thought reasoning from output") + parser.add_argument("--include-reasoning", action="store_true", default=False, + help="Include chain-of-thought reasoning in output (for Thinking models)") parser.add_argument("--tokenizer", type=str, default=None, help="HuggingFace tokenizer to use for chat template (e.g., Qwen/Qwen3-8B). 
" "If not specified, uses simple role-based formatting.") args = parser.parse_args() - # Handle reasoning flag (--no-reasoning overrides default) - include_reasoning = args.include_reasoning and not args.no_reasoning + # Handle reasoning flag + include_reasoning = args.include_reasoning # Initialize tokenizer if specified if args.tokenizer: init_tokenizer(args.tokenizer) output_dir = args.output_dir + datablend_dir = args.datablend_dir os.makedirs(output_dir, exist_ok=True) + os.makedirs(datablend_dir, exist_ok=True) # Build list of splits to download splits_to_download = [s.strip() for s in args.splits.split(",")] @@ -447,7 +449,7 @@ def main(): all_split_infos.append(split_info) # Create per-split datablend config - blend_file = create_datablend_config(split_info, output_dir, pct_str + reasoning_str + chat_str) + blend_file = create_datablend_config(split_info, output_dir, pct_str + reasoning_str + chat_str, datablend_dir) print(f" 📝 Datablend config: {blend_file}") if not all_split_infos: @@ -462,12 +464,12 @@ def main(): full_suffix = pct_str + reasoning_str + chat_str en_splits = [info for info in all_split_infos if "multilingual" not in info['split_name']] if en_splits: - combined_file = create_combined_datablend(en_splits, output_dir, full_suffix, "all_en") + combined_file = create_combined_datablend(en_splits, output_dir, full_suffix, datablend_dir, "all_en") print(f"📝 Combined English datablend: {combined_file}") # All splits combined (if multilingual included) if len(all_split_infos) > len(en_splits): - combined_all_file = create_combined_datablend(all_split_infos, output_dir, full_suffix, "all_multilingual") + combined_all_file = create_combined_datablend(all_split_infos, output_dir, full_suffix, datablend_dir, "all_multilingual") print(f"📝 Combined all datablend: {combined_all_file}") # Save metadata JSON with sample counts diff --git a/examples/llm_qad/data_utils/download_openscience.py b/examples/llm_qad/data_utils/download_openscience.py index 8dabe36e8..6133b07c3 100644 --- a/examples/llm_qad/data_utils/download_openscience.py +++ b/examples/llm_qad/data_utils/download_openscience.py @@ -16,7 +16,7 @@ import os from tqdm import tqdm -DEFAULT_OUTPUT_DIR = "/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/openscience_splits" +DEFAULT_OUTPUT_DIR = None # Must be specified via --output-dir # Split configuration TRAIN_RATIO = 0.95 @@ -62,15 +62,21 @@ def format_example(example: dict) -> str: def main(): parser = argparse.ArgumentParser(description="Download OpenScience dataset") - parser.add_argument("--output-dir", type=str, default=DEFAULT_OUTPUT_DIR, - help="Output directory") + parser.add_argument("--output-dir", type=str, required=True, + help="Output directory (required)") + parser.add_argument("--datablend-dir", type=str, required=True, + help="Directory for datablend config files (required)") parser.add_argument("--tokenizer", type=str, default=None, help="HuggingFace tokenizer for chat template (e.g., Qwen/Qwen3-8B)") args = parser.parse_args() OUTPUT_DIR = args.output_dir + DATABLEND_DIR = args.datablend_dir chat_suffix = "_chat" if args.tokenizer else "" + os.makedirs(OUTPUT_DIR, exist_ok=True) + os.makedirs(DATABLEND_DIR, exist_ok=True) + if args.tokenizer: init_tokenizer(args.tokenizer) @@ -124,6 +130,18 @@ def main(): print(f"✓ Saved {len(split_data)} examples") + # Create datablend config + preprocessed_dir = OUTPUT_DIR.replace("openscience_splits", "openscience_splits_preprocessed") + blend_file = os.path.join(DATABLEND_DIR, 
f"datablend_openscience{chat_suffix}.json") + blend_config = { + "train": [1.0, f"{preprocessed_dir}/openscience{chat_suffix}_train_text_document"], + "valid": [1.0, f"{preprocessed_dir}/openscience{chat_suffix}_validation_text_document"], + "test": [1.0, f"{preprocessed_dir}/openscience{chat_suffix}_test_text_document"] + } + with open(blend_file, 'w') as f: + json.dump(blend_config, f, indent=2) + print(f"📝 Created datablend config: {blend_file}") + print("\n✓ Dataset splitting complete!") print(f"\nOutput files: openscience{chat_suffix}_*.jsonl") diff --git a/examples/llm_qad/data_utils/process_all_datasets.sh b/examples/llm_qad/data_utils/process_all_datasets.sh old mode 100644 new mode 100755 index 40247eafb..cdd9e6be4 --- a/examples/llm_qad/data_utils/process_all_datasets.sh +++ b/examples/llm_qad/data_utils/process_all_datasets.sh @@ -1,51 +1,195 @@ #!/bin/bash -# Download and process all datasets with Qwen3-30B-A3B-Thinking-2507 chat template -# All datasets are split into individual folders for fine-grained control +# Download and process all datasets (general, model-agnostic) +# +# Usage: +# bash process_all_datasets.sh --output-dir --mlm-path --tokenizer [options] +# +# Required arguments: +# --output-dir Base output directory for datasets +# --mlm-path Path to Megatron-LM directory +# --tokenizer HuggingFace tokenizer model (e.g., Qwen/Qwen3-8B) +# +# Optional arguments: +# --datablend-dir Directory for datablend configs (default: output-dir) +# --suffix Suffix for file naming (default: 30pct_chat) +# --sample-percent Percentage of data to use (default: 30) +# --include-reasoning Include chain-of-thought reasoning (for Thinking models) +# Default: OFF (suitable for Instruct models) +# +# Examples: +# # For Instruct models (no COT): +# bash process_all_datasets.sh --output-dir /data --mlm-path /mlm --tokenizer Qwen/Qwen3-30B-A3B-Instruct-2507 +# +# # For Thinking models (with COT): +# bash process_all_datasets.sh --output-dir /data --mlm-path /mlm --tokenizer Qwen/Qwen3-30B-A3B-Thinking-2507 --include-reasoning set -e -cd /lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM/examples/post_training/modelopt +# Parse arguments +OUTPUT_DIR="" +MLM_DIR="" +TOKENIZER="" +DATABLEND_DIR="" +SUFFIX="" # Will be set based on --include-reasoning +SAMPLE_PERCENT=30 +INCLUDE_REASONING=false -TOKENIZER="Qwen/Qwen3-30B-A3B-Thinking-2507" -SUFFIX="30pct_cot_chat" +while [[ $# -gt 0 ]]; do + case $1 in + --output-dir) + OUTPUT_DIR="$2" + shift 2 + ;; + --mlm-path) + MLM_DIR="$2" + shift 2 + ;; + --tokenizer) + TOKENIZER="$2" + shift 2 + ;; + --datablend-dir) + DATABLEND_DIR="$2" + shift 2 + ;; + --suffix) + SUFFIX="$2" + shift 2 + ;; + --sample-percent) + SAMPLE_PERCENT="$2" + shift 2 + ;; + --include-reasoning) + INCLUDE_REASONING=true + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Set default suffix based on reasoning flag +if [ -z "$SUFFIX" ]; then + if [ "$INCLUDE_REASONING" = true ]; then + SUFFIX="${SAMPLE_PERCENT}pct_cot_chat" + else + SUFFIX="${SAMPLE_PERCENT}pct_chat" + fi +fi + +# Validate required arguments +if [ -z "$OUTPUT_DIR" ]; then + echo "Error: --output-dir is required" + echo "Usage: bash process_all_datasets.sh --output-dir --mlm-path --tokenizer " + exit 1 +fi + +if [ -z "$MLM_DIR" ]; then + echo "Error: --mlm-path is required" + exit 1 +fi + +if [ -z "$TOKENIZER" ]; then + echo "Error: --tokenizer is required" + exit 1 +fi + +# Set defaults +if [ -z "$DATABLEND_DIR" ]; then + DATABLEND_DIR="${OUTPUT_DIR}" +fi + +SCRIPT_DIR="$(cd 
"$(dirname "${BASH_SOURCE[0]}")" && pwd)" echo "==========================================" echo "Downloading and Processing All Datasets" +echo "==========================================" +echo "Output dir: ${OUTPUT_DIR}" +echo "Datablend dir: ${DATABLEND_DIR}" +echo "MLM path: ${MLM_DIR}" echo "Tokenizer: ${TOKENIZER}" echo "Suffix: ${SUFFIX}" +echo "Sample percent: ${SAMPLE_PERCENT}%" +echo "Include reasoning (COT): ${INCLUDE_REASONING}" echo "==========================================" -# 1. Download datasets (all in split mode for fine-grained control) +# Create directories +mkdir -p "${OUTPUT_DIR}" +mkdir -p "${DATABLEND_DIR}" + +# 1. Download datasets echo "" echo "=== Step 1: Downloading Datasets ===" -# echo ">>> Downloading OpenScience..." -# python download_openscience.py --tokenizer $TOKENIZER +echo ">>> Downloading OpenScience..." +python "${SCRIPT_DIR}/download_openscience.py" \ + --output-dir "${OUTPUT_DIR}/openscience_splits" \ + --datablend-dir "${DATABLEND_DIR}" \ + --tokenizer "${TOKENIZER}" + +# Build reasoning flag for download commands +REASONING_FLAG="" +if [ "$INCLUDE_REASONING" = true ]; then + REASONING_FLAG="--include-reasoning" +fi -# echo ">>> Downloading Nemotron-v1 @ 30% (split mode)..." -# python download_nemotron_v1.py --sample-percent 30 --include-reasoning --tokenizer $TOKENIZER +echo ">>> Downloading Nemotron-v1 @ ${SAMPLE_PERCENT}%..." +python "${SCRIPT_DIR}/download_nemotron_v1.py" \ + --output-dir "${OUTPUT_DIR}/nemotron_v1" \ + --datablend-dir "${DATABLEND_DIR}" \ + --sample-percent "${SAMPLE_PERCENT}" \ + ${REASONING_FLAG} \ + --tokenizer "${TOKENIZER}" -# echo ">>> Downloading Nemotron-v2 @ 30%..." -# python download_nemotron_v2.py --sample-percent 30 --tokenizer $TOKENIZER +echo ">>> Downloading Nemotron-v2 @ ${SAMPLE_PERCENT}%..." +python "${SCRIPT_DIR}/download_nemotron_v2.py" \ + --output-dir "${OUTPUT_DIR}/nemotron_v2" \ + --datablend-dir "${DATABLEND_DIR}" \ + --sample-percent "${SAMPLE_PERCENT}" \ + ${REASONING_FLAG} \ + --tokenizer "${TOKENIZER}" # 2. Process datasets echo "" echo "=== Step 2: Processing Datasets ===" -# echo ">>> Processing OpenScience..." -# bash process_openscience_qwen3-8B.sh chat ${TOKENIZER} +echo ">>> Processing OpenScience..." +bash "${SCRIPT_DIR}/process_openscience.sh" \ + --output-dir "${OUTPUT_DIR}/openscience_splits_preprocessed" \ + --input-dir "${OUTPUT_DIR}/openscience_splits" \ + --mlm-path "${MLM_DIR}" \ + --tokenizer "${TOKENIZER}" \ + --suffix chat \ + --datablend-dir "${DATABLEND_DIR}" echo ">>> Processing Nemotron-v1 splits..." for split in stem math code chat; do echo " Processing nemotron_v1/${split}..." - bash process_nemotron_v1_qwen3-8B.sh $split ${SUFFIX} ${TOKENIZER} + bash "${SCRIPT_DIR}/process_nemotron_v1.sh" \ + --output-dir "${OUTPUT_DIR}/nemotron_v1_preprocessed" \ + --input-dir "${OUTPUT_DIR}/nemotron_v1" \ + --mlm-path "${MLM_DIR}" \ + --tokenizer "${TOKENIZER}" \ + --split "${split}" \ + --suffix "${SUFFIX}" \ + --datablend-dir "${DATABLEND_DIR}" done -# echo ">>> Processing Nemotron-v2 splits..." -# for split in stem math code chat; do -# echo " Processing nemotron_v2/${split}..." -# bash process_nemotron_v2_qwen3-8B.sh $split ${SUFFIX} ${TOKENIZER} -# done +echo ">>> Processing Nemotron-v2 splits..." +for split in stem math code chat; do + echo " Processing nemotron_v2/${split}..." 
+ bash "${SCRIPT_DIR}/process_nemotron_v2.sh" \ + --output-dir "${OUTPUT_DIR}/nemotron_v2_preprocessed" \ + --input-dir "${OUTPUT_DIR}/nemotron_v2" \ + --mlm-path "${MLM_DIR}" \ + --tokenizer "${TOKENIZER}" \ + --split "${split}" \ + --suffix "${SUFFIX}" \ + --datablend-dir "${DATABLEND_DIR}" +done echo "" echo "==========================================" @@ -62,7 +206,8 @@ echo " - nemotron_v2_stem_${SUFFIX}" echo " - nemotron_v2_math_${SUFFIX}" echo " - nemotron_v2_code_${SUFFIX}" echo " - nemotron_v2_chat_${SUFFIX}" -echo " - combined_cot_chat (uses all above with weights)" +echo "" +echo "Datablend configs are in: ${DATABLEND_DIR}" echo "" echo "Usage:" echo " DATASET_NAME=combined_cot_chat sbatch sbatch_qwen_qad.sh --config configs/your-config.conf" diff --git a/examples/llm_qad/data_utils/process_nemotron_qwen3-8B.sh b/examples/llm_qad/data_utils/process_nemotron_qwen3-8B.sh deleted file mode 100644 index a28ceab6e..000000000 --- a/examples/llm_qad/data_utils/process_nemotron_qwen3-8B.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -# Preprocess Nemotron-v1 dataset for Qwen3-8B QAD training - -set -e - -# Default to ALL splits at 30% for best general improvement -# Options: all_30pct (default), all_10pct, all_50pct, all_100pct, stem, math, etc. -# Add _chat suffix for chat template formatted data -# Examples: -# bash process_nemotron_qwen3-8B.sh all_30pct # 30% of all splits (simple format) -# bash process_nemotron_qwen3-8B.sh all_30pct_chat # 30% of all splits (chat template) -# bash process_nemotron_qwen3-8B.sh all_10pct # 10% of all splits (~2.5M samples) -# bash process_nemotron_qwen3-8B.sh all_50pct # 50% of all splits (~12.5M samples) -SPLIT_NAME="${1:-all_30pct}" - -# Paths -MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" -INPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v1" -OUTPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v1_preprocessed" - -mkdir -p ${OUTPUT_DIR} - -# Install required dependencies -echo "Installing dependencies..." 
-pip install -q transformers tokenizers || true - -# Tokenizer settings for Qwen3-8B -TOKENIZER_TYPE="HuggingFaceTokenizer" -TOKENIZER_MODEL="Qwen/Qwen3-8B" - -# Number of workers for parallel processing -WORKERS=32 - -echo "==========================================" -echo "Preprocessing Nemotron-v1 Dataset (${SPLIT_NAME}) for Qwen3-8B" -echo "==========================================" - -# Process training split -TRAIN_FILE="${INPUT_DIR}/nemotron_${SPLIT_NAME}_train.jsonl" -if [ -f "${TRAIN_FILE}" ]; then - echo "Processing training split: ${TRAIN_FILE}" - python ${MLM_DIR}/tools/preprocess_data.py \ - --input ${TRAIN_FILE} \ - --output-prefix ${OUTPUT_DIR}/nemotron_${SPLIT_NAME}_train \ - --tokenizer-type ${TOKENIZER_TYPE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --append-eod \ - --workers ${WORKERS} \ - --json-keys text -else - echo "Warning: Training file not found: ${TRAIN_FILE}" -fi - -# Process validation split -VALID_FILE="${INPUT_DIR}/nemotron_${SPLIT_NAME}_validation.jsonl" -if [ -f "${VALID_FILE}" ]; then - echo "Processing validation split: ${VALID_FILE}" - python ${MLM_DIR}/tools/preprocess_data.py \ - --input ${VALID_FILE} \ - --output-prefix ${OUTPUT_DIR}/nemotron_${SPLIT_NAME}_validation \ - --tokenizer-type ${TOKENIZER_TYPE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --append-eod \ - --workers ${WORKERS} \ - --json-keys text -else - echo "Warning: Validation file not found: ${VALID_FILE}" -fi - -# Process test split -TEST_FILE="${INPUT_DIR}/nemotron_${SPLIT_NAME}_test.jsonl" -if [ -f "${TEST_FILE}" ]; then - echo "Processing test split: ${TEST_FILE}" - python ${MLM_DIR}/tools/preprocess_data.py \ - --input ${TEST_FILE} \ - --output-prefix ${OUTPUT_DIR}/nemotron_${SPLIT_NAME}_test \ - --tokenizer-type ${TOKENIZER_TYPE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --append-eod \ - --workers ${WORKERS} \ - --json-keys text -else - echo "Warning: Test file not found: ${TEST_FILE}" -fi - -# Create datablend config -BLEND_FILE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_${SPLIT_NAME}.json" -echo "Creating datablend config: ${BLEND_FILE}" -cat > ${BLEND_FILE} << EOF -{ - "train": [1.0, "${OUTPUT_DIR}/nemotron_${SPLIT_NAME}_train_text_document"], - "valid": [1.0, "${OUTPUT_DIR}/nemotron_${SPLIT_NAME}_validation_text_document"], - "test": [1.0, "${OUTPUT_DIR}/nemotron_${SPLIT_NAME}_test_text_document"] -} -EOF - -echo "==========================================" -echo "✓ Nemotron-v1 (${SPLIT_NAME}) preprocessing complete!" -echo "Output directory: ${OUTPUT_DIR}" -echo "Datablend config: ${BLEND_FILE}" -echo "" -echo "To run QAD training:" -echo " bash qwen_qad.sh 1e-5 Qwen3-8B False nemotron_${SPLIT_NAME}" -echo "==========================================" - diff --git a/examples/llm_qad/data_utils/process_nemotron_v1_qwen3-8B.sh b/examples/llm_qad/data_utils/process_nemotron_v1.sh old mode 100644 new mode 100755 similarity index 58% rename from examples/llm_qad/data_utils/process_nemotron_v1_qwen3-8B.sh rename to examples/llm_qad/data_utils/process_nemotron_v1.sh index 82bd5fbbf..1cf9aed6e --- a/examples/llm_qad/data_utils/process_nemotron_v1_qwen3-8B.sh +++ b/examples/llm_qad/data_utils/process_nemotron_v1.sh @@ -1,44 +1,108 @@ #!/bin/bash -# Preprocess Nemotron-v1 dataset (split mode) for Qwen3 QAD training -# -# New folder structure from download_nemotron_v1.py: -# nemotron_v1/ -# ├── stem/ -# │ ├── stem_30pct_cot_chat_train.jsonl -# │ └── ... -# ├── math/ -# │ └── ... 
+# Preprocess Nemotron-v1 dataset for QAD training (general, model-agnostic) # # Usage: -# bash process_nemotron_v1_qwen3-8B.sh [tokenizer] +# bash process_nemotron_v1.sh --output-dir --mlm-path --tokenizer [options] +# +# Required arguments: +# --output-dir Output directory for preprocessed files +# --mlm-path Path to Megatron-LM directory +# --tokenizer HuggingFace tokenizer model (e.g., Qwen/Qwen3-8B) # -# Examples: -# bash process_nemotron_v1_qwen3-8B.sh stem 30pct_cot_chat # Default: Qwen3-8B -# bash process_nemotron_v1_qwen3-8B.sh stem 30pct_cot_chat Qwen/Qwen3-30B-A3B-Thinking-2507 # Thinking model +# Optional arguments: +# --input-dir Input directory (default: derived from output-dir) +# --split Split name: stem, math, code, chat (default: stem) +# --suffix Suffix for file naming (default: 30pct_cot_chat) +# --workers Number of parallel workers (default: 32) +# --datablend-dir Directory for datablend configs (default: parent of output-dir) set -e -# Ensure transformers is installed for tokenizer -pip install -q transformers tokenizers +# Parse arguments +OUTPUT_DIR="" +MLM_DIR="" +TOKENIZER_MODEL="" +INPUT_DIR="" +SPLIT="stem" +SUFFIX="30pct_cot_chat" +WORKERS=32 +DATABLEND_DIR="" + +while [[ $# -gt 0 ]]; do + case $1 in + --output-dir) + OUTPUT_DIR="$2" + shift 2 + ;; + --mlm-path) + MLM_DIR="$2" + shift 2 + ;; + --tokenizer) + TOKENIZER_MODEL="$2" + shift 2 + ;; + --input-dir) + INPUT_DIR="$2" + shift 2 + ;; + --split) + SPLIT="$2" + shift 2 + ;; + --suffix) + SUFFIX="$2" + shift 2 + ;; + --workers) + WORKERS="$2" + shift 2 + ;; + --datablend-dir) + DATABLEND_DIR="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Validate required arguments +if [ -z "$OUTPUT_DIR" ]; then + echo "Error: --output-dir is required" + exit 1 +fi + +if [ -z "$MLM_DIR" ]; then + echo "Error: --mlm-path is required" + exit 1 +fi + +if [ -z "$TOKENIZER_MODEL" ]; then + echo "Error: --tokenizer is required" + exit 1 +fi -# Arguments -SPLIT="${1:-stem}" # stem, math, code, chat -SUFFIX="${2:-30pct_cot_chat}" -TOKENIZER_MODEL="${3:-Qwen/Qwen3-8B}" # Can override with any HuggingFace tokenizer +# Set defaults for optional arguments +if [ -z "$INPUT_DIR" ]; then + INPUT_DIR="${OUTPUT_DIR//_preprocessed/}" +fi -# Paths -MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" -INPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v1" -OUTPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v1_preprocessed" +if [ -z "$DATABLEND_DIR" ]; then + DATABLEND_DIR="$(dirname "$OUTPUT_DIR")" +fi mkdir -p ${OUTPUT_DIR}/${SPLIT} +mkdir -p ${DATABLEND_DIR} + +# Ensure transformers is installed for tokenizer +pip install -q transformers tokenizers # Tokenizer settings TOKENIZER_TYPE="HuggingFaceTokenizer" -# Number of workers for parallel processing -WORKERS=32 - # Full name for output files FULL_NAME="${SPLIT}_${SUFFIX}" @@ -48,8 +112,10 @@ echo "==========================================" echo "Split: ${SPLIT}" echo "Suffix: ${SUFFIX}" echo "Tokenizer: ${TOKENIZER_MODEL}" +echo "MLM Path: ${MLM_DIR}" echo "Input dir: ${INPUT_DIR}/${SPLIT}/" echo "Output dir: ${OUTPUT_DIR}/${SPLIT}/" +echo "Datablend dir: ${DATABLEND_DIR}" echo "==========================================" # Process training split @@ -67,7 +133,6 @@ if [ -f "${TRAIN_FILE}" ]; then else echo "❌ Error: Training file not found: ${TRAIN_FILE}" echo " Check if download was successful." 
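+    # Expected location: ${INPUT_DIR}/${SPLIT}/${SPLIT}_${SUFFIX}_train.jsonl,
+    # i.e. the per-split folder layout produced by download_nemotron_v1.py.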
- echo " Expected file pattern: ${INPUT_DIR}/${SPLIT}/${FULL_NAME}_train.jsonl" ls -la ${INPUT_DIR}/${SPLIT}/ 2>/dev/null || echo " Directory doesn't exist: ${INPUT_DIR}/${SPLIT}/" exit 1 fi @@ -105,7 +170,7 @@ else fi # Create datablend config -BLEND_FILE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v1_${FULL_NAME}.json" +BLEND_FILE="${DATABLEND_DIR}/datablend_nemotron_v1_${FULL_NAME}.json" echo "Creating datablend config: ${BLEND_FILE}" cat > ${BLEND_FILE} << EOF { diff --git a/examples/llm_qad/data_utils/process_nemotron_v2_qwen3-8B.sh b/examples/llm_qad/data_utils/process_nemotron_v2.sh old mode 100644 new mode 100755 similarity index 57% rename from examples/llm_qad/data_utils/process_nemotron_v2_qwen3-8B.sh rename to examples/llm_qad/data_utils/process_nemotron_v2.sh index 2194d5848..6a56eb288 --- a/examples/llm_qad/data_utils/process_nemotron_v2_qwen3-8B.sh +++ b/examples/llm_qad/data_utils/process_nemotron_v2.sh @@ -1,61 +1,127 @@ #!/bin/bash -# Preprocess Nemotron-v2 dataset for Qwen3 QAD training -# -# New folder structure from download_nemotron_v2.py: -# nemotron_v2/ -# ├── stem/ -# │ ├── stem_30pct_cot_train.jsonl -# │ └── ... -# ├── math/ -# │ └── ... +# Preprocess Nemotron-v2 dataset for QAD training (general, model-agnostic) # # Usage: -# bash process_nemotron_v2_qwen3-8B.sh [suffix] [tokenizer] +# bash process_nemotron_v2.sh --output-dir --mlm-path --tokenizer [options] +# +# Required arguments: +# --output-dir Output directory for preprocessed files +# --mlm-path Path to Megatron-LM directory +# --tokenizer HuggingFace tokenizer model (e.g., Qwen/Qwen3-8B) # -# Examples: -# bash process_nemotron_v2_qwen3-8B.sh stem 30pct_cot_chat # Default: Qwen3-8B -# bash process_nemotron_v2_qwen3-8B.sh stem 30pct_cot_chat Qwen/Qwen3-30B-A3B-Thinking-2507 # Thinking model +# Optional arguments: +# --input-dir Input directory (default: derived from output-dir) +# --split Split name: stem, math, code, chat (default: stem) +# --suffix Suffix for file naming (default: 30pct_cot_chat) +# --workers Number of parallel workers (default: 32) +# --datablend-dir Directory for datablend configs (default: parent of output-dir) set -e -# Ensure transformers is installed for tokenizer -pip install -q transformers tokenizers +# Parse arguments +OUTPUT_DIR="" +MLM_DIR="" +TOKENIZER_MODEL="" +INPUT_DIR="" +SPLIT="stem" +SUFFIX="30pct_cot_chat" +WORKERS=32 +DATABLEND_DIR="" + +while [[ $# -gt 0 ]]; do + case $1 in + --output-dir) + OUTPUT_DIR="$2" + shift 2 + ;; + --mlm-path) + MLM_DIR="$2" + shift 2 + ;; + --tokenizer) + TOKENIZER_MODEL="$2" + shift 2 + ;; + --input-dir) + INPUT_DIR="$2" + shift 2 + ;; + --split) + SPLIT="$2" + shift 2 + ;; + --suffix) + SUFFIX="$2" + shift 2 + ;; + --workers) + WORKERS="$2" + shift 2 + ;; + --datablend-dir) + DATABLEND_DIR="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Validate required arguments +if [ -z "$OUTPUT_DIR" ]; then + echo "Error: --output-dir is required" + exit 1 +fi -# Arguments -SPLIT="${1:-stem}" # stem, math, code, chat -SUFFIX="${2:-30pct_cot}" # e.g., 30pct, 30pct_cot, 50pct_cot -TOKENIZER_MODEL="${3:-Qwen/Qwen3-8B}" # Can override with any HuggingFace tokenizer +if [ -z "$MLM_DIR" ]; then + echo "Error: --mlm-path is required" + exit 1 +fi + +if [ -z "$TOKENIZER_MODEL" ]; then + echo "Error: --tokenizer is required" + exit 1 +fi + +# Set defaults for optional arguments +if [ -z "$INPUT_DIR" ]; then + INPUT_DIR="${OUTPUT_DIR//_preprocessed/}" +fi + +if [ -z "$DATABLEND_DIR" ]; then + 
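# Default: write datablend JSONs next to the output dir's parent (e.g. --output-dir /data/nemotron_v2_preprocessed puts them in /data; example path only). +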
DATABLEND_DIR="$(dirname "$OUTPUT_DIR")" +fi # Normalize suffix (handle both 30pctcot and 30pct_cot) SUFFIX=$(echo "$SUFFIX" | sed 's/pctcot/pct_cot/g') -# Paths -MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" -INPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v2" -OUTPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/nemotron_v2_preprocessed" - mkdir -p ${OUTPUT_DIR}/${SPLIT} +mkdir -p ${DATABLEND_DIR} + +# Ensure transformers is installed for tokenizer +pip install -q transformers tokenizers # Tokenizer settings TOKENIZER_TYPE="HuggingFaceTokenizer" -# Number of workers for parallel processing -WORKERS=32 - # Full name for output files FULL_NAME="${SPLIT}_${SUFFIX}" echo "==========================================" -echo "Preprocessing Nemotron-v2 Dataset for Qwen3-8B" +echo "Preprocessing Nemotron-v2 Dataset" echo "==========================================" echo "Split: ${SPLIT}" echo "Suffix: ${SUFFIX}" +echo "Tokenizer: ${TOKENIZER_MODEL}" +echo "MLM Path: ${MLM_DIR}" echo "Input dir: ${INPUT_DIR}/${SPLIT}/" echo "Output dir: ${OUTPUT_DIR}/${SPLIT}/" +echo "Datablend dir: ${DATABLEND_DIR}" echo "==========================================" # Process training split -# File pattern: nemotron_v2//__train.jsonl TRAIN_FILE="${INPUT_DIR}/${SPLIT}/${FULL_NAME}_train.jsonl" if [ -f "${TRAIN_FILE}" ]; then echo "Processing training split: ${TRAIN_FILE}" @@ -70,7 +136,6 @@ if [ -f "${TRAIN_FILE}" ]; then else echo "❌ Error: Training file not found: ${TRAIN_FILE}" echo " Check if download was successful." - echo " Expected file pattern: ${INPUT_DIR}/${SPLIT}/${FULL_NAME}_train.jsonl" ls -la ${INPUT_DIR}/${SPLIT}/ 2>/dev/null || echo " Directory doesn't exist: ${INPUT_DIR}/${SPLIT}/" exit 1 fi @@ -108,8 +173,7 @@ else fi # Create datablend config -# This matches what qwen_qad.sh expects -BLEND_FILE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_nemotron_v2_${FULL_NAME}.json" +BLEND_FILE="${DATABLEND_DIR}/datablend_nemotron_v2_${FULL_NAME}.json" echo "Creating datablend config: ${BLEND_FILE}" cat > ${BLEND_FILE} << EOF { @@ -127,11 +191,5 @@ echo "Datablend config: ${BLEND_FILE}" echo "" echo "To run QAD training:" echo " DATASET_NAME=nemotron_v2_${SPLIT}_${SUFFIX} bash qwen_qad.sh --config configs/your-config.conf" -echo "" -echo "Or set in config file:" -echo " export DATASET_NAME=\"nemotron_v2_${SPLIT}\"" -if [[ "$SUFFIX" == *"cot"* ]]; then - echo " # With chain-of-thought reasoning" - echo " export DATASET_NAME=\"nemotron_v2_${SPLIT}_cot\"" -fi echo "==========================================" + diff --git a/examples/llm_qad/data_utils/process_openscience_qwen3-8B.sh b/examples/llm_qad/data_utils/process_openscience.sh old mode 100644 new mode 100755 similarity index 57% rename from examples/llm_qad/data_utils/process_openscience_qwen3-8B.sh rename to examples/llm_qad/data_utils/process_openscience.sh index bec8eb394..eb18af9ff --- a/examples/llm_qad/data_utils/process_openscience_qwen3-8B.sh +++ b/examples/llm_qad/data_utils/process_openscience.sh @@ -1,19 +1,92 @@ #!/bin/bash -# Preprocess OpenScience dataset for Qwen3 QAD training +# Preprocess OpenScience dataset for QAD training (general, model-agnostic) # # Usage: -# bash process_openscience_qwen3-8B.sh [suffix] [tokenizer] +# bash process_openscience.sh --output-dir --mlm-path --tokenizer [options] # -# Examples: -# bash process_openscience_qwen3-8B.sh # Simple format, Qwen3-8B -# bash process_openscience_qwen3-8B.sh chat # Chat template, Qwen3-8B -# bash 
process_openscience_qwen3-8B.sh chat Qwen/Qwen3-30B-A3B-Thinking-2507 # Chat template, Thinking model +# Required arguments: +# --output-dir Output directory for preprocessed files +# --mlm-path Path to Megatron-LM directory +# --tokenizer HuggingFace tokenizer model (e.g., Qwen/Qwen3-8B) +# +# Optional arguments: +# --input-dir Input directory (default: derived from output-dir) +# --suffix Suffix for file naming (empty for simple format, "chat" for chat template) +# --workers Number of parallel workers (default: 32) +# --datablend-dir Directory for datablend configs (default: parent of output-dir) set -e -# Arguments -SUFFIX="${1:-}" # empty for simple format, "chat" for chat template -TOKENIZER_MODEL="${2:-Qwen/Qwen3-8B}" # Can override with any HuggingFace tokenizer +# Parse arguments +OUTPUT_DIR="" +MLM_DIR="" +TOKENIZER_MODEL="" +INPUT_DIR="" +SUFFIX="" +WORKERS=32 +DATABLEND_DIR="" + +while [[ $# -gt 0 ]]; do + case $1 in + --output-dir) + OUTPUT_DIR="$2" + shift 2 + ;; + --mlm-path) + MLM_DIR="$2" + shift 2 + ;; + --tokenizer) + TOKENIZER_MODEL="$2" + shift 2 + ;; + --input-dir) + INPUT_DIR="$2" + shift 2 + ;; + --suffix) + SUFFIX="$2" + shift 2 + ;; + --workers) + WORKERS="$2" + shift 2 + ;; + --datablend-dir) + DATABLEND_DIR="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Validate required arguments +if [ -z "$OUTPUT_DIR" ]; then + echo "Error: --output-dir is required" + exit 1 +fi + +if [ -z "$MLM_DIR" ]; then + echo "Error: --mlm-path is required" + exit 1 +fi + +if [ -z "$TOKENIZER_MODEL" ]; then + echo "Error: --tokenizer is required" + exit 1 +fi + +# Set defaults for optional arguments +if [ -z "$INPUT_DIR" ]; then + INPUT_DIR="${OUTPUT_DIR//_preprocessed/}" +fi + +if [ -z "$DATABLEND_DIR" ]; then + DATABLEND_DIR="$(dirname "$OUTPUT_DIR")" +fi # Normalize suffix if [ -n "$SUFFIX" ]; then @@ -22,23 +95,21 @@ else FILE_SUFFIX="" fi -# Paths -MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" -INPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/openscience_splits" -OUTPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/openscience_splits_preprocessed" - mkdir -p ${OUTPUT_DIR} +mkdir -p ${DATABLEND_DIR} # Tokenizer settings TOKENIZER_TYPE="HuggingFaceTokenizer" -# Number of workers for parallel processing -WORKERS=32 - echo "==========================================" echo "Preprocessing OpenScience Dataset" +echo "==========================================" echo "Format suffix: ${FILE_SUFFIX:-none (simple format)}" echo "Tokenizer: ${TOKENIZER_MODEL}" +echo "MLM Path: ${MLM_DIR}" +echo "Input dir: ${INPUT_DIR}" +echo "Output dir: ${OUTPUT_DIR}" +echo "Datablend dir: ${DATABLEND_DIR}" echo "==========================================" # Process training split @@ -91,7 +162,7 @@ else fi # Create datablend config -BLEND_FILE="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/datablend_openscience${FILE_SUFFIX}.json" +BLEND_FILE="${DATABLEND_DIR}/datablend_openscience${FILE_SUFFIX}.json" echo "Creating datablend config: ${BLEND_FILE}" cat > ${BLEND_FILE} << EOF { @@ -115,3 +186,4 @@ ls -lh ${OUTPUT_DIR}/openscience${FILE_SUFFIX}*.idx 2>/dev/null || echo "No .idx echo "" echo "To use in QAD training:" echo " DATASET_NAME=openscience${FILE_SUFFIX} bash qwen_qad.sh --config configs/your-config.conf" + diff --git a/examples/llm_qad/data_utils/process_slimorca_qwen3-8B.sh b/examples/llm_qad/data_utils/process_slimorca_qwen3-8B.sh deleted file mode 100644 index 98b237805..000000000 --- 
a/examples/llm_qad/data_utils/process_slimorca_qwen3-8B.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# Preprocess SlimOrca dataset for Qwen3-8B QAD training - -set -e - -# Paths -MLM_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/workspace/Megatron-LM" -INPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/slimorca" -OUTPUT_DIR="/lustre/fsw/coreai_dlalgo_modelopt/weimingc/datasets/slimorca_preprocessed" - -mkdir -p ${OUTPUT_DIR} - -# Tokenizer settings for Qwen3-8B -TOKENIZER_TYPE="HuggingFaceTokenizer" -TOKENIZER_MODEL="Qwen/Qwen3-8B" - -# Number of workers for parallel processing -WORKERS=32 - -echo "==========================================" -echo "Preprocessing SlimOrca Dataset for Qwen3-8B" -echo "==========================================" - -# Process training split -if [ -f "${INPUT_DIR}/slimorca_train.jsonl" ]; then - echo "Processing training split..." - python ${MLM_DIR}/tools/preprocess_data.py \ - --input ${INPUT_DIR}/slimorca_train.jsonl \ - --output-prefix ${OUTPUT_DIR}/slimorca_train \ - --tokenizer-type ${TOKENIZER_TYPE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --append-eod \ - --workers ${WORKERS} \ - --json-keys text -fi - -# Process validation split -if [ -f "${INPUT_DIR}/slimorca_validation.jsonl" ]; then - echo "Processing validation split..." - python ${MLM_DIR}/tools/preprocess_data.py \ - --input ${INPUT_DIR}/slimorca_validation.jsonl \ - --output-prefix ${OUTPUT_DIR}/slimorca_validation \ - --tokenizer-type ${TOKENIZER_TYPE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --append-eod \ - --workers ${WORKERS} \ - --json-keys text -fi - -# Process test split -if [ -f "${INPUT_DIR}/slimorca_test.jsonl" ]; then - echo "Processing test split..." - python ${MLM_DIR}/tools/preprocess_data.py \ - --input ${INPUT_DIR}/slimorca_test.jsonl \ - --output-prefix ${OUTPUT_DIR}/slimorca_test \ - --tokenizer-type ${TOKENIZER_TYPE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --append-eod \ - --workers ${WORKERS} \ - --json-keys text -fi - -echo "==========================================" -echo "✓ SlimOrca preprocessing complete!" 
-echo "Output directory: ${OUTPUT_DIR}" -echo "==========================================" - From fe4c12d6ebd674c1a2de67ec10d58bfe08e6d2e4 Mon Sep 17 00:00:00 2001 From: weimingc <17592131+meenchen@users.noreply.github.com> Date: Fri, 19 Dec 2025 11:43:47 -0800 Subject: [PATCH 04/16] refactor Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com> --- examples/llm_qad/README.md | 480 +++++---- examples/llm_qad/configs/README.md | 114 --- .../qwen3-30b-a3b-instruct-2507-moe.conf | 56 -- ...n3-30b-a3b-instruct-2507-moe_template.conf | 69 ++ .../llm_qad/configs/qwen3-30b-a3b-moe.conf | 55 -- .../qwen3-30b-a3b-thinking-2507-moe.conf | 56 -- .../llm_qad/configs/qwen3-8b-nemotron.conf | 53 - .../{qwen3-8b-default.conf => qwen3-8b.conf} | 0 examples/llm_qad/configs/template.conf | 124 --- .../data_utils/download_nemotron_v1.py | 540 ---------- .../llm_qad/data_utils/generate_dataset.sh | 219 ++++ .../llm_qad/data_utils/process_nemotron_v1.sh | 192 ---- examples/llm_qad/qad.sh | 522 ++++++++++ examples/llm_qad/qwen_qad.sh | 932 ------------------ .../{sbatch_qwen_qad.sh => sbatch_qad.sh} | 74 +- 15 files changed, 1095 insertions(+), 2391 deletions(-) delete mode 100644 examples/llm_qad/configs/README.md delete mode 100644 examples/llm_qad/configs/qwen3-30b-a3b-instruct-2507-moe.conf create mode 100644 examples/llm_qad/configs/qwen3-30b-a3b-instruct-2507-moe_template.conf delete mode 100644 examples/llm_qad/configs/qwen3-30b-a3b-moe.conf delete mode 100644 examples/llm_qad/configs/qwen3-30b-a3b-thinking-2507-moe.conf delete mode 100644 examples/llm_qad/configs/qwen3-8b-nemotron.conf rename examples/llm_qad/configs/{qwen3-8b-default.conf => qwen3-8b.conf} (100%) delete mode 100644 examples/llm_qad/configs/template.conf delete mode 100644 examples/llm_qad/data_utils/download_nemotron_v1.py create mode 100755 examples/llm_qad/data_utils/generate_dataset.sh delete mode 100755 examples/llm_qad/data_utils/process_nemotron_v1.sh create mode 100644 examples/llm_qad/qad.sh delete mode 100644 examples/llm_qad/qwen_qad.sh rename examples/llm_qad/{sbatch_qwen_qad.sh => sbatch_qad.sh} (71%) diff --git a/examples/llm_qad/README.md b/examples/llm_qad/README.md index a05ad72cc..59be3aa06 100644 --- a/examples/llm_qad/README.md +++ b/examples/llm_qad/README.md @@ -1,360 +1,352 @@ -# Qwen QAD Training Scripts +# QAD Training Scripts -Quantization-Aware Distillation (QAD) training scripts for Qwen models using Megatron-LM. These scripts enable training quantized (NVFP4) student models with knowledge distillation from full-precision teacher models. +Quantization-Aware Distillation (QAD) training scripts for language models using Megatron-LM. These scripts enable training quantized (NVFP4) student models with knowledge distillation from full-precision teacher models. 
## Overview | Script | Purpose | |--------|---------| -| `qwen_qad.sh` | Main training script (interactive/Docker) | -| `sbatch_qwen_qad.sh` | SLURM batch submission wrapper | +| `qad.sh` | Main training script (run inside container) | +| `sbatch_qad.sh` | SLURM batch submission wrapper | | `configs/*.conf` | Model-specific configuration files | -## Quick Start +## Requirements -### SLURM Batch Submission (Recommended) for H100 x 8 +### Software Dependencies -```bash -# With HuggingFace token (for gated models) -sbatch sbatch_qwen_qad.sh --hf-token $HF_TOKEN --config configs/qwen3-30b-a3b-thinking-2507-moe.conf -``` +- **Container**: Nvidia PyTorch container (tested with `nvcr.io/nvidia/pytorch:25.06-py3`) +- **Python**: 3.10+ +- **transformers**: 4.54+ (installed automatically) -### Interactive Mode +### Clone Required Repositories ```bash -# Get interactive node first -srun -A coreai_dlalgo_modelopt --nodes=1 -p batch --mpi=pmix \ - -J qwen-qad:dev \ - --container-image=/lustre/.../pytorch_25.06-py3.sqsh \ - --container-mounts="/lustre/fsw:/lustre/fsw" \ - -t 4:0:0 --pty bash - -# Run training -bash qwen_qad.sh --config configs/qwen3-8b-default.conf -``` - -## Configuration Files - -Configuration files in `configs/` define model architecture, parallelism, and checkpoint paths. - -### Required Config Variables +# Set your workspace directory +export WORKSPACE=/path/to/your/workspace -| Variable | Description | Example | -|----------|-------------|---------| -| `STUDENT_MODEL` | Student model name | `Qwen3-8B` | -| `TEACHER_MODEL` | Teacher model name | `Qwen3-8B` | -| `STUDENT_CKPT` | Path to quantized student checkpoint | `/path/to/Qwen3-8B-NVFP4-TP1-MLM` | -| `TEACHER_CKPT` | Path to teacher checkpoint | `/path/to/Qwen3-8B-TP1-MLM` | -| `TEACHER_MODEL_CONFIG` | Teacher model YAML config | `/path/to/Qwen3-8B-teacher.yaml` | -| `TP_SIZE` | Tensor parallelism size | `1`, `4`, `8` | -| `MBS` | Micro-batch size | `1`, `2`, `4` | +# Clone Megatron-LM (with ModelOpt integration) +git clone https://github.com/NVIDIA/Megatron-LM.git ${WORKSPACE}/Megatron-LM +cd ${WORKSPACE}/Megatron-LM +git checkout # Use branch with ModelOpt support -### Optional Config Variables +# Clone Model-Optimizer +git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git ${WORKSPACE}/Model-Optimizer +``` -| Variable | Default | Description | -|----------|---------|-------------| -| `PP_SIZE` | `1` | Pipeline parallelism size | -| `EP_SIZE` | `1` | Expert parallelism (MoE models) | -| `NUM_GPUS` | `8` | GPUs per node | -| `LR` | `1e-6` | Learning rate | -| `DATASET_NAME` | `openscience` | Training dataset | -| `TRAIN_SAMPLES` | Auto | Override training samples | -| `BLEND_PATH` | Auto | Override datablend path | +### Prepare Container -### Example Config Structure +For SLURM with Pyxis/Enroot, create a squashfs container: ```bash -# configs/qwen3-8b-default.conf -export STUDENT_MODEL="Qwen3-8B" -export TEACHER_MODEL="Qwen3-8B" -export STUDENT_CKPT="/path/to/Qwen3-8B-NVFP4-TP1-MLM" -export TEACHER_CKPT="/path/to/Qwen3-8B-TP1-MLM" -export TEACHER_MODEL_CONFIG="/path/to/Qwen3-8B-teacher.yaml" -export TP_SIZE=1 -export PP_SIZE=1 -export MBS=4 -export NUM_GPUS=8 -export DATASET_NAME="combined_v2_cot_chat" +# Pull and convert Docker image to sqsh +enroot import docker://nvcr.io/nvidia/pytorch:25.06-py3 +mv nvidia+pytorch+25.06-py3.sqsh /path/to/containers/pytorch_25.06.sqsh ``` -## Dataset Options +### Prepare Checkpoints -### Naming Convention +You need the following checkpoints before training: -Datasets follow this 
naming pattern: +1. **Student checkpoint**: Quantized (NVFP4) model in Megatron-LM format +2. **Teacher checkpoint**: Full-precision (BF16) model in Megatron-LM format +3. **Teacher config YAML**: Model architecture configuration -- **Plain text**: `datablend_.json` -- **With COT** (chain-of-thought): `datablend__cot.json` -- **With chat template**: `datablend__chat.json` -- **COT + chat**: `datablend__cot_chat.json` +See [Megatron-LM ModelOpt examples](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt) for checkpoint conversion from HuggingFace format. -### Available Datasets +## Creating a Configuration -#### Nemotron-v1 (Large scale, ~25M samples full) +### Use Template Configs -| Name | Samples | Description | -|------|---------|-------------| -| `nemotron_30pct` | ~7.5M | ALL subjects @ 30% | -| `nemotron_30pct_cot_chat` | ~7.5M | ALL @ 30% + COT + Chat | -| `nemotron_stem_cot_chat` | ~5M | STEM only + COT + Chat | -| `nemotron_v1_math_30pct_cot_chat` | ~583K | Math split | -| `nemotron_v1_code_30pct_cot_chat` | ~540K | Code split | +Template configurations are provided in `configs/`: -#### Nemotron-v2 (High quality, ~400K samples @ 30%) +| Config | Model | Description | +|--------|-------|-------------| +| `qwen3-30b-a3b-instruct-2507-moe_template.conf` | Qwen3-30B-A3B-Instruct | MoE template (start here) | +| `qwen3-8b.conf` | Qwen3-8B | Dense model example | -| Name | Samples | Description | -|------|---------|-------------| -| `nemotron_v2_30pct` | ~398K | English @ 30% | -| `nemotron_v2_cot_chat` | ~398K | English + COT + Chat | -| `nemotron_v2_stem_30pct_cot_chat` | ~101K | STEM split | -| `nemotron_v2_math_30pct_cot_chat` | ~68K | Math split | -| `nemotron_v2_code_30pct_cot_chat` | ~50K | Code split | +### Create Your Config -#### OpenScience +1. Copy the template: + ```bash + cp configs/qwen3-30b-a3b-instruct-2507-moe_template.conf configs/my-experiment.conf + ``` -| Name | Samples | Description | -|------|---------|-------------| -| `openscience` | ~300K | Plain text | -| `openscience_chat` | ~300K | With chat template | +2. Fill in required empty fields: + - `STUDENT_CKPT` - Path to quantized student MLM checkpoint + - `TEACHER_CKPT` - Path to teacher MLM checkpoint + - `TEACHER_MODEL_CONFIG` - Path to teacher YAML config (see below) + - `MLM_DIR` - Path to your Megatron-LM clone -#### Combined Datasets (Recommended) +3. Optionally adjust: + - `QAD_CHECKPOINT_ROOT`, `DATACACHE_DIR` - output paths + - `CONTAINER_IMAGE`, `CONTAINER_MOUNTS` - container settings + - `BLEND_PATH` - dataset path -| Name | Samples | Description | -|------|---------|-------------| -| `combined_cot_chat` | ~8.2M | 20% OpenScience + 50% v1 + 30% v2 | -| `combined_v2_cot_chat` | ~1M | Code & Math focused blend | +### Teacher Model Config (YAML) -## Parallelism Settings - -### Dense Models (Qwen3-8B) +Create a YAML file with teacher model architecture (example: `configs/Qwen3-30B-A3B-teacher.yaml`): -```bash -TP_SIZE=1 # Single GPU per tensor -PP_SIZE=1 # No pipeline parallelism -EP_SIZE=1 # Not MoE -MBS=4 # Can use larger micro-batch +```yaml +num_layers: 48 +hidden_size: 2048 +num_attention_heads: 32 +num_query_groups: 4 +kv_channels: 128 +ffn_hidden_size: 6144 ``` -### MoE Models (Qwen3-30B-A3B) +Set `TEACHER_MODEL_CONFIG` in your config to point to this file. + +## Dataset Generation + +QAD training requires preprocessed datasets in Megatron-LM format. 
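+Under the hood, "Megatron-LM format" means each JSONL split is tokenized into `.bin`/`.idx`
+index files by Megatron-LM's `tools/preprocess_data.py`; the helper scripts wrap a call
+roughly like this sketch (paths are illustrative):
+
+```bash
+python ${MLM_DIR}/tools/preprocess_data.py \
+    --input /path/to/datasets/nemotron_v2/stem/stem_30pct_cot_chat_train.jsonl \
+    --output-prefix /path/to/datasets/nemotron_v2_preprocessed/stem/stem_30pct_cot_chat_train \
+    --tokenizer-type HuggingFaceTokenizer \
+    --tokenizer-model Qwen/Qwen3-30B-A3B-Instruct-2507 \
+    --append-eod \
+    --workers 32 \
+    --json-keys text
+```
+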
Use the one-button script to generate datasets: ```bash -TP_SIZE=4 # Tensor parallel across 4 GPUs -PP_SIZE=1 # No pipeline parallelism -EP_SIZE=8 # 128 experts / 8 = 16 experts per rank -MBS=1 # Small MBS for large vocab KD loss -``` +cd data_utils/ -**Note**: MoE models with EP=8 require 4 nodes (32 GPUs total). +bash generate_dataset.sh \ + --output-dir /path/to/datasets \ + --mlm-path /path/to/Megatron-LM \ + --tokenizer (e.g., Qwen/Qwen3-30B-A3B-Instruct-2507) +``` -### GPU Requirements +### Requirements -| Model | TP | EP | Nodes | Total GPUs | -|-------|----|----|-------|------------| -| Qwen3-8B | 1 | 1 | 1 | 8 | -| Qwen3-30B-A3B | 4 | 4 | 2 | 16 | -| Qwen3-30B-A3B | 4 | 8 | 4 | 32 | +- HuggingFace token to access `nvidia/Nemotron-Post-Training-Dataset-v2` +- Login first: `huggingface-cli login` -## Multi-Node Training +### What It Does -### SLURM Multi-Node +1. Downloads OpenScience + Nemotron-v2 datasets +2. Preprocesses to Megatron-LM format +3. Creates combined datablend JSON with weights: + - 30% Nemotron-v2 code + - 20% Nemotron-v2 math + - 20% Nemotron-v2 stem + - 10% Nemotron-v2 chat + - 20% OpenScience -```bash -# Set nodes in sbatch header or command line -#SBATCH --nodes=4 +### Output -# Or override at submission -sbatch --nodes=4 sbatch_qwen_qad.sh --config configs/qwen3-30b-a3b-thinking-2507-moe.conf +``` +/path/to/datasets/ +├── openscience_splits_preprocessed/ # Megatron format +├── nemotron_v2_preprocessed/ # Megatron format +└── datablend_combined.json # Combined config ``` -The script automatically: - -- Detects `SLURM_NNODES` and `SLURM_JOB_NODELIST` -- Sets `MASTER_ADDR` to first node -- Exports `NODE_RANK` per process +Set `BLEND_PATH` in your config to point to `datablend_combined.json`. -### Manual Multi-Node (Interactive) +## Quick Start -On each node, set: +### SLURM Batch Submission (Recommended) ```bash -export NNODES=4 -export NODE_RANK=0 # 0, 1, 2, 3 for each node -export MASTER_ADDR= -export MASTER_PORT=29500 -bash qwen_qad.sh --config configs/your-config.conf -``` - -## Resuming Training +# Submit training job +sbatch sbatch_qad.sh --config configs/qwen3-30b-a3b-instruct-2507-moe.conf -Training automatically resumes from checkpoints: +# With HuggingFace token (for gated models) +sbatch sbatch_qad.sh --hf-token $HF_TOKEN --config configs/qwen3-30b-a3b-thinking-2507-moe.conf -1. **Fresh start**: Loads from `STUDENT_CKPT` with `--finetune` -2. **Resume**: If `CHECKPOINT_DIR/latest_checkpointed_iteration.txt` exists, loads from there +# Multi-node (override SLURM header) +sbatch --nodes=4 sbatch_qad.sh --config configs/qwen3-30b-a3b-instruct-2507-moe.conf +``` -To force fresh start, remove the checkpoint directory: +### Interactive Mode ```bash -rm -rf /path/to/checkpoints/*/latest_checkpointed_iteration.txt -``` +# Get interactive node first +srun -A --nodes=1 -p batch --mpi=pmix \ + -J qad:dev \ + --container-image=nvcr.io/nvidia/pytorch:25.06-py3 \ + --container-mounts="..." \ + -t 4:0:0 --pty bash -## Job Dependencies +# Run training +bash qad.sh --config configs/qwen3-8b.conf +``` -Chain jobs to run sequentially: +## Required Config Variables -```bash -# Submit first job -JOB1=$(sbatch --parsable sbatch_qwen_qad.sh --config ...) +### Model Configuration -# Submit dependent job (runs after JOB1 finishes, regardless of success/failure) -sbatch --dependency=afterany:$JOB1 sbatch_qwen_qad.sh --config ... 
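+The variables below are set (typically via `export`) in the shell config file passed with
+`--config`. A minimal hypothetical sketch, with placeholder paths and sizes (see the template
+configs for the full list):
+
+```bash
+# configs/my-experiment.conf (hypothetical values)
+export STUDENT_MODEL="Qwen3-30B-A3B"
+export TEACHER_MODEL="Qwen3-30B-A3B"
+export TOKENIZER_MODEL="Qwen/Qwen3-30B-A3B-Instruct-2507"
+export IS_MOE=true
+export STUDENT_CKPT="/path/to/Qwen3-30B-A3B-NVFP4-MLM"
+export TEACHER_CKPT="/path/to/Qwen3-30B-A3B-MLM"
+export TEACHER_MODEL_CONFIG="/path/to/configs/Qwen3-30B-A3B-teacher.yaml"
+export MLM_DIR="/path/to/Megatron-LM"
+export TP_SIZE=2
+export EP_SIZE=4
+export MBS=2
+export LR=1e-5
+export BLEND_PATH="/path/to/datasets/datablend_combined.json"
+```
+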
-``` +| Variable | Description | Example | +|----------|-------------|---------| +| `STUDENT_MODEL` | Student model name (for logging) | `Qwen3-30B-A3B` | +| `TEACHER_MODEL` | Teacher model name (for logging) | `Qwen3-30B-A3B` | +| `TOKENIZER_MODEL` | HuggingFace tokenizer path | `Qwen/Qwen3-30B-A3B-Instruct-2507` | +| `IS_MOE` | Whether model is Mixture of Experts | `true` or `false` | -Dependency options: +### Checkpoint Paths -- `afterany:jobid` - Run after job finishes (success or failure) -- `afterok:jobid` - Run only if job succeeds -- `afternotok:jobid` - Run only if job fails +| Variable | Description | +|----------|-------------| +| `STUDENT_CKPT` | Path to quantized student MLM checkpoint | +| `TEACHER_CKPT` | Path to teacher MLM checkpoint | +| `TEACHER_MODEL_CONFIG` | Path to teacher model YAML config | +| `STUDENT_CONFIG_FILE` | Path to student model args script (in Megatron-LM) | -## Environment Variables +### Training Hyperparameters -### HuggingFace Authentication +| Variable | Description | Example | +|----------|-------------|---------| +| `LR` | Learning rate | `1e-5` | +| `GBS` | Global batch size | `256` | +| `MIN_LR` | Minimum learning rate | `0.0` | +| `LR_DECAY_STYLE` | LR decay schedule | `constant`, `cosine` | +| `SAVE_INTERVAL` | Checkpoint save interval (iterations) | `200` | +| `LOG_INTERVAL` | Logging interval (iterations) | `10` | -```bash -# Via argument (recommended - not logged) -sbatch sbatch_qwen_qad.sh --hf-token $HF_TOKEN --config ... +### Data Configuration -# Via environment -export HF_TOKEN=hf_xxx -sbatch sbatch_qwen_qad.sh --config ... -``` +| Variable | Description | +|----------|-------------| +| `DATASET_NAME` | Dataset identifier (for output naming) | +| `BLEND_PATH` | Path to datablend JSON file | +| `TRAIN_SAMPLES` | Number of training samples | -### Path Overrides +### Parallelism -```bash -export MLM_DIR=/path/to/Megatron-LM -export MODELOPT_DIR=/path/to/TensorRT-Model-Optimizer -export MODELS_ROOT=/path/to/models -export QAD_CHECKPOINT_ROOT=/path/to/checkpoints -export DATACACHE_DIR=/path/to/data_cache -``` +| Variable | Description | Example | +|----------|-------------|---------| +| `TP_SIZE` | Tensor parallelism size | `1`, `2`, `4` | +| `PP_SIZE` | Pipeline parallelism size | `1` | +| `EP_SIZE` | Expert parallelism (MoE only) | `4`, `8` | +| `MBS` | Micro-batch size | `1`, `2` | +| `NUM_GPUS` | GPUs per node | `4`, `8` | -### Training Overrides +### Required Paths -```bash -export LR=1e-5 # Learning rate -export DATASET_NAME=nemotron_v2 # Dataset -export TRAIN_SAMPLES=100000 # Override sample count -export ITERATIONS_TO_SKIP=100 # Skip first N iterations -``` +| Variable | Description | +|----------|-------------| +| `MLM_DIR` | Path to Megatron-LM directory | +| `MODELOPT_DIR` | Path to Model-Optimizer directory | +| `QAD_CHECKPOINT_ROOT` | Root directory for checkpoints | +| `DATACACHE_DIR` | Directory for data cache | -## Output Structure +### Container Configuration -```bash -$QAD_CHECKPOINT_ROOT/ -├── -Teacher--Data--lr/ -│ ├── checkpoints// -│ │ ├── iter_0000200/ -│ │ ├── iter_0000400/ -│ │ └── latest_checkpointed_iteration.txt -│ ├── tensorboard// -│ └── logs/ -│ ├── _qad_.log -│ └── _.env.log -└── logs_slurm/ - ├── coreai_dlalgo_modelopt-qwen.qad__.log - └── err_coreai_dlalgo_modelopt-qwen.qad__.log -``` +| Variable | Description | +|----------|-------------| +| `CONTAINER_IMAGE` | Path to container sqsh file | +| `CONTAINER_MOUNTS` | Container mount points | +| `CONTAINER_WORKDIR` | Working directory inside container | -## 
Monitoring +## Optional Config Variables -### TensorBoard +| Variable | Default | Description | +|----------|---------|-------------| +| `MASTER_PORT` | `29500` | Distributed training port | +| `MAX_SEQ` | Model default | Override sequence length | +| `KD_CFG_PATH` | Auto-generated | Custom KD config YAML | +| `RUN_TAG` | Empty | Custom tag for output naming | -```bash -tensorboard --logdir /path/to/tensorboard/ --port 6006 --bind_all -``` +## Parallelism Settings -### Check Job Status +### Dense Models (e.g., Qwen3-8B) ```bash -squeue -u $USER # List your jobs -squeue -j # Check specific job -sacct -j --format=... # Job accounting info +export IS_MOE=false +export TP_SIZE=1 +export EP_SIZE=1 +export MBS=4 ``` -### Estimated Time +### MoE Models (e.g., Qwen3-30B-A3B) ```bash -squeue -j -o "%.18i %.9P %.30j %.8u %.2t %.10M %.10L %.6D %R" -# %.10L shows time left +export IS_MOE=true +export TP_SIZE=2 +export EP_SIZE=4 +export MBS=2 ``` -## Troubleshooting +**Note**: MoE models require loading both student and teacher models, which increases memory requirements significantly. -### OOM Errors +### GPU Requirements -1. **Reduce MBS**: Set `MBS=1` in config -2. **Increase EP**: For MoE, increase `EP_SIZE` (requires more nodes) -3. **Disable log-params-norm**: Set `LOG_PARAMS_NORM=0` in config +| Model | TP | EP | Nodes (4 GPU/node) | Total GPUs | +|-------|----|----|---------------------|------------| +| Qwen3-8B | 1 | 1 | 1 | 4-8 | +| Qwen3-30B-A3B | 2 | 4 | 2-4 | 8-16 | -### Rate Limiting (429 Errors) +## MoE Performance Optimizations -Use HuggingFace token: +For MoE models, the script automatically enables performance optimizations: +- `--moe-token-dispatcher-type alltoall` +- `--moe-shared-expert-overlap` +- `--moe-permute-fusion` +- `--moe-grouped-gemm` +- `--cross-entropy-loss-fusion` + +To disable (if causing issues): ```bash -sbatch sbatch_qwen_qad.sh --hf-token $HF_TOKEN --config ... +export ENABLE_MOE_PERF=0 ``` -### Shape Mismatch Errors - -Ensure teacher model config has correct GQA settings: +## Output Structure -```yaml -num_query_groups: 4 # For Qwen3-30B-A3B -kv_channels: 128 +``` +$QAD_CHECKPOINT_ROOT/ +├── -NVFP4-Teacher--Data--lr-minlr-decay