From 5023d5060f54805f67c33b053fb1e39675f12712 Mon Sep 17 00:00:00 2001
From: Cory Ye <cye@nvidia.com>
Date: Tue, 7 Apr 2026 10:14:55 -0700
Subject: [PATCH 1/5] Add Megatron-FSDP E2E integration test to TE CI/CD (L1).

Signed-off-by: Cory Ye <cye@nvidia.com>
---
 .../.gitignore                                |  2 +
 .../merges.txt                                |  1 +
 qa/L1_pytorch_mcore_fsdp_integration/test.sh  | 87 +++++++++++++++++++
 3 files changed, 90 insertions(+)
 create mode 100644 qa/L1_pytorch_mcore_fsdp_integration/.gitignore
 create mode 100644 qa/L1_pytorch_mcore_fsdp_integration/merges.txt
 create mode 100644 qa/L1_pytorch_mcore_fsdp_integration/test.sh

diff --git a/qa/L1_pytorch_mcore_fsdp_integration/.gitignore b/qa/L1_pytorch_mcore_fsdp_integration/.gitignore
new file mode 100644
index 0000000000..46426003ca
--- /dev/null
+++ b/qa/L1_pytorch_mcore_fsdp_integration/.gitignore
@@ -0,0 +1,2 @@
+Megatron-LM
+vocab.json
\ No newline at end of file
diff --git a/qa/L1_pytorch_mcore_fsdp_integration/merges.txt b/qa/L1_pytorch_mcore_fsdp_integration/merges.txt
new file mode 100644
index 0000000000..5e7f1fd949
--- /dev/null
+++ b/qa/L1_pytorch_mcore_fsdp_integration/merges.txt
@@ -0,0 +1 @@
+#version: 0.2
diff --git a/qa/L1_pytorch_mcore_fsdp_integration/test.sh b/qa/L1_pytorch_mcore_fsdp_integration/test.sh
new file mode 100644
index 0000000000..70e760fcba
--- /dev/null
+++ b/qa/L1_pytorch_mcore_fsdp_integration/test.sh
@@ -0,0 +1,87 @@
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+set -e
+
+# Paths
+: ${TE_PATH:=/opt/transformerengine}
+: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_fsdp_integration/Megatron-LM}
+
+# Download Megatron-LM if needed
+if [ ! -d "${MCORE_PATH}" ]; then
+    pushd $(dirname ${MCORE_PATH})
+    git clone -b core_v0.16.1 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
+    popd
+fi
+
+# Create mock vocab
+VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_fsdp_integration/vocab.json
+printf "" > ${VOCAB_FILE}
+printf "{" >> ${VOCAB_FILE}
+printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
+seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
+printf "}" >> ${VOCAB_FILE}
+
+# Megatron-LM command to run Megatron-FSDP.
+# TODO(@cspades): Megatron-Core 0.16.1 doesn't have the NCCL UBR / double-buffer
+# fix for wgrad accumulate fusion yet. Next version bump of Megatron-Core, add:
+# --use-nccl-ub
+# --fsdp-double-buffer
+# --fsdp-manual-registration
+COMMAND="
+NVTE_TORCH_COMPILE=0
+NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
+NVTE_FLASH_ATTN=1
+NVTE_FWD_LAYERNORM_SM_MARGIN=0
+NVTE_BWD_LAYERNORM_SM_MARGIN=0
+NVTE_BIAS_GELU_NVFUSION=0
+NVTE_BIAS_DROPOUT_FUSION=0
+unset CUDA_DEVICE_MAX_CONNECTIONS
+
+python3
+-m torch.distributed.launch
+--use_env
+--nnodes=1
+--nproc_per_node=4
+
+${MCORE_PATH}/pretrain_gpt.py
+--tensor-model-parallel-size 1
+--pipeline-model-parallel-size 1
+--use-cpu-initialization
+--num-layers 2
+--hidden-size 128
+--num-attention-heads 8
+--swiglu
+--seq-length 128
+--max-position-embeddings 128
+--micro-batch-size 1
+--global-batch-size 8
+--train-iters 10
+--eval-iters 10
+--lr 1e-4
+--mock-data
+--vocab-file ${VOCAB_FILE}
+--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_fsdp_integration/merges.txt
+--transformer-impl transformer_engine
+--use-megatron-fsdp
+--data-parallel-sharding-strategy optim_grads_params
+--use-distributed-optimizer
+--use-precision-aware-optimizer
+--num-distributed-optimizer-instances 2
+--outer-dp-sharding-strategy optim
+--fp8-format hybrid
+--fp8-param-gather
+--fp8-recipe tensorwise
+--cpu-offloading-num-layers 1
+--overlap-grad-reduce
+--overlap-param-gather
+--ckpt-format fsdp_dtensor
+--init-model-with-meta-device
+--bf16
+--grad-reduce-in-bf16
+"
+COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
+
+# Launch Megatron-LM
+bash -c "${COMMAND}"

From 4374e7f7d55d2320affba8850699fd7ef821ee4e Mon Sep 17 00:00:00 2001
From: Cory Ye <44509866+cspades@users.noreply.github.com>
Date: Tue, 7 Apr 2026 11:39:56 -0700
Subject: [PATCH 2/5] Update qa/L1_pytorch_mcore_fsdp_integration/test.sh

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
Signed-off-by: Cory Ye <44509866+cspades@users.noreply.github.com>
---
 qa/L1_pytorch_mcore_fsdp_integration/test.sh | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/qa/L1_pytorch_mcore_fsdp_integration/test.sh b/qa/L1_pytorch_mcore_fsdp_integration/test.sh
index 70e760fcba..39088d7a23 100644
--- a/qa/L1_pytorch_mcore_fsdp_integration/test.sh
+++ b/qa/L1_pytorch_mcore_fsdp_integration/test.sh
@@ -30,13 +30,13 @@ printf "}" >> ${VOCAB_FILE}
 # --fsdp-double-buffer
 # --fsdp-manual-registration
 COMMAND="
-NVTE_TORCH_COMPILE=0
-NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
-NVTE_FLASH_ATTN=1
-NVTE_FWD_LAYERNORM_SM_MARGIN=0
-NVTE_BWD_LAYERNORM_SM_MARGIN=0
-NVTE_BIAS_GELU_NVFUSION=0
-NVTE_BIAS_DROPOUT_FUSION=0
+export NVTE_TORCH_COMPILE=0
+export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
+export NVTE_FLASH_ATTN=1
+export NVTE_FWD_LAYERNORM_SM_MARGIN=0
+export NVTE_BWD_LAYERNORM_SM_MARGIN=0
+export NVTE_BIAS_GELU_NVFUSION=0
+export NVTE_BIAS_DROPOUT_FUSION=0
 unset CUDA_DEVICE_MAX_CONNECTIONS
 
 python3
@@ -81,7 +81,6 @@ ${MCORE_PATH}/pretrain_gpt.py
 --bf16
 --grad-reduce-in-bf16
 "
-COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
 
 # Launch Megatron-LM
 bash -c "${COMMAND}"

From dcd334ad4d9f92c2bf277303d8eccfc20566b40c Mon Sep 17 00:00:00 2001
From: Cory Ye <cye@nvidia.com>
Date: Tue, 7 Apr 2026 11:47:48 -0700
Subject: [PATCH 3/5] Invoke torch.distributed.launch directly instead of via bash -c.

Signed-off-by: Cory Ye <cye@nvidia.com>
---
 qa/L1_pytorch_mcore_fsdp_integration/test.sh | 100 +++++++++----------
 1 file changed, 48 insertions(+), 52 deletions(-)

diff --git a/qa/L1_pytorch_mcore_fsdp_integration/test.sh b/qa/L1_pytorch_mcore_fsdp_integration/test.sh
index 39088d7a23..bc9f19934c 100644
--- a/qa/L1_pytorch_mcore_fsdp_integration/test.sh
+++ b/qa/L1_pytorch_mcore_fsdp_integration/test.sh
@@ -23,13 +23,9 @@ printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
 seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
 printf "}" >> ${VOCAB_FILE}
 
-# Megatron-LM command to run Megatron-FSDP.
-# TODO(@cspades): Megatron-Core 0.16.1 doesn't have the NCCL UBR / double-buffer
-# fix for wgrad accumulate fusion yet. Next version bump of Megatron-Core, add:
-# --use-nccl-ub
-# --fsdp-double-buffer
-# --fsdp-manual-registration
-COMMAND="
+# Setting CUDA_DEVICE_MAX_CONNECTIONS limits
+# Megatron-FSDP stream parallelism.
+unset CUDA_DEVICE_MAX_CONNECTIONS
 export NVTE_TORCH_COMPILE=0
 export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
 export NVTE_FLASH_ATTN=1
@@ -37,50 +33,50 @@ export NVTE_FWD_LAYERNORM_SM_MARGIN=0
 export NVTE_BWD_LAYERNORM_SM_MARGIN=0
 export NVTE_BIAS_GELU_NVFUSION=0
 export NVTE_BIAS_DROPOUT_FUSION=0
-unset CUDA_DEVICE_MAX_CONNECTIONS
 
-python3
--m torch.distributed.launch
---use_env
---nnodes=1
---nproc_per_node=4
-
-${MCORE_PATH}/pretrain_gpt.py
---tensor-model-parallel-size 1
---pipeline-model-parallel-size 1
---use-cpu-initialization
---num-layers 2
---hidden-size 128
---num-attention-heads 8
---swiglu
---seq-length 128
---max-position-embeddings 128
---micro-batch-size 1
---global-batch-size 8
---train-iters 10
---eval-iters 10
---lr 1e-4
---mock-data
---vocab-file ${VOCAB_FILE}
---merge-file ${TE_PATH}/qa/L1_pytorch_mcore_fsdp_integration/merges.txt
---transformer-impl transformer_engine
---use-megatron-fsdp
---data-parallel-sharding-strategy optim_grads_params
---use-distributed-optimizer
---use-precision-aware-optimizer
---num-distributed-optimizer-instances 2
---outer-dp-sharding-strategy optim
---fp8-format hybrid
---fp8-param-gather
---fp8-recipe tensorwise
---cpu-offloading-num-layers 1
---overlap-grad-reduce
---overlap-param-gather
---ckpt-format fsdp_dtensor
---init-model-with-meta-device
---bf16
+# Megatron-LM command to run Megatron-FSDP.
+# TODO(@cspades): Megatron-Core 0.16.1 doesn't have the NCCL UBR / double-buffer
+# fix for wgrad accumulate fusion yet. Next version bump of Megatron-Core, add:
+# --use-nccl-ub
+# --fsdp-double-buffer
+# --fsdp-manual-registration
+python3 \
+-m torch.distributed.launch \
+--use_env \
+--nnodes=1 \
+--nproc_per_node=4 \
+${MCORE_PATH}/pretrain_gpt.py \
+--tensor-model-parallel-size 1 \
+--pipeline-model-parallel-size 1 \
+--use-cpu-initialization \
+--num-layers 2 \
+--hidden-size 128 \
+--num-attention-heads 8 \
+--swiglu \
+--seq-length 128 \
+--max-position-embeddings 128 \
+--micro-batch-size 1 \
+--global-batch-size 8 \
+--train-iters 10 \
+--eval-iters 10 \
+--lr 1e-4 \
+--mock-data \
+--vocab-file ${VOCAB_FILE} \
+--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_fsdp_integration/merges.txt \
+--transformer-impl transformer_engine \
+--use-megatron-fsdp \
+--data-parallel-sharding-strategy optim_grads_params \
+--use-distributed-optimizer \
+--use-precision-aware-optimizer \
+--num-distributed-optimizer-instances 2 \
+--outer-dp-sharding-strategy optim \
+--fp8-format hybrid \
+--fp8-param-gather \
+--fp8-recipe tensorwise \
+--cpu-offloading-num-layers 1 \
+--overlap-grad-reduce \
+--overlap-param-gather \
+--ckpt-format fsdp_dtensor \
+--init-model-with-meta-device \
+--bf16 \
 --grad-reduce-in-bf16
-"
-
-# Launch Megatron-LM
-bash -c "${COMMAND}"

From e52169463fb5cbee92a7626ffaf68bc9de1b737d Mon Sep 17 00:00:00 2001
From: Cory Ye <cye@nvidia.com>
Date: Tue, 7 Apr 2026 14:39:13 -0700
Subject: [PATCH 4/5] Pin Megatron-LM commit, enable NCCL UB and MXFP8, disable V1 offload.

Signed-off-by: Cory Ye <cye@nvidia.com>
---
 qa/L1_pytorch_mcore_fsdp_integration/test.sh | 22 +++++++++++++-------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/qa/L1_pytorch_mcore_fsdp_integration/test.sh b/qa/L1_pytorch_mcore_fsdp_integration/test.sh
index bc9f19934c..14dcdc3355 100644
--- a/qa/L1_pytorch_mcore_fsdp_integration/test.sh
+++ b/qa/L1_pytorch_mcore_fsdp_integration/test.sh
@@ -11,7 +11,10 @@ set -e
 # Download Megatron-LM if needed
 if [ ! -d "${MCORE_PATH}" ]; then
     pushd $(dirname ${MCORE_PATH})
-    git clone -b core_v0.16.1 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
+    git clone https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
+    # Pin to the Megatron-LM main commit of Apr. 7, 2026 (wgrad accumulate fusion + Megatron-FSDP
+    # NCCL UBR support). Use `git -C`: `set -e` ignores a failed checkout inside an `&&` chain.
+    git -C Megatron-LM checkout 8cbc45b6e039f300c53eb09579fc973d703455cd
     popd
 fi
 
@@ -34,17 +37,17 @@ export NVTE_BWD_LAYERNORM_SM_MARGIN=0
 export NVTE_BIAS_GELU_NVFUSION=0
 export NVTE_BIAS_DROPOUT_FUSION=0
 
+# V1 offloading has bugs that are exposed by Megatron-FSDP, so this test
+# validates the new offloading code instead. Explicitly disable V1, which
+# is the Megatron-LM default.
+export NVTE_CPU_OFFLOAD_V1=0
+
 # Megatron-LM command to run Megatron-FSDP.
-# TODO(@cspades): Megatron-Core 0.16.1 doesn't have the NCCL UBR / double-buffer
-# fix for wgrad accumulate fusion yet. Next version bump of Megatron-Core, add:
-# --use-nccl-ub
-# --fsdp-double-buffer
-# --fsdp-manual-registration
 python3 \
 -m torch.distributed.launch \
 --use_env \
 --nnodes=1 \
---nproc_per_node=4 \
+--nproc_per_node=$(nvidia-smi -L | wc -l) \
 ${MCORE_PATH}/pretrain_gpt.py \
 --tensor-model-parallel-size 1 \
 --pipeline-model-parallel-size 1 \
@@ -70,9 +73,12 @@ ${MCORE_PATH}/pretrain_gpt.py \
 --use-precision-aware-optimizer \
 --num-distributed-optimizer-instances 2 \
 --outer-dp-sharding-strategy optim \
+--use-nccl-ub \
+--fsdp-double-buffer \
+--fsdp-manual-registration \
 --fp8-format hybrid \
 --fp8-param-gather \
---fp8-recipe tensorwise \
+--fp8-recipe mxfp8 \
 --cpu-offloading-num-layers 1 \
 --overlap-grad-reduce \
 --overlap-param-gather \

From fce5369336f32d84a0cc7e10e0d88709b03ea835 Mon Sep 17 00:00:00 2001
From: Cory Ye <cye@nvidia.com>
Date: Tue, 7 Apr 2026 16:18:23 -0700
Subject: [PATCH 5/5] Remove --use-cpu-initialization, add --eval-interval.

Signed-off-by: Cory Ye <cye@nvidia.com>
---
 qa/L1_pytorch_mcore_fsdp_integration/test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qa/L1_pytorch_mcore_fsdp_integration/test.sh b/qa/L1_pytorch_mcore_fsdp_integration/test.sh
index 14dcdc3355..ea54bcccce 100644
--- a/qa/L1_pytorch_mcore_fsdp_integration/test.sh
+++ b/qa/L1_pytorch_mcore_fsdp_integration/test.sh
@@ -51,7 +51,6 @@ python3 \
 ${MCORE_PATH}/pretrain_gpt.py \
 --tensor-model-parallel-size 1 \
 --pipeline-model-parallel-size 1 \
---use-cpu-initialization \
 --num-layers 2 \
 --hidden-size 128 \
 --num-attention-heads 8 \
@@ -62,6 +61,7 @@ ${MCORE_PATH}/pretrain_gpt.py \
 --global-batch-size 8 \
 --train-iters 10 \
 --eval-iters 10 \
+--eval-interval 100 \
 --lr 1e-4 \
 --mock-data \
 --vocab-file ${VOCAB_FILE} \