From 5023d5060f54805f67c33b053fb1e39675f12712 Mon Sep 17 00:00:00 2001 From: Cory Ye Date: Tue, 7 Apr 2026 10:14:55 -0700 Subject: [PATCH 1/5] Add Megatron-FSDP E2E integration test to TE CI/CD (L1). Signed-off-by: Cory Ye --- .../.gitignore | 2 + .../merges.txt | 1 + qa/L1_pytorch_mcore_fsdp_integration/test.sh | 87 +++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 qa/L1_pytorch_mcore_fsdp_integration/.gitignore create mode 100644 qa/L1_pytorch_mcore_fsdp_integration/merges.txt create mode 100644 qa/L1_pytorch_mcore_fsdp_integration/test.sh diff --git a/qa/L1_pytorch_mcore_fsdp_integration/.gitignore b/qa/L1_pytorch_mcore_fsdp_integration/.gitignore new file mode 100644 index 0000000000..46426003ca --- /dev/null +++ b/qa/L1_pytorch_mcore_fsdp_integration/.gitignore @@ -0,0 +1,2 @@ +Megatron-LM +vocab.json \ No newline at end of file diff --git a/qa/L1_pytorch_mcore_fsdp_integration/merges.txt b/qa/L1_pytorch_mcore_fsdp_integration/merges.txt new file mode 100644 index 0000000000..5e7f1fd949 --- /dev/null +++ b/qa/L1_pytorch_mcore_fsdp_integration/merges.txt @@ -0,0 +1 @@ +#version: 0.2 diff --git a/qa/L1_pytorch_mcore_fsdp_integration/test.sh b/qa/L1_pytorch_mcore_fsdp_integration/test.sh new file mode 100644 index 0000000000..70e760fcba --- /dev/null +++ b/qa/L1_pytorch_mcore_fsdp_integration/test.sh @@ -0,0 +1,87 @@ +# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +set -e + +# Paths +: ${TE_PATH:=/opt/transformerengine} +: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_fsdp_integration/Megatron-LM} + +# Download Megatron-LM if needed +if [ ! -d "${MCORE_PATH}" ]; then + pushd $(dirname ${MCORE_PATH}) + git clone -b core_v0.16.1 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM + popd +fi + +# Create mock vocab +VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_fsdp_integration/vocab.json +printf "" > ${VOCAB_FILE} +printf "{" >> ${VOCAB_FILE} +printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE} +seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE} +printf "}" >> ${VOCAB_FILE} + +# Megatron-LM command to run Megatron-FSDP. +# TODO(@cspades): Megatron-Core 0.16.1 doesn't have the NCCL UBR / double-buffer +# fix for wgrad accumulate fusion yet. Next version bump of Megatron-Core, add: +# --use-nccl-ub +# --fsdp-double-buffer +# --fsdp-manual-registration +COMMAND=" +NVTE_TORCH_COMPILE=0 +NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 +NVTE_FLASH_ATTN=1 +NVTE_FWD_LAYERNORM_SM_MARGIN=0 +NVTE_BWD_LAYERNORM_SM_MARGIN=0 +NVTE_BIAS_GELU_NVFUSION=0 +NVTE_BIAS_DROPOUT_FUSION=0 +unset CUDA_DEVICE_MAX_CONNECTIONS + +python3 +-m torch.distributed.launch +--use_env +--nnodes=1 +--nproc_per_node=4 + +${MCORE_PATH}/pretrain_gpt.py +--tensor-model-parallel-size 1 +--pipeline-model-parallel-size 1 +--use-cpu-initialization +--num-layers 2 +--hidden-size 128 +--num-attention-heads 8 +--swiglu +--seq-length 128 +--max-position-embeddings 128 +--micro-batch-size 1 +--global-batch-size 8 +--train-iters 10 +--eval-iters 10 +--lr 1e-4 +--mock-data +--vocab-file ${VOCAB_FILE} +--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_fsdp_integration/merges.txt +--transformer-impl transformer_engine +--use-megatron-fsdp +--data-parallel-sharding-strategy optim_grads_params +--use-distributed-optimizer +--use-precision-aware-optimizer +--num-distributed-optimizer-instances 2 +--outer-dp-sharding-strategy optim +--fp8-format hybrid +--fp8-param-gather +--fp8-recipe tensorwise +--cpu-offloading-num-layers 1 +--overlap-grad-reduce +--overlap-param-gather +--ckpt-format fsdp_dtensor +--init-model-with-meta-device +--bf16 +--grad-reduce-in-bf16 +" +COMMAND=$(echo "${COMMAND}" | tr '\n' ' ') + +# Launch Megatron-LM +bash -c "${COMMAND}" From 4374e7f7d55d2320affba8850699fd7ef821ee4e Mon Sep 17 00:00:00 2001 From: Cory Ye <44509866+cspades@users.noreply.github.com> Date: Tue, 7 Apr 2026 11:39:56 -0700 Subject: [PATCH 2/5] Update qa/L1_pytorch_mcore_fsdp_integration/test.sh Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> Signed-off-by: Cory Ye <44509866+cspades@users.noreply.github.com> --- qa/L1_pytorch_mcore_fsdp_integration/test.sh | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/qa/L1_pytorch_mcore_fsdp_integration/test.sh b/qa/L1_pytorch_mcore_fsdp_integration/test.sh index 70e760fcba..39088d7a23 100644 --- a/qa/L1_pytorch_mcore_fsdp_integration/test.sh +++ b/qa/L1_pytorch_mcore_fsdp_integration/test.sh @@ -30,13 +30,13 @@ printf "}" >> ${VOCAB_FILE} # --fsdp-double-buffer # --fsdp-manual-registration COMMAND=" -NVTE_TORCH_COMPILE=0 -NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 -NVTE_FLASH_ATTN=1 -NVTE_FWD_LAYERNORM_SM_MARGIN=0 -NVTE_BWD_LAYERNORM_SM_MARGIN=0 -NVTE_BIAS_GELU_NVFUSION=0 -NVTE_BIAS_DROPOUT_FUSION=0 +export NVTE_TORCH_COMPILE=0 +export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 +export NVTE_FLASH_ATTN=1 +export NVTE_FWD_LAYERNORM_SM_MARGIN=0 +export NVTE_BWD_LAYERNORM_SM_MARGIN=0 +export NVTE_BIAS_GELU_NVFUSION=0 +export NVTE_BIAS_DROPOUT_FUSION=0 unset CUDA_DEVICE_MAX_CONNECTIONS python3 @@ -81,7 +81,6 @@ ${MCORE_PATH}/pretrain_gpt.py --bf16 --grad-reduce-in-bf16 " -COMMAND=$(echo "${COMMAND}" | tr '\n' ' ') # Launch Megatron-LM bash -c "${COMMAND}" From dcd334ad4d9f92c2bf277303d8eccfc20566b40c Mon Sep 17 00:00:00 2001 From: Cory Ye Date: Tue, 7 Apr 2026 11:47:48 -0700 Subject: [PATCH 3/5] Explicit torchrun invoke. Signed-off-by: Cory Ye --- qa/L1_pytorch_mcore_fsdp_integration/test.sh | 100 +++++++++---------- 1 file changed, 48 insertions(+), 52 deletions(-) diff --git a/qa/L1_pytorch_mcore_fsdp_integration/test.sh b/qa/L1_pytorch_mcore_fsdp_integration/test.sh index 39088d7a23..bc9f19934c 100644 --- a/qa/L1_pytorch_mcore_fsdp_integration/test.sh +++ b/qa/L1_pytorch_mcore_fsdp_integration/test.sh @@ -23,13 +23,9 @@ printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE} seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE} printf "}" >> ${VOCAB_FILE} -# Megatron-LM command to run Megatron-FSDP. -# TODO(@cspades): Megatron-Core 0.16.1 doesn't have the NCCL UBR / double-buffer -# fix for wgrad accumulate fusion yet. Next version bump of Megatron-Core, add: -# --use-nccl-ub -# --fsdp-double-buffer -# --fsdp-manual-registration -COMMAND=" +# Setting CUDA_DEVICE_MAX_CONNECTIONS limits +# Megatron-FSDP stream parallelism. +unset CUDA_DEVICE_MAX_CONNECTIONS export NVTE_TORCH_COMPILE=0 export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 export NVTE_FLASH_ATTN=1 @@ -37,50 +33,50 @@ export NVTE_FWD_LAYERNORM_SM_MARGIN=0 export NVTE_BWD_LAYERNORM_SM_MARGIN=0 export NVTE_BIAS_GELU_NVFUSION=0 export NVTE_BIAS_DROPOUT_FUSION=0 -unset CUDA_DEVICE_MAX_CONNECTIONS -python3 --m torch.distributed.launch ---use_env ---nnodes=1 ---nproc_per_node=4 - -${MCORE_PATH}/pretrain_gpt.py ---tensor-model-parallel-size 1 ---pipeline-model-parallel-size 1 ---use-cpu-initialization ---num-layers 2 ---hidden-size 128 ---num-attention-heads 8 ---swiglu ---seq-length 128 ---max-position-embeddings 128 ---micro-batch-size 1 ---global-batch-size 8 ---train-iters 10 ---eval-iters 10 ---lr 1e-4 ---mock-data ---vocab-file ${VOCAB_FILE} ---merge-file ${TE_PATH}/qa/L1_pytorch_mcore_fsdp_integration/merges.txt ---transformer-impl transformer_engine ---use-megatron-fsdp ---data-parallel-sharding-strategy optim_grads_params ---use-distributed-optimizer ---use-precision-aware-optimizer ---num-distributed-optimizer-instances 2 ---outer-dp-sharding-strategy optim ---fp8-format hybrid ---fp8-param-gather ---fp8-recipe tensorwise ---cpu-offloading-num-layers 1 ---overlap-grad-reduce ---overlap-param-gather ---ckpt-format fsdp_dtensor ---init-model-with-meta-device ---bf16 +# Megatron-LM command to run Megatron-FSDP. +# TODO(@cspades): Megatron-Core 0.16.1 doesn't have the NCCL UBR / double-buffer +# fix for wgrad accumulate fusion yet. Next version bump of Megatron-Core, add: +# --use-nccl-ub +# --fsdp-double-buffer +# --fsdp-manual-registration +python3 \ +-m torch.distributed.launch \ +--use_env \ +--nnodes=1 \ +--nproc_per_node=4 \ +${MCORE_PATH}/pretrain_gpt.py \ +--tensor-model-parallel-size 1 \ +--pipeline-model-parallel-size 1 \ +--use-cpu-initialization \ +--num-layers 2 \ +--hidden-size 128 \ +--num-attention-heads 8 \ +--swiglu \ +--seq-length 128 \ +--max-position-embeddings 128 \ +--micro-batch-size 1 \ +--global-batch-size 8 \ +--train-iters 10 \ +--eval-iters 10 \ +--lr 1e-4 \ +--mock-data \ +--vocab-file ${VOCAB_FILE} \ +--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_fsdp_integration/merges.txt \ +--transformer-impl transformer_engine \ +--use-megatron-fsdp \ +--data-parallel-sharding-strategy optim_grads_params \ +--use-distributed-optimizer \ +--use-precision-aware-optimizer \ +--num-distributed-optimizer-instances 2 \ +--outer-dp-sharding-strategy optim \ +--fp8-format hybrid \ +--fp8-param-gather \ +--fp8-recipe tensorwise \ +--cpu-offloading-num-layers 1 \ +--overlap-grad-reduce \ +--overlap-param-gather \ +--ckpt-format fsdp_dtensor \ +--init-model-with-meta-device \ +--bf16 \ --grad-reduce-in-bf16 -" - -# Launch Megatron-LM -bash -c "${COMMAND}" From e52169463fb5cbee92a7626ffaf68bc9de1b737d Mon Sep 17 00:00:00 2001 From: Cory Ye Date: Tue, 7 Apr 2026 14:39:13 -0700 Subject: [PATCH 4/5] Edits. Signed-off-by: Cory Ye --- qa/L1_pytorch_mcore_fsdp_integration/test.sh | 22 +++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/qa/L1_pytorch_mcore_fsdp_integration/test.sh b/qa/L1_pytorch_mcore_fsdp_integration/test.sh index bc9f19934c..14dcdc3355 100644 --- a/qa/L1_pytorch_mcore_fsdp_integration/test.sh +++ b/qa/L1_pytorch_mcore_fsdp_integration/test.sh @@ -11,7 +11,10 @@ set -e # Download Megatron-LM if needed if [ ! -d "${MCORE_PATH}" ]; then pushd $(dirname ${MCORE_PATH}) - git clone -b core_v0.16.1 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM + git clone https://github.com/NVIDIA/Megatron-LM.git Megatron-LM + # Megatron-LM / Megatron-FSDP commit for main branch on Apr. 7, 2026. + # Necessary to support wgrad accumulate fusion and Megatron-FSDP NCCL UBR. + pushd Megatron-LM && git checkout 8cbc45b6e039f300c53eb09579fc973d703455cd && popd popd fi @@ -34,17 +37,17 @@ export NVTE_BWD_LAYERNORM_SM_MARGIN=0 export NVTE_BIAS_GELU_NVFUSION=0 export NVTE_BIAS_DROPOUT_FUSION=0 +# V1 offloading has bugs that are exposed by Megatron-FSDP. +# This test will focus on validating the new offloading code. +# Un-set the Megatron-LM default of V1. +export NVTE_CPU_OFFLOAD_V1=0 + # Megatron-LM command to run Megatron-FSDP. -# TODO(@cspades): Megatron-Core 0.16.1 doesn't have the NCCL UBR / double-buffer -# fix for wgrad accumulate fusion yet. Next version bump of Megatron-Core, add: -# --use-nccl-ub -# --fsdp-double-buffer -# --fsdp-manual-registration python3 \ -m torch.distributed.launch \ --use_env \ --nnodes=1 \ ---nproc_per_node=4 \ +--nproc_per_node=$(nvidia-smi -L | wc -l) \ ${MCORE_PATH}/pretrain_gpt.py \ --tensor-model-parallel-size 1 \ --pipeline-model-parallel-size 1 \ @@ -70,9 +73,12 @@ ${MCORE_PATH}/pretrain_gpt.py \ --use-precision-aware-optimizer \ --num-distributed-optimizer-instances 2 \ --outer-dp-sharding-strategy optim \ +--use-nccl-ub \ +--fsdp-double-buffer \ +--fsdp-manual-registration \ --fp8-format hybrid \ --fp8-param-gather \ ---fp8-recipe tensorwise \ +--fp8-recipe mxfp8 \ --cpu-offloading-num-layers 1 \ --overlap-grad-reduce \ --overlap-param-gather \ From fce5369336f32d84a0cc7e10e0d88709b03ea835 Mon Sep 17 00:00:00 2001 From: Cory Ye Date: Tue, 7 Apr 2026 16:18:23 -0700 Subject: [PATCH 5/5] Remove CPU initialization, add FW args. Signed-off-by: Cory Ye --- qa/L1_pytorch_mcore_fsdp_integration/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L1_pytorch_mcore_fsdp_integration/test.sh b/qa/L1_pytorch_mcore_fsdp_integration/test.sh index 14dcdc3355..ea54bcccce 100644 --- a/qa/L1_pytorch_mcore_fsdp_integration/test.sh +++ b/qa/L1_pytorch_mcore_fsdp_integration/test.sh @@ -51,7 +51,6 @@ python3 \ ${MCORE_PATH}/pretrain_gpt.py \ --tensor-model-parallel-size 1 \ --pipeline-model-parallel-size 1 \ ---use-cpu-initialization \ --num-layers 2 \ --hidden-size 128 \ --num-attention-heads 8 \ @@ -62,6 +61,7 @@ ${MCORE_PATH}/pretrain_gpt.py \ --global-batch-size 8 \ --train-iters 10 \ --eval-iters 10 \ +--eval-interval 100 \ --lr 1e-4 \ --mock-data \ --vocab-file ${VOCAB_FILE} \