diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
index f2b0b07fed..2d4d80967b 100644
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -7,60 +7,153 @@ function error_exit() {
     exit 1
 }
 
+: ${TE_PATH:=/opt/transformerengine}
+: ${XML_LOG_DIR:=/logs}
+mkdir -p "$XML_LOG_DIR"
+
+set -x
+
+pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
+
+# ── Parallel test infrastructure ────────────────────────────────────────────
+# Detect GPUs and run tests in parallel waves (one test per GPU per wave).
+# With 1 GPU, behavior is identical to sequential execution.
+
+FAIL_DIR=$(mktemp -d)
+
 function test_fail() {
-    RET=1
-    FAILED_CASES="$FAILED_CASES $1"
+    echo "$1" >> "$FAIL_DIR/failures"
     echo "Error: sub-test failed: $1"
 }
 
-RET=0
-FAILED_CASES=""
+# Detect available GPUs
+if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
+    IFS=',' read -ra GPU_LIST <<< "$CUDA_VISIBLE_DEVICES"
+    NUM_GPUS=${#GPU_LIST[@]}
+else
+    NUM_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l)
+    # wc -l prints 0 (never an empty string) when nvidia-smi is absent or
+    # fails, so a ${NUM_GPUS:-1} default can never fire; clamp explicitly to
+    # avoid a divide-by-zero in the gpu_id modulo inside run_test.
+    if [ "$NUM_GPUS" -lt 1 ]; then NUM_GPUS=1; fi
+    GPU_LIST=()
+    for ((i = 0; i < NUM_GPUS; i++)); do
+        GPU_LIST+=("$i")
+    done
+fi
+
+WAVE_COUNT=0
+
+# run_test ENV_PREFIX XML_NAME TEST_PATH FAIL_LABEL
+# Dispatches one pytest invocation onto the next GPU slot of the current wave.
+# NOTE(review): the original patch text was truncated by extraction inside
+# this function; the body below is reconstructed from the visible call sites
+# and the surviving wave/wait logic — confirm against the upstream patch.
+run_test() {
+    local env_prefix="$1" xml_name="$2" test_path="$3" fail_label="$4"
+    local gpu_id=$((WAVE_COUNT % NUM_GPUS))
+    WAVE_COUNT=$((WAVE_COUNT + 1))
+    if [ "$NUM_GPUS" -le 1 ]; then
+        # Single GPU: run in the foreground, matching the old sequential flow.
+        # eval is required so env_prefix assignments apply to the command; all
+        # arguments are fixed, trusted strings defined in this script.
+        eval "${env_prefix} python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/${xml_name} ${test_path}" \
+            || test_fail "$fail_label"
+    else
+        echo ">>> Starting: ${fail_label} on GPU ${GPU_LIST[$gpu_id]}"
+        (
+            eval "CUDA_VISIBLE_DEVICES=${GPU_LIST[$gpu_id]} ${env_prefix} python3 -u -m pytest --tb=auto --junitxml=$XML_LOG_DIR/${xml_name} ${test_path}" \
+                > "$XML_LOG_DIR/${xml_name%.xml}.log" 2>&1 \
+                || test_fail "$fail_label"
+            echo ">>> Finished: ${fail_label} on GPU ${GPU_LIST[$gpu_id]}"
+        ) &
+    fi
+
+    # When we've filled all GPUs, wait for the wave to complete
+    if [ "$WAVE_COUNT" -ge "$NUM_GPUS" ] && [ "$NUM_GPUS" -gt 1 ]; then
+        wait
+        WAVE_COUNT=0
+    fi
+}
+
+# ── Checkpoint pre-step (must run before test_checkpoint.py) ────────────────
 
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "test_sanity.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_PATH/tests/pytorch/test_recipe.py || test_fail "test_recipe.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_deferred_init.xml $TE_PATH/tests/pytorch/test_deferred_init.py || test_fail "test_deferred_init.py"
-PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/test_numerics.py || test_fail "test_numerics.py"
-PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cuda_graphs.xml $TE_PATH/tests/pytorch/test_cuda_graphs.py || test_fail "test_cuda_graphs.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_jit.xml $TE_PATH/tests/pytorch/test_jit.py || test_fail "test_jit.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_rope.xml $TE_PATH/tests/pytorch/test_fused_rope.py || test_fail "test_fused_rope.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_nvfp4.xml $TE_PATH/tests/pytorch/nvfp4 || test_fail "test_nvfp4"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_mxfp8.xml $TE_PATH/tests/pytorch/mxfp8 || test_fail "test_mxfp8"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_quantized_tensor.xml $TE_PATH/tests/pytorch/test_quantized_tensor.py || test_fail "test_quantized_tensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8blockwisetensor.xml $TE_PATH/tests/pytorch/test_float8blockwisetensor.py || test_fail "test_float8blockwisetensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py || test_fail "test_float8_blockwise_scaling_exact.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/test_grouped_tensor.xml $TE_PATH/tests/pytorch/test_grouped_tensor.py || test_fail "test_grouped_tensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/test_fusible_ops.py || test_fail "test_fusible_ops.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_permutation.xml $TE_PATH/tests/pytorch/test_permutation.py || test_fail "test_permutation.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entropy.xml $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py || test_fail "test_parallel_cross_entropy.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading.xml $TE_PATH/tests/pytorch/test_cpu_offloading.py || test_fail "test_cpu_offloading.py"
-NVTE_FLASH_ATTN=0 NVTE_CPU_OFFLOAD_V1=1 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading_v1.xml $TE_PATH/tests/pytorch/test_cpu_offloading_v1.py || test_fail "test_cpu_offloading_v1.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_attention.xml $TE_PATH/tests/pytorch/attention/test_attention.py || test_fail "test_attention.py"
-NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_attention_deterministic.xml $TE_PATH/tests/pytorch/attention/test_attention.py || test_fail "NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 test_attention.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_kv_cache.xml $TE_PATH/tests/pytorch/attention/test_kv_cache.py || test_fail "test_kv_cache.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_hf_integration.xml $TE_PATH/tests/pytorch/test_hf_integration.py || test_fail "test_hf_integration.py"
 export NVTE_TEST_CHECKPOINT_ARTIFACT_PATH=$TE_PATH/artifacts/tests/pytorch/test_checkpoint
 if [ ! -d "$NVTE_TEST_CHECKPOINT_ARTIFACT_PATH" ]; then
-    python3 $TE_PATH/tests/pytorch/test_checkpoint.py --save-checkpoint all || error_exit "Failed to generate checkpoint files"
+    python3 $TE_PATH/tests/pytorch/test_checkpoint.py --save-checkpoint all \
+        || error_exit "Failed to generate checkpoint files"
 fi
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_checkpoint.xml $TE_PATH/tests/pytorch/test_checkpoint.py || test_fail "test_checkpoint.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_router.xml $TE_PATH/tests/pytorch/test_fused_router.py || test_fail "test_fused_router.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_partial_cast.xml $TE_PATH/tests/pytorch/test_partial_cast.py || test_fail "test_partial_cast.py"
-if [ "$RET" -ne 0 ]; then
-    echo "Error in the following test cases:$FAILED_CASES"
+# ── Tests ───────────────────────────────────────────────────────────────────
+# Each run_test call: env_prefix, xml_name, test_path, fail_label
+# Tests are dispatched in waves of NUM_GPUS, one per GPU.
+
+# NOTE(review): two DEBUG injections explicitly marked "remove before merging"
+# were dropped here; one of them had *replaced* the real test_sanity.py run
+# with an injected RuntimeError, so the sanity suite was never executed.
+run_test "" "pytest_test_sanity.xml" "$TE_PATH/tests/pytorch/test_sanity.py" "test_sanity.py"
+run_test "" "pytest_test_recipe.xml" "$TE_PATH/tests/pytorch/test_recipe.py" "test_recipe.py"
+run_test "" "pytest_test_deferred_init.xml" "$TE_PATH/tests/pytorch/test_deferred_init.py" "test_deferred_init.py"
+run_test "PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0" "pytest_test_numerics.xml" "$TE_PATH/tests/pytorch/test_numerics.py" "test_numerics.py"
+run_test "PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0" "pytest_test_cuda_graphs.xml" "$TE_PATH/tests/pytorch/test_cuda_graphs.py" "test_cuda_graphs.py"
+run_test "" "pytest_test_jit.xml" "$TE_PATH/tests/pytorch/test_jit.py" "test_jit.py"
+run_test "" "pytest_test_fused_rope.xml" "$TE_PATH/tests/pytorch/test_fused_rope.py" "test_fused_rope.py"
+run_test "" "pytest_test_nvfp4.xml" "$TE_PATH/tests/pytorch/nvfp4" "test_nvfp4"
+run_test "" "pytest_test_mxfp8.xml" "$TE_PATH/tests/pytorch/mxfp8" "test_mxfp8"
+run_test "" "pytest_test_quantized_tensor.xml" "$TE_PATH/tests/pytorch/test_quantized_tensor.py" "test_quantized_tensor.py"
+run_test "" "pytest_test_float8blockwisetensor.xml" "$TE_PATH/tests/pytorch/test_float8blockwisetensor.py" "test_float8blockwisetensor.py"
+run_test "" "pytest_test_float8_blockwise_scaling_exact.xml" "$TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py" "test_float8_blockwise_scaling_exact.py"
+run_test "" "pytest_test_float8_blockwise_gemm_exact.xml" "$TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py" "test_float8_blockwise_gemm_exact.py"
+# NOTE(review): XML name below lacks the pytest_ prefix used everywhere else;
+# kept as-is (matches the pre-patch script) in case CI keys on the artifact name.
+run_test "" "test_grouped_tensor.xml" "$TE_PATH/tests/pytorch/test_grouped_tensor.py" "test_grouped_tensor.py"
+run_test "" "pytest_test_gqa.xml" "$TE_PATH/tests/pytorch/test_gqa.py" "test_gqa.py"
+run_test "" "pytest_test_fused_optimizer.xml" "$TE_PATH/tests/pytorch/test_fused_optimizer.py" "test_fused_optimizer.py"
+run_test "" "pytest_test_multi_tensor.xml" "$TE_PATH/tests/pytorch/test_multi_tensor.py" "test_multi_tensor.py"
+run_test "" "pytest_test_fusible_ops.xml" "$TE_PATH/tests/pytorch/test_fusible_ops.py" "test_fusible_ops.py"
+run_test "" "pytest_test_permutation.xml" "$TE_PATH/tests/pytorch/test_permutation.py" "test_permutation.py"
+run_test "" "pytest_test_parallel_cross_entropy.xml" "$TE_PATH/tests/pytorch/test_parallel_cross_entropy.py" "test_parallel_cross_entropy.py"
+run_test "" "pytest_test_cpu_offloading.xml" "$TE_PATH/tests/pytorch/test_cpu_offloading.py" "test_cpu_offloading.py"
+run_test "NVTE_FLASH_ATTN=0 NVTE_CPU_OFFLOAD_V1=1" "pytest_test_cpu_offloading_v1.xml" "$TE_PATH/tests/pytorch/test_cpu_offloading_v1.py" "test_cpu_offloading_v1.py"
+run_test "" "pytest_test_attention.xml" "$TE_PATH/tests/pytorch/attention/test_attention.py" "test_attention.py"
+run_test "NVTE_ALLOW_NONDETERMINISTIC_ALGO=0" "pytest_test_attention_deterministic.xml" "$TE_PATH/tests/pytorch/attention/test_attention.py" "NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 test_attention.py"
+run_test "" "pytest_test_kv_cache.xml" "$TE_PATH/tests/pytorch/attention/test_kv_cache.py" "test_kv_cache.py"
+run_test "" "pytest_test_hf_integration.xml" "$TE_PATH/tests/pytorch/test_hf_integration.py" "test_hf_integration.py"
+run_test "" "pytest_test_checkpoint.xml" "$TE_PATH/tests/pytorch/test_checkpoint.py" "test_checkpoint.py"
+run_test "" "pytest_test_fused_router.xml" "$TE_PATH/tests/pytorch/test_fused_router.py" "test_fused_router.py"
+run_test "" "pytest_test_partial_cast.xml" "$TE_PATH/tests/pytorch/test_partial_cast.py" "test_partial_cast.py"
+
+# ── Wait for remaining background jobs ──────────────────────────────────────
+
+if [ "$NUM_GPUS" -gt 1 ]; then
+    wait
+fi
+
+# ── Replay per-test logs into trace ─────────────────────────────────────────
+
+if [ "$NUM_GPUS" -gt 1 ]; then
+    echo ""
+    echo "=== Per-test output (replayed from parallel execution) ==="
+    for logfile in "$XML_LOG_DIR"/*.log; do
+        if [ -f "$logfile" ]; then
+            echo ""
+            echo "────────────────────────────────────────────────────────"
+            echo ">>> $(basename "$logfile" .log)"
+            echo "────────────────────────────────────────────────────────"
+            cat "$logfile"
+        fi
+    done
+    echo ""
+    echo "=== End of per-test output ==="
+fi
+
+# ── Report results ──────────────────────────────────────────────────────────
+
+if [ -s "$FAIL_DIR/failures" ]; then
+    FAILED_CASES=$(tr '\n' ' ' < "$FAIL_DIR/failures")
+    echo "Error in the following test cases: $FAILED_CASES"
+    rm -rf "$FAIL_DIR"
     exit 1
 fi
+rm -rf "$FAIL_DIR"
 echo "All tests passed"
 exit 0