diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
index f2b0b07fed..2d4d80967b 100644
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -7,60 +7,153 @@ function error_exit() {
     exit 1
 }
 
+: ${TE_PATH:=/opt/transformerengine}
+: ${XML_LOG_DIR:=/logs}
+mkdir -p "$XML_LOG_DIR"
+
+set -x
+
+pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
+
+# ── Parallel test infrastructure ────────────────────────────────────────────
+# Detect GPUs and run tests in parallel waves (one test per GPU per wave).
+# With 1 GPU, behavior is identical to sequential execution.
+
+FAIL_DIR=$(mktemp -d)
+
 function test_fail() {
-    RET=1
-    FAILED_CASES="$FAILED_CASES $1"
+    echo "$1" >> "$FAIL_DIR/failures"
     echo "Error: sub-test failed: $1"
 }
 
-RET=0
-FAILED_CASES=""
+# Detect available GPUs
+if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
+    IFS=',' read -ra GPU_LIST <<< "$CUDA_VISIBLE_DEVICES"
+    NUM_GPUS=${#GPU_LIST[@]}
+else
+    NUM_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l)
+    # wc -l prints 0 (never an empty string) when nvidia-smi is absent or
+    # fails, so a ${NUM_GPUS:-1} default can never fire; clamp explicitly to
+    # avoid a divide-by-zero in the gpu_id modulo inside run_test.
+    if [ "$NUM_GPUS" -lt 1 ]; then NUM_GPUS=1; fi
+    GPU_LIST=()
+    for ((i = 0; i < NUM_GPUS; i++)); do
+        GPU_LIST+=("$i")
+    done
+fi
+
+WAVE_COUNT=0
+
+# run_test ENV_PREFIX XML_NAME TEST_PATH FAIL_LABEL
+# Dispatches one pytest invocation onto the next GPU slot of the current wave.
+# NOTE(review): the original patch text was truncated by extraction inside
+# this function; the body below is reconstructed from the visible call sites
+# and the surviving wave/wait logic — confirm against the upstream patch.
+run_test() {
+    local env_prefix="$1" xml_name="$2" test_path="$3" fail_label="$4"
+    local gpu_id=$((WAVE_COUNT % NUM_GPUS))
+    WAVE_COUNT=$((WAVE_COUNT + 1))
+    if [ "$NUM_GPUS" -le 1 ]; then
+        # Single GPU: run in the foreground, matching the old sequential flow.
+        # eval is required so env_prefix assignments apply to the command; all
+        # arguments are fixed, trusted strings defined in this script.
+        eval "${env_prefix} python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/${xml_name} ${test_path}" \
+            || test_fail "$fail_label"
+    else
+        echo ">>> Starting: ${fail_label} on GPU ${GPU_LIST[$gpu_id]}"
+        (
+            eval "CUDA_VISIBLE_DEVICES=${GPU_LIST[$gpu_id]} ${env_prefix} python3 -u -m pytest --tb=auto --junitxml=$XML_LOG_DIR/${xml_name} ${test_path}" \
+                > "$XML_LOG_DIR/${xml_name%.xml}.log" 2>&1 \
+                || test_fail "$fail_label"
+            echo ">>> Finished: ${fail_label} on GPU ${GPU_LIST[$gpu_id]}"
+        ) &
+    fi
+
+    # When we've filled all GPUs, wait for the wave to complete
+    if [ "$WAVE_COUNT" -ge "$NUM_GPUS" ] && [ "$NUM_GPUS" -gt 1 ]; then
+        wait
+        WAVE_COUNT=0
+    fi
+}
+
+# ── Checkpoint pre-step (must run before test_checkpoint.py) ────────────────
 
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "test_sanity.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_PATH/tests/pytorch/test_recipe.py || test_fail "test_recipe.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_deferred_init.xml $TE_PATH/tests/pytorch/test_deferred_init.py || test_fail "test_deferred_init.py"
-PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/test_numerics.py || test_fail "test_numerics.py"
-PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cuda_graphs.xml $TE_PATH/tests/pytorch/test_cuda_graphs.py || test_fail "test_cuda_graphs.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_jit.xml $TE_PATH/tests/pytorch/test_jit.py || test_fail "test_jit.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_rope.xml $TE_PATH/tests/pytorch/test_fused_rope.py || test_fail "test_fused_rope.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_nvfp4.xml $TE_PATH/tests/pytorch/nvfp4 || test_fail "test_nvfp4"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_mxfp8.xml $TE_PATH/tests/pytorch/mxfp8 || test_fail "test_mxfp8"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_quantized_tensor.xml $TE_PATH/tests/pytorch/test_quantized_tensor.py || test_fail "test_quantized_tensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8blockwisetensor.xml $TE_PATH/tests/pytorch/test_float8blockwisetensor.py || test_fail "test_float8blockwisetensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py || test_fail "test_float8_blockwise_scaling_exact.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/test_grouped_tensor.xml $TE_PATH/tests/pytorch/test_grouped_tensor.py || test_fail "test_grouped_tensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/test_fusible_ops.py || test_fail "test_fusible_ops.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_permutation.xml $TE_PATH/tests/pytorch/test_permutation.py || test_fail "test_permutation.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entropy.xml $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py || test_fail "test_parallel_cross_entropy.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading.xml $TE_PATH/tests/pytorch/test_cpu_offloading.py || test_fail "test_cpu_offloading.py"
-NVTE_FLASH_ATTN=0 NVTE_CPU_OFFLOAD_V1=1 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading_v1.xml $TE_PATH/tests/pytorch/test_cpu_offloading_v1.py || test_fail "test_cpu_offloading_v1.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_attention.xml $TE_PATH/tests/pytorch/attention/test_attention.py || test_fail "test_attention.py"
-NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_attention_deterministic.xml $TE_PATH/tests/pytorch/attention/test_attention.py || test_fail "NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 test_attention.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_kv_cache.xml $TE_PATH/tests/pytorch/attention/test_kv_cache.py || test_fail "test_kv_cache.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_hf_integration.xml $TE_PATH/tests/pytorch/test_hf_integration.py || test_fail "test_hf_integration.py"
 export NVTE_TEST_CHECKPOINT_ARTIFACT_PATH=$TE_PATH/artifacts/tests/pytorch/test_checkpoint
 if [ ! -d "$NVTE_TEST_CHECKPOINT_ARTIFACT_PATH" ]; then
-    python3 $TE_PATH/tests/pytorch/test_checkpoint.py --save-checkpoint all || error_exit "Failed to generate checkpoint files"
+    python3 $TE_PATH/tests/pytorch/test_checkpoint.py --save-checkpoint all \
+        || error_exit "Failed to generate checkpoint files"
 fi
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_checkpoint.xml $TE_PATH/tests/pytorch/test_checkpoint.py || test_fail "test_checkpoint.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_router.xml $TE_PATH/tests/pytorch/test_fused_router.py || test_fail "test_fused_router.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_partial_cast.xml $TE_PATH/tests/pytorch/test_partial_cast.py || test_fail "test_partial_cast.py"
-if [ "$RET" -ne 0 ]; then
-    echo "Error in the following test cases:$FAILED_CASES"
+# ── Tests ───────────────────────────────────────────────────────────────────
+# Each run_test call: env_prefix, xml_name, test_path, fail_label
+# Tests are dispatched in waves of NUM_GPUS, one per GPU.
+
+# NOTE(review): two DEBUG injections explicitly marked "remove before merging"
+# were dropped here; one of them had *replaced* the real test_sanity.py run
+# with an injected RuntimeError, so the sanity suite was never executed.
+run_test "" "pytest_test_sanity.xml" "$TE_PATH/tests/pytorch/test_sanity.py" "test_sanity.py"
+run_test "" "pytest_test_recipe.xml" "$TE_PATH/tests/pytorch/test_recipe.py" "test_recipe.py"
+run_test "" "pytest_test_deferred_init.xml" "$TE_PATH/tests/pytorch/test_deferred_init.py" "test_deferred_init.py"
+run_test "PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0" "pytest_test_numerics.xml" "$TE_PATH/tests/pytorch/test_numerics.py" "test_numerics.py"
+run_test "PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0" "pytest_test_cuda_graphs.xml" "$TE_PATH/tests/pytorch/test_cuda_graphs.py" "test_cuda_graphs.py"
+run_test "" "pytest_test_jit.xml" "$TE_PATH/tests/pytorch/test_jit.py" "test_jit.py"
+run_test "" "pytest_test_fused_rope.xml" "$TE_PATH/tests/pytorch/test_fused_rope.py" "test_fused_rope.py"
+run_test "" "pytest_test_nvfp4.xml" "$TE_PATH/tests/pytorch/nvfp4" "test_nvfp4"
+run_test "" "pytest_test_mxfp8.xml" "$TE_PATH/tests/pytorch/mxfp8" "test_mxfp8"
+run_test "" "pytest_test_quantized_tensor.xml" "$TE_PATH/tests/pytorch/test_quantized_tensor.py" "test_quantized_tensor.py"
+run_test "" "pytest_test_float8blockwisetensor.xml" "$TE_PATH/tests/pytorch/test_float8blockwisetensor.py" "test_float8blockwisetensor.py"
+run_test "" "pytest_test_float8_blockwise_scaling_exact.xml" "$TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py" "test_float8_blockwise_scaling_exact.py"
+run_test "" "pytest_test_float8_blockwise_gemm_exact.xml" "$TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py" "test_float8_blockwise_gemm_exact.py"
+# NOTE(review): XML name below lacks the pytest_ prefix used everywhere else;
+# kept as-is (matches the pre-patch script) in case CI keys on the artifact name.
+run_test "" "test_grouped_tensor.xml" "$TE_PATH/tests/pytorch/test_grouped_tensor.py" "test_grouped_tensor.py"
+run_test "" "pytest_test_gqa.xml" "$TE_PATH/tests/pytorch/test_gqa.py" "test_gqa.py"
+run_test "" "pytest_test_fused_optimizer.xml" "$TE_PATH/tests/pytorch/test_fused_optimizer.py" "test_fused_optimizer.py"
+run_test "" "pytest_test_multi_tensor.xml" "$TE_PATH/tests/pytorch/test_multi_tensor.py" "test_multi_tensor.py"
+run_test "" "pytest_test_fusible_ops.xml" "$TE_PATH/tests/pytorch/test_fusible_ops.py" "test_fusible_ops.py"
+run_test "" "pytest_test_permutation.xml" "$TE_PATH/tests/pytorch/test_permutation.py" "test_permutation.py"
+run_test "" "pytest_test_parallel_cross_entropy.xml" "$TE_PATH/tests/pytorch/test_parallel_cross_entropy.py" "test_parallel_cross_entropy.py"
+run_test "" "pytest_test_cpu_offloading.xml" "$TE_PATH/tests/pytorch/test_cpu_offloading.py" "test_cpu_offloading.py"
+run_test "NVTE_FLASH_ATTN=0 NVTE_CPU_OFFLOAD_V1=1" "pytest_test_cpu_offloading_v1.xml" "$TE_PATH/tests/pytorch/test_cpu_offloading_v1.py" "test_cpu_offloading_v1.py"
+run_test "" "pytest_test_attention.xml" "$TE_PATH/tests/pytorch/attention/test_attention.py" "test_attention.py"
+run_test "NVTE_ALLOW_NONDETERMINISTIC_ALGO=0" "pytest_test_attention_deterministic.xml" "$TE_PATH/tests/pytorch/attention/test_attention.py" "NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 test_attention.py"
+run_test "" "pytest_test_kv_cache.xml" "$TE_PATH/tests/pytorch/attention/test_kv_cache.py" "test_kv_cache.py"
+run_test "" "pytest_test_hf_integration.xml" "$TE_PATH/tests/pytorch/test_hf_integration.py" "test_hf_integration.py"
+run_test "" "pytest_test_checkpoint.xml" "$TE_PATH/tests/pytorch/test_checkpoint.py" "test_checkpoint.py"
+run_test "" "pytest_test_fused_router.xml" "$TE_PATH/tests/pytorch/test_fused_router.py" "test_fused_router.py"
+run_test "" "pytest_test_partial_cast.xml" "$TE_PATH/tests/pytorch/test_partial_cast.py" "test_partial_cast.py"
+
+# ── Wait for remaining background jobs ──────────────────────────────────────
+
+if [ "$NUM_GPUS" -gt 1 ]; then
+    wait
+fi
+
+# ── Replay per-test logs into trace ─────────────────────────────────────────
+
+if [ "$NUM_GPUS" -gt 1 ]; then
+    echo ""
+    echo "=== Per-test output (replayed from parallel execution) ==="
+    for logfile in "$XML_LOG_DIR"/*.log; do
+        if [ -f "$logfile" ]; then
+            echo ""
+            echo "────────────────────────────────────────────────────────"
+            echo ">>> $(basename "$logfile" .log)"
+            echo "────────────────────────────────────────────────────────"
+            cat "$logfile"
+        fi
+    done
+    echo ""
+    echo "=== End of per-test output ==="
+fi
+
+# ── Report results ──────────────────────────────────────────────────────────
+
+if [ -s "$FAIL_DIR/failures" ]; then
+    FAILED_CASES=$(tr '\n' ' ' < "$FAIL_DIR/failures")
+    echo "Error in the following test cases: $FAILED_CASES"
+    rm -rf "$FAIL_DIR"
     exit 1
 fi
+rm -rf "$FAIL_DIR"
 echo "All tests passed"
 exit 0