
Commit 5870271

xinhe-nv authored and codego7250 committed
[None][fix] Waive gb200 (NVIDIA#9580)
Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>
1 parent 9acaeb5 commit 5870271

File tree

5 files changed: +4 -185 lines changed


tests/integration/defs/.test_durations

Lines changed: 0 additions & 1 deletion
@@ -640,7 +640,6 @@
     "examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_fp8]": 314.3205590210273,
     "examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_int8_wo]": 329.1954380639363,
     "examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_sq_ootb]": 216.2645359209855,
-    "examples/test_llama.py::test_llm_llama_v2_lora_benchmark_2gpu[chinese_lora-llama-v2-13b-hf]": 3641.9526145930286,
     "examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[disable_gemm_allreduce_plugin-llama-3.1-70b-enable_fp8]": 1654.751242957951,
     "examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-enable_fp8]": 20655.04908744397,
     "examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-fp8-disable_fp8]": 13962.460933016031,

tests/integration/defs/examples/test_llama.py

Lines changed: 1 addition & 182 deletions
@@ -18,13 +18,10 @@
 import os
 import re
 import shutil
-import subprocess
-from copy import deepcopy
 
 import defs.ci_profiler
 import pytest
-from defs.common import (convert_weights, generate_summary_cmd,
-                         get_cpp_benchmark, get_trt_llm_lib_dir, parse_output,
+from defs.common import (convert_weights, generate_summary_cmd, parse_output,
                          quantize_data, similar,
                          test_llm_torch_multi_lora_support,
                          test_multi_lora_support, venv_check_call,
@@ -2683,184 +2680,6 @@ def test_llm_llama_v1_multiple_lora_1gpu(data_type, lora_data_type,
     venv_check_call(llm_venv, run_cmd)
 
 
-@pytest.mark.skip_less_device_memory(80000)
-@pytest.mark.skip_less_device(2)
-@pytest.mark.parametrize("llama_model_root", ['llama-v2-13b-hf'], indirect=True)
-@pytest.mark.parametrize("llm_lora_model_root", ["chinese-llama-2-lora-13b"],
-                         ids=["chinese_lora"],
-                         indirect=True)
-def test_llm_llama_v2_lora_benchmark_2gpu(llama_example_root, llama_model_root,
-                                          llm_venv, llm_root, cmodel_dir,
-                                          engine_dir, llm_lora_model_root):
-    "benchmark llama with multi lora on 2gpu"
-    print("Build engines...")
-
-    num_layers = 40
-    num_lora_mods = 7
-    max_lora_rank = 64
-    max_len = 1024
-    max_batch = 32
-    eos_id = 2
-    num_loras = (8, 16)
-    num_requests = 1024
-
-    model_dir = convert_weights(llm_venv=llm_venv,
-                                example_root=llama_example_root,
-                                cmodel_dir=cmodel_dir,
-                                model="llama-lora",
-                                model_path=llama_model_root,
-                                gpus=2,
-                                tp_size=2,
-                                data_type="float16")
-
-    print("Build engines...")
-    build_cmd = [
-        "trtllm-build",
-        f"--checkpoint_dir={model_dir}",
-        f"--output_dir={engine_dir}",
-        f"--max_batch_size={max_batch}",
-        f"--max_input_len={max_len}",
-        f"--max_seq_len={2 * max_len}",
-        "--gemm_plugin=float16",
-        "--lora_plugin=float16",
-        "--use_paged_context_fmha=enable",
-        "--lora_target_modules",
-        "attn_q",
-        "attn_k",
-        "attn_v",
-        "attn_dense",
-        "mlp_h_to_4h",
-        "mlp_4h_to_h",
-        "mlp_gate",
-        f"--max_lora_rank={max_lora_rank}",
-    ]
-    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
-
-    print("Convert LoRA to cpp format")
-    convert_cmd = [
-        "python",
-        f"{llama_example_root}/../../../hf_lora_convert.py",
-        f"-i={llm_lora_model_root}",
-        "--storage-type=float16",
-        f"-o={llm_venv.get_working_directory()}/lora_cpp",
-    ]
-    check_call(" ".join(convert_cmd), shell=True, env=llm_venv._new_env)
-
-    print("Prepare datasets")
-    benchmark_root = f"{llama_example_root}/../../../../benchmarks/cpp"
-    lora_eg = f"{llm_venv.get_working_directory()}/lora-eg"
-    base_dataset_cmd = [
-        f"mkdir -p {lora_eg}/data",
-        "&&",
-        "python",
-        f"{benchmark_root}/prepare_dataset.py",
-        f"--output={lora_eg}/data/token-norm-dist.json",
-        f"--tokenizer={llama_model_root}",
-        "token-norm-dist",
-        f"--num-requests={num_requests}",
-        "--input-mean=256",
-        "--input-stdev=16",
-        "--output-mean=128",
-        "--output-stdev 24",
-    ]
-    check_call(" ".join(base_dataset_cmd), shell=True, env=llm_venv._new_env)
-
-    for nloras in num_loras:
-        lora_dataset_cmd = [
-            "python",
-            f"{benchmark_root}/prepare_dataset.py",
-            f"--output={lora_eg}/data/token-norm-dist-lora-{nloras}.json",
-            f"--rand-task-id 0 {nloras-1}",
-            f"--tokenizer={llama_model_root}",
-            "token-norm-dist",
-            f"--num-requests={num_requests}",
-            "--input-mean=256",
-            "--input-stdev=16",
-            "--output-mean=128",
-            "--output-stdev 24",
-        ]
-        check_call(" ".join(lora_dataset_cmd),
-                   shell=True,
-                   env=llm_venv._new_env)
-
-    print("Generate random lora weights for 16 adapters")
-
-    lora_weights_cmd = [
-        "python", f"{benchmark_root}/utils/generate_rand_loras.py",
-        f"{llm_venv.get_working_directory()}/lora_cpp", f"{lora_eg}/loras", "16"
-    ]
-    check_call(" ".join(lora_weights_cmd), shell=True, env=llm_venv._new_env)
-
-    benchmark_exe = get_cpp_benchmark('gptManagerBenchmark', llm_root)
-    envs = deepcopy(os.environ)
-    _ = envs.pop("CUDA_VISIBLE_DEVICES", "")
-    envs[
-        "LD_LIBRARY_PATH"] = f'{get_trt_llm_lib_dir(llm_venv)}:{os.path.dirname(benchmark_exe)}:{envs.get("LD_LIBRARY_PATH", "")}'
-
-    print(
-        f'CUDA_VISIBLE_DEVICES: {os.environ.get("CUDA_VISIBLE_DEVICES", None)}')
-
-    print("Perform base model benchmarking")
-    check_call(f"mkdir -p {lora_eg}/log-base-lora", shell=True, env=envs)
-    base_benchmark_cmd = [
-        f"{benchmark_exe}",
-        f"--engine_dir={engine_dir}",
-        "--type=IFB",
-        f"--dataset={lora_eg}/data/token-norm-dist.json",
-        "--lora_host_cache_bytes=8589934592",
-        f"--lora_num_device_mod_layers={32 * num_layers * num_lora_mods * max_lora_rank}",
-        "--kv_cache_free_gpu_mem_fraction=0.70",
-        "--log_level=info",
-        f"--eos_id={eos_id}",
-    ]
-    mpi_cmd = [
-        "mpirun",
-        "-n",
-        "2",
-        "--allow-run-as-root",
-        "--output-filename",
-        f"{lora_eg}/log-base-lora",
-    ]
-    base_benchmark_cmd = mpi_cmd + base_benchmark_cmd
-    print(
-        f"Running gptManagerBenchmark using base cmd: {' '.join(base_benchmark_cmd)}"
-    )
-    subprocess.check_output(base_benchmark_cmd, env=envs)
-    # check_call(" ".join(base_benchmark_cmd), env=envs)
-
-    print("Perform lora model benchmarking")
-    for nloras in num_loras:
-        check_call(f"mkdir -p {lora_eg}/log-lora-{nloras}",
-                   shell=True,
-                   env=envs)
-        lora_benchmark_cmd = [
-            f"{benchmark_exe}",
-            f"--engine_dir={engine_dir}",
-            "--type=IFB",
-            f"--dataset={lora_eg}/data/token-norm-dist-lora-{nloras}.json",
-            "--lora_host_cache_bytes=8589934592",
-            f"--lora_num_device_mod_layers={16 * num_layers * num_lora_mods * max_lora_rank}",
-            "--kv_cache_free_gpu_mem_fraction=0.70",
-            "--log_level=info",
-            f"--eos_id={eos_id}",
-            f"--lora_dir={lora_eg}/loras",
-        ]
-        mpi_cmd = [
-            "mpirun",
-            "-n",
-            "2",
-            "--allow-run-as-root",
-            "--output-filename",
-            f"{lora_eg}/log-lora-{nloras}",
-        ]
-        lora_benchmark_cmd = mpi_cmd + lora_benchmark_cmd
-        print(
-            f"Running gptManagerBenchmark using lora cmd: {' '.join(lora_benchmark_cmd)}"
-        )
-        subprocess.check_output(lora_benchmark_cmd, env=envs)
-        # check_call(lora_benchmark_cmd, env=envs)
-
-
 @pytest.mark.timeout(7200)
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.skip_less_device(4)

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 0 additions & 1 deletion
@@ -111,7 +111,6 @@ examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-lla
 examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_fp8]
 examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_int8_wo]
 examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_sq_ootb]
-examples/test_llama.py::test_llm_llama_v2_lora_benchmark_2gpu[chinese_lora-llama-v2-13b-hf]
 examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-enable_fp8] TIMEOUT (120)
 examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-fp8-disable_fp8] TIMEOUT (90)
 examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8]

tests/integration/test_lists/test-db/l0_dgx_h200.yml

Lines changed: 0 additions & 1 deletion
@@ -159,7 +159,6 @@ l0_dgx_h200:
 - accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_tp2cp2
 - accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
 - examples/test_llama.py::test_llm_llama_long_alpaca_8gpu_summary[pg64317-tp4pp2-nb:4]
-- examples/test_llama.py::test_llm_llama_v2_lora_benchmark_2gpu[chinese_lora-llama-v2-13b-hf]
 - examples/test_mixtral.py::test_llm_mixtral_moe_plugin_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora]
 - unittest/llmapi/test_llm_multi_gpu.py -m "gpu2 and part0"
 - unittest/llmapi/test_llm_multi_gpu.py -m "gpu2 and part1"

tests/integration/test_lists/waives.txt

Lines changed: 3 additions & 0 deletions
@@ -419,3 +419,6 @@ accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutla
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass] SKIP (https://nvbugs/5702795)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] SKIP (https://nvbugs/5702795)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5698897)
+test_e2e/test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5648560)
+test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] SKIP (https://nvbugs/5648560)
+test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5633700)
