From 1f47aa5b91ce89f0d15158fee20ee57927da23c6 Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Sun, 26 Apr 2026 15:12:38 +0800
Subject: [PATCH] refresh test constraints

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_autodeploy.py |  9 ++-
 .../defs/accuracy/test_llm_api_pytorch.py    |  3 +-
 tests/integration/defs/test_e2e.py           | 56 +++++++++++++++++++
 3 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 4aafebe6b36f..52c5654c42e2 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -18,8 +18,8 @@
 import pytest
 import torch
 import yaml
-from defs.conftest import (get_llm_root, get_sm_version, skip_pre_blackwell,
-                           skip_pre_hopper)
+from defs.conftest import (get_llm_root, get_sm_version, skip_pre_ada,
+                           skip_pre_blackwell, skip_pre_hopper)
 from test_common.llm_data import hf_id_to_local_model_dir, llm_models_root
 
 from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
@@ -494,6 +494,7 @@ def test_auto_dtype(self, enable_chunked_prefill):
                 **kwargs) as llm:
             self.evaluate_tasks(llm, sampling_params)
 
+    @skip_pre_ada
     @pytest.mark.skip_less_device_memory(32000)
     @pytest.mark.parametrize("enable_chunked_prefill", [True])
     def test_fp8(self, enable_chunked_prefill):
@@ -553,6 +554,8 @@ def get_default_sampling_params(self):
     def test_accuracy(self, model_id, world_size, attn_backend):
         if model_id == "nvfp4" and get_sm_version() < 100:
             pytest.skip("NVFP4 requires Blackwell or later")
+        if model_id == "fp8" and get_sm_version() < 90:
+            pytest.skip("FP8 requires Hopper or later")
         if world_size > get_device_count():
             pytest.skip(f"Not enough devices for world_size={world_size}")
         model_path = self.MODEL_PATHS[model_id]
@@ -1111,8 +1114,10 @@ class TestModelRegistryAccuracy(LlmapiAccuracyTestHarness):
         pytest.param("meta-llama/Llama-3.1-8B-Instruct", {}, [MMLU, GSM8K],
                      id="meta-llama_Llama-3.1-8B-Instruct"),
         pytest.param("nvidia/Llama-3.1-8B-Instruct-FP8", {}, [MMLU, GSM8K],
+                     marks=skip_pre_ada,
                      id="nvidia_Llama-3.1-8B-Instruct-FP8"),
         pytest.param("nvidia/Llama-3.1-8B-Instruct-NVFP4", {}, [MMLU, GSM8K],
+                     marks=skip_pre_blackwell,
                      id="nvidia_Llama-3.1-8B-Instruct-NVFP4"),
         pytest.param("google/gemma-3-1b-it", {}, [MMLU, GSM8K],
                      id="google_gemma-3-1b-it"),
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 76b91cab3f3e..a3f2d89981cc 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1321,6 +1321,7 @@ def test_fp8_prequantized(self):
         task.evaluate(llm)
 
 
+@skip_pre_hopper
 class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "google/gemma-3-1b-it"
     MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-1b-it/"
@@ -1361,7 +1362,6 @@ def test_fp8_prequantized(self, torch_compile):
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)
 
-    @skip_pre_hopper
     def test_fp8_vswa_reuse(self):
         # NOTE: Test with VSWA kv cache config.
         kv_cache_config = KvCacheConfig(
@@ -1376,7 +1376,6 @@ def test_fp8_vswa_reuse(self):
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)
 
-    @skip_pre_hopper
     @pytest.mark.parametrize("backend", ["xgrammar"])
     def test_fp8_guided_decoding_vswa_reuse(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 9dbd59331e0e..ac4ec98f16b4 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -2624,3 +2624,59 @@ def test_get_ci_container_port():
     assert container_port_start > 0
     assert container_port_num > 0
     assert container_port_start + container_port_num <= 60000
+
+
+@skip_pre_hopper
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.parametrize("model_name", ["meta/Meta-Llama-3.1-8B"],
+                         ids=["llama3_1-8b"])
+@pytest.mark.parametrize("model_subdir", ["llama-3.1-model/Meta-Llama-3.1-8B"],
+                         ids=["llama_v3_1"])
+def test_trtllm_bench_mig_launch(llm_root, llm_venv, model_name, model_subdir):
+    """Run the benchmark in MIG mode and check that throughput scales up with concurrency."""
+    results = {}
+    concurrency_list = [1, 32, 64, 128]
+
+    for concurrency in concurrency_list:
+        num_requests = concurrency * 10
+        runner = BenchRunner(llm_root=llm_root,
+                             llm_venv=llm_venv,
+                             model_name=model_name,
+                             model_subdir=model_subdir,
+                             streaming=False,
+                             use_pytorch_backend=True,
+                             use_mpirun=False,
+                             tp_size=1,
+                             concurrency=concurrency,
+                             num_requests=num_requests)
+
+        output = runner()
+        results[concurrency] = output
+
+    print("\n=== Benchmark Results Comparison ===")
+    print(f"Model: {model_name}")
+    print(
+        f"{'Concurrency':<15} {'Throughput':<15} {'Latency':<15} {'Num Requests':<15}"
+    )
+    print("-" * 60)
+
+    for idx, val in enumerate(concurrency_list):
+        metrics = results.get(val)
+        if not isinstance(metrics, dict):
+            pytest.fail(
+                f"Unexpected benchmark result type for concurrency {val}: {type(metrics)}"
+            )
+        try:
+            throughput = float(metrics.get('throughput', 0))
+            latency = float(metrics.get('latency', 0))
+            num_requests = int(metrics.get('num_requests', 0))
+        except (ValueError, TypeError) as e:
+            pytest.fail(
+                f"Failed to parse benchmark results for concurrency {val}: {e}")
+        assert throughput > 0, f"Throughput is 0 for concurrency {val}"
+        assert latency > 0, f"Latency is 0 for concurrency {val}"
+        print(f"{val:<15} {throughput:<15} {latency:<15} {num_requests:<15}")
+        if idx > 0:
+            prev_throughput = float(results[concurrency_list[idx - 1]].get(
+                'throughput', 0))
+            assert throughput > prev_throughput * 1.3, f"Throughput did not improve by more than 1.3x at concurrency {val}"
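
A minimal sketch of the throughput-scaling check the new test applies, for readers who want to try the logic outside the harness. It assumes BenchRunner results are dicts exposing a 'throughput' key, as the parsing code above expects; check_throughput_scaling is a hypothetical helper introduced only for illustration, and the sample numbers are made up.

# Hypothetical helper: mirrors the >1.3x scaling assertion used in test_trtllm_bench_mig_launch.
from typing import Dict, List


def check_throughput_scaling(results: Dict[int, dict],
                             concurrency_list: List[int],
                             min_gain: float = 1.3) -> None:
    """Assert throughput grows by more than min_gain x between consecutive concurrency levels."""
    prev_throughput = None
    for concurrency in concurrency_list:
        throughput = float(results[concurrency].get("throughput", 0))
        assert throughput > 0, f"Throughput is 0 for concurrency {concurrency}"
        if prev_throughput is not None:
            assert throughput > prev_throughput * min_gain, (
                f"Throughput did not improve by >{min_gain}x at concurrency {concurrency}")
        prev_throughput = throughput


# Illustrative numbers only: each step improves by more than 1.3x, so this passes.
check_throughput_scaling(
    {1: {"throughput": 100.0}, 32: {"throughput": 1500.0},
     64: {"throughput": 2200.0}, 128: {"throughput": 3000.0}},
    [1, 32, 64, 128])

The 1.3x factor matches the assertion threshold in the patch: each higher concurrency level must beat the previous one by more than 30%, presumably to catch a MIG launch where added concurrency yields no real scaling.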