From 0c70ce06fc80d7a4af72702a932cc51bbb8cb41a Mon Sep 17 00:00:00 2001
From: Sirui Wang
Date: Fri, 17 Apr 2026 00:41:29 -0700
Subject: [PATCH] Keep deploy test cases and Eagle fixes for merge

Add FLUX.1-dev FP8 coverage to the diffusers example tests, register the
EAGLE3 Nemotron-3 Nano checkpoints with the TRT-LLM deploy helper, add
deploy cases for Qwen3-VL, GLM, Gemma, Nemotron Super, Kimi Eagle3, and
MiniMax, and re-enable the previously skipped Medusa deploy test.

Signed-off-by: Sirui Wang
---
 tests/_test_utils/deploy_utils.py          |  2 +
 tests/_test_utils/examples/models.py       |  5 ++
 tests/examples/diffusers/test_diffusers.py | 14 +++-
 tests/examples/llm_ptq/test_deploy.py      | 80 +++++++++++++++++++++-
 4 files changed, 99 insertions(+), 2 deletions(-)

diff --git a/tests/_test_utils/deploy_utils.py b/tests/_test_utils/deploy_utils.py
index bdf879be83..32f80e7637 100644
--- a/tests/_test_utils/deploy_utils.py
+++ b/tests/_test_utils/deploy_utils.py
@@ -257,6 +257,8 @@ def _deploy_trtllm_impl(self):
         qwen3_models = (
             "nvidia/Qwen3-Next-80B-A3B-Instruct-NVFP4",
             "nvidia/Qwen3-Next-80B-A3B-Thinking-NVFP4",
+            "nvidia/EAGLE3-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+            "/s3/nvidia/EAGLE3-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
         )
         nemotron_models = (
             "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
diff --git a/tests/_test_utils/examples/models.py b/tests/_test_utils/examples/models.py
index 8bf2b95a60..1ee3acf315 100644
--- a/tests/_test_utils/examples/models.py
+++ b/tests/_test_utils/examples/models.py
@@ -69,6 +69,11 @@ def _select_path(remote_id: str, local_id: str) -> str:
 )
 
 # Diffusers
+FLUX_DEV_PATH = _select_path(
+    remote_id="black-forest-labs/FLUX.1-dev",
+    local_id="black-forest-labs/FLUX.1-dev",
+)
+
 FLUX_SCHNELL_PATH = _select_path(
     remote_id="hf-internal-testing/tiny-flux-pipe",
     local_id="black-forest-labs/FLUX.1-schnell",
diff --git a/tests/examples/diffusers/test_diffusers.py b/tests/examples/diffusers/test_diffusers.py
index 5bc8f981ec..aed78d1d0f 100644
--- a/tests/examples/diffusers/test_diffusers.py
+++ b/tests/examples/diffusers/test_diffusers.py
@@ -17,7 +17,7 @@
 from typing import NamedTuple
 
 import pytest
-from _test_utils.examples.models import FLUX_SCHNELL_PATH, SD3_PATH, SDXL_1_0_PATH
+from _test_utils.examples.models import FLUX_DEV_PATH, FLUX_SCHNELL_PATH, SD3_PATH, SDXL_1_0_PATH
 from _test_utils.examples.run_command import run_example_command
 from _test_utils.torch.misc import minimum_sm
 
@@ -99,6 +99,17 @@ def inference(self, tmp_path: Path) -> None:
 @pytest.mark.parametrize(
     "model",
     [
+        pytest.param(
+            DiffuserModel(
+                name="flux-dev",
+                path=FLUX_DEV_PATH,
+                dtype="BFloat16",
+                format_type="fp8",
+                quant_algo="max",
+                collect_method="default",
+            ),
+            marks=minimum_sm(89),
+        ),
         DiffuserModel(
             name="flux-schnell",
             path=FLUX_SCHNELL_PATH,
@@ -136,6 +147,7 @@ def inference(self, tmp_path: Path) -> None:
         ),
     ],
     ids=[
+        "flux_dev_bf16_fp8_max_3.0_default",
         "flux_schnell_bf16_int8_smoothquant_3.0_min_mean",
         "sd3_medium_fp16_int8_smoothquant_3.0_min_mean",
         "sdxl_1.0_fp16_fp8_max_3.0_default",
diff --git a/tests/examples/llm_ptq/test_deploy.py b/tests/examples/llm_ptq/test_deploy.py
index bdada9f8c1..3cabb5b2cc 100644
--- a/tests/examples/llm_ptq/test_deploy.py
+++ b/tests/examples/llm_ptq/test_deploy.py
@@ -246,6 +246,12 @@ def test_llama(command):
             tensor_parallel_size=4,
             mini_sm=100,
         ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-VL-235B-A22B-Instruct-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
         *ModelDeployerList(
             model_id="nvidia/Qwen3-30B-A3B-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
@@ -295,6 +301,28 @@ def test_qwen(command):
     command.run()
 
 
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/GLM-4.7-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/GLM-5-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_glm(command):
+    command.run()
+
+
 @pytest.mark.parametrize(
     "command",
     [
@@ -346,6 +374,13 @@ def test_mixtral(command):
             mini_sm=89,
             attn_backend="FLASHINFER",
         ),
+        *ModelDeployerList(
+            model_id="nvidia/Gemma-4-31B-IT-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=100,
+            attn_backend="FLASHINFER",
+        ),
     ],
     ids=idfn,
 )
@@ -451,6 +486,18 @@ def test_kimi(command):
             mini_sm=89,
             attn_backend="FLASHINFER",
         ),
+        *ModelDeployerList(
+            model_id="nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
     ],
     ids=idfn,
 )
@@ -482,7 +529,6 @@ def test_llama_nemotron(command):
     ],
     ids=idfn,
 )
-@pytest.mark.skip(reason="Medusa is not supported yet")
 def test_medusa(command):
     command.run()
 
@@ -497,6 +543,22 @@ def test_medusa(command):
             tensor_parallel_size=8,
             mini_sm=89,
         ),
+        *ModelDeployerList(
+            base_model="nvidia/Kimi-K2-Thinking-NVFP4",
+            model_id="nvidia/Kimi-K2-Thinking-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            eagle3_one_model=False,
+        ),
+        *ModelDeployerList(
+            base_model="nvidia/Kimi-K2.5-NVFP4",
+            model_id="nvidia/Kimi-K2.5-Thinking-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            eagle3_one_model=False,
+        ),
         *ModelDeployerList(
             base_model="Qwen/Qwen3-235B-A22B",
             model_id="nvidia/Qwen3-235B-A22B-Eagle3",
@@ -588,3 +650,19 @@ def test_eagle(command):
         command.run()
     else:
         pytest.skip(f"Local model not found: {local_path}")
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/MiniMax-M2.5-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_minimax(command):
+    command.run()
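
Context for the deploy cases above: `ModelDeployerList` is splatted (`*`) into each
`@pytest.mark.parametrize` list, so it must return one deploy command per entry in the
`backend` tuple, each exposing a `.run()` method. The sketch below shows that assumed
expansion; `DeployCommand` and everything not visible as a keyword argument in the diff
are illustrative stand-ins, not the repository's actual implementation.

from dataclasses import dataclass, field


@dataclass
class DeployCommand:
    # Illustrative stand-in for whatever object the real helper yields;
    # the diff only shows that it exposes .run().
    model_id: str
    backend: str
    tensor_parallel_size: int
    mini_sm: int
    extra: dict = field(default_factory=dict)

    def run(self) -> None:
        # A real implementation would launch the model on self.backend
        # (trtllm / vllm / sglang) and fail the test on a non-zero exit,
        # skipping on GPUs below the mini_sm compute capability.
        print(f"deploy {self.model_id} on {self.backend} "
              f"(tp={self.tensor_parallel_size}, sm>={self.mini_sm})")


def ModelDeployerList(model_id="", backend=(), tensor_parallel_size=1,
                      mini_sm=0, **kwargs):
    # One case per backend, so `*ModelDeployerList(...)` splices
    # len(backend) entries into the parametrize list. Extra keywords
    # (base_model, attn_backend, eagle3_one_model, ...) pass through.
    return [
        DeployCommand(model_id, b, tensor_parallel_size, mini_sm, kwargs)
        for b in backend
    ]

Under that reading, the new cases can be selected individually with pytest's keyword
filter, e.g. `pytest tests/examples/llm_ptq/test_deploy.py -k test_glm`, and `ids=idfn`
keeps the generated test ids readable.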
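Note also that the new `FLUX_DEV_PATH` in models.py passes the same id for `remote_id`
and `local_id`, unlike `FLUX_SCHNELL_PATH`, which substitutes the tiny
`hf-internal-testing/tiny-flux-pipe` fixture in the remote case. A minimal sketch of
what `_select_path` is assumed to do, with a hypothetical environment variable standing
in for however the repository actually toggles local vs. remote checkpoints:

import os


def _select_path(remote_id: str, local_id: str) -> str:
    # Assumed behavior: prefer a pre-downloaded copy under a local model
    # root when one exists, otherwise fall back to the Hugging Face id.
    # MODEL_ROOT is a hypothetical env var, not the repository's real knob.
    root = os.environ.get("MODEL_ROOT")
    if root:
        candidate = os.path.join(root, local_id)
        if os.path.isdir(candidate):
            return candidate
    return remote_id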