From 0c70ce06fc80d7a4af72702a932cc51bbb8cb41a Mon Sep 17 00:00:00 2001
From: Sirui Wang
Date: Fri, 17 Apr 2026 00:41:29 -0700
Subject: [PATCH] Keep deploy test cases and Eagle fixes for merge

Add FLUX.1-dev FP8 coverage to the diffusers example tests, register the
EAGLE3 Nemotron-3 Nano checkpoints with the TRT-LLM deploy helper, add
deploy cases for Qwen3-VL, GLM, Gemma, Nemotron Super, Kimi Eagle3, and
MiniMax, and re-enable the previously skipped Medusa deploy test.

Signed-off-by: Sirui Wang
---
 tests/_test_utils/deploy_utils.py          |  2 +
 tests/_test_utils/examples/models.py       |  5 ++
 tests/examples/diffusers/test_diffusers.py | 14 +++-
 tests/examples/llm_ptq/test_deploy.py      | 80 +++++++++++++++++++++-
 4 files changed, 99 insertions(+), 2 deletions(-)

diff --git a/tests/_test_utils/deploy_utils.py b/tests/_test_utils/deploy_utils.py
index bdf879be83..32f80e7637 100644
--- a/tests/_test_utils/deploy_utils.py
+++ b/tests/_test_utils/deploy_utils.py
@@ -257,6 +257,8 @@ def _deploy_trtllm_impl(self):
         qwen3_models = (
             "nvidia/Qwen3-Next-80B-A3B-Instruct-NVFP4",
             "nvidia/Qwen3-Next-80B-A3B-Thinking-NVFP4",
+            "nvidia/EAGLE3-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+            "/s3/nvidia/EAGLE3-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
         )
         nemotron_models = (
             "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
diff --git a/tests/_test_utils/examples/models.py b/tests/_test_utils/examples/models.py
index 8bf2b95a60..1ee3acf315 100644
--- a/tests/_test_utils/examples/models.py
+++ b/tests/_test_utils/examples/models.py
@@ -69,6 +69,11 @@ def _select_path(remote_id: str, local_id: str) -> str:
 )
 
 # Diffusers
+FLUX_DEV_PATH = _select_path(
+    remote_id="black-forest-labs/FLUX.1-dev",
+    local_id="black-forest-labs/FLUX.1-dev",
+)
+
 FLUX_SCHNELL_PATH = _select_path(
     remote_id="hf-internal-testing/tiny-flux-pipe",
     local_id="black-forest-labs/FLUX.1-schnell",
diff --git a/tests/examples/diffusers/test_diffusers.py b/tests/examples/diffusers/test_diffusers.py
index 5bc8f981ec..aed78d1d0f 100644
--- a/tests/examples/diffusers/test_diffusers.py
+++ b/tests/examples/diffusers/test_diffusers.py
@@ -17,7 +17,7 @@
 from typing import NamedTuple
 
 import pytest
-from _test_utils.examples.models import FLUX_SCHNELL_PATH, SD3_PATH, SDXL_1_0_PATH
+from _test_utils.examples.models import FLUX_DEV_PATH, FLUX_SCHNELL_PATH, SD3_PATH, SDXL_1_0_PATH
 from _test_utils.examples.run_command import run_example_command
 from _test_utils.torch.misc import minimum_sm
 
@@ -99,6 +99,17 @@ def inference(self, tmp_path: Path) -> None:
 @pytest.mark.parametrize(
     "model",
     [
+        pytest.param(
+            DiffuserModel(
+                name="flux-dev",
+                path=FLUX_DEV_PATH,
+                dtype="BFloat16",
+                format_type="fp8",
+                quant_algo="max",
+                collect_method="default",
+            ),
+            marks=minimum_sm(89),
+        ),
         DiffuserModel(
             name="flux-schnell",
             path=FLUX_SCHNELL_PATH,
@@ -136,6 +147,7 @@ def inference(self, tmp_path: Path) -> None:
         ),
     ],
     ids=[
+        "flux_dev_bf16_fp8_max_3.0_default",
         "flux_schnell_bf16_int8_smoothquant_3.0_min_mean",
         "sd3_medium_fp16_int8_smoothquant_3.0_min_mean",
         "sdxl_1.0_fp16_fp8_max_3.0_default",
diff --git a/tests/examples/llm_ptq/test_deploy.py b/tests/examples/llm_ptq/test_deploy.py
index bdada9f8c1..3cabb5b2cc 100644
--- a/tests/examples/llm_ptq/test_deploy.py
+++ b/tests/examples/llm_ptq/test_deploy.py
@@ -246,6 +246,12 @@ def test_llama(command):
             tensor_parallel_size=4,
             mini_sm=100,
         ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-VL-235B-A22B-Instruct-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
         *ModelDeployerList(
             model_id="nvidia/Qwen3-30B-A3B-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
@@ -295,6 +301,28 @@ def test_qwen(command):
     command.run()
 
 
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/GLM-4.7-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/GLM-5-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_glm(command):
+    command.run()
+
+
 @pytest.mark.parametrize(
     "command",
     [
@@ -346,6 +374,13 @@ def test_mixtral(command):
             mini_sm=89,
             attn_backend="FLASHINFER",
         ),
+        *ModelDeployerList(
+            model_id="nvidia/Gemma-4-31B-IT-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=100,
+            attn_backend="FLASHINFER",
+        ),
     ],
     ids=idfn,
 )
@@ -451,6 +486,18 @@ def test_kimi(command):
             mini_sm=89,
             attn_backend="FLASHINFER",
         ),
+        *ModelDeployerList(
+            model_id="nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
     ],
     ids=idfn,
 )
@@ -482,7 +529,6 @@ def test_llama_nemotron(command):
     ],
     ids=idfn,
 )
-@pytest.mark.skip(reason="Medusa is not supported yet")
 def test_medusa(command):
     command.run()
 
@@ -497,6 +543,22 @@ def test_medusa(command):
             tensor_parallel_size=8,
             mini_sm=89,
         ),
+        *ModelDeployerList(
+            base_model="nvidia/Kimi-K2-Thinking-NVFP4",
+            model_id="nvidia/Kimi-K2-Thinking-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            eagle3_one_model=False,
+        ),
+        *ModelDeployerList(
+            base_model="nvidia/Kimi-K2.5-NVFP4",
+            model_id="nvidia/Kimi-K2.5-Thinking-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            eagle3_one_model=False,
+        ),
         *ModelDeployerList(
             base_model="Qwen/Qwen3-235B-A22B",
             model_id="nvidia/Qwen3-235B-A22B-Eagle3",
@@ -588,3 +650,19 @@ def test_eagle(command):
         command.run()
     else:
         pytest.skip(f"Local model not found: {local_path}")
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/MiniMax-M2.5-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_minimax(command):
+    command.run()
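
Context for the deploy cases above: `ModelDeployerList` is splatted (`*`) into each
`@pytest.mark.parametrize` list, so it must return one deploy command per entry in the
`backend` tuple, each exposing a `.run()` method. The sketch below shows that assumed
expansion; `DeployCommand` and everything not visible as a keyword argument in the diff
are illustrative stand-ins, not the repository's actual implementation.

from dataclasses import dataclass, field


@dataclass
class DeployCommand:
    # Illustrative stand-in for whatever object the real helper yields;
    # the diff only shows that it exposes .run().
    model_id: str
    backend: str
    tensor_parallel_size: int
    mini_sm: int
    extra: dict = field(default_factory=dict)

    def run(self) -> None:
        # A real implementation would launch the model on self.backend
        # (trtllm / vllm / sglang) and fail the test on a non-zero exit,
        # skipping on GPUs below the mini_sm compute capability.
        print(f"deploy {self.model_id} on {self.backend} "
              f"(tp={self.tensor_parallel_size}, sm>={self.mini_sm})")


def ModelDeployerList(model_id="", backend=(), tensor_parallel_size=1,
                      mini_sm=0, **kwargs):
    # One case per backend, so `*ModelDeployerList(...)` splices
    # len(backend) entries into the parametrize list. Extra keywords
    # (base_model, attn_backend, eagle3_one_model, ...) pass through.
    return [
        DeployCommand(model_id, b, tensor_parallel_size, mini_sm, kwargs)
        for b in backend
    ]

Under that reading, the new cases can be selected individually with pytest's keyword
filter, e.g. `pytest tests/examples/llm_ptq/test_deploy.py -k test_glm`, and `ids=idfn`
keeps the generated test ids readable.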
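Note also that the new `FLUX_DEV_PATH` in models.py passes the same id for `remote_id`
and `local_id`, unlike `FLUX_SCHNELL_PATH`, which substitutes the tiny
`hf-internal-testing/tiny-flux-pipe` fixture in the remote case. A minimal sketch of
what `_select_path` is assumed to do, with a hypothetical environment variable standing
in for however the repository actually toggles local vs. remote checkpoints:

import os


def _select_path(remote_id: str, local_id: str) -> str:
    # Assumed behavior: prefer a pre-downloaded copy under a local model
    # root when one exists, otherwise fall back to the Hugging Face id.
    # MODEL_ROOT is a hypothetical env var, not the repository's real knob.
    root = os.environ.get("MODEL_ROOT")
    if root:
        candidate = os.path.join(root, local_id)
        if os.path.isdir(candidate):
            return candidate
    return remote_id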