
Commit ced4b6c

add test for Nemotron 3 Nano
Signed-off-by: noeyy-mino <174223378+noeyy-mino@users.noreply.github.com>
1 parent 73495eb commit ced4b6c

File tree

2 files changed: 64 additions, 3 deletions

tests/_test_utils/deploy_utils.py

Lines changed: 42 additions & 1 deletion
@@ -100,7 +100,40 @@ def _deploy_trtllm(self):
         spec_config = None
         llm = None
         kv_cache_config = KvCacheConfig(enable_block_reuse=True, free_gpu_memory_fraction=0.8)
-        if "eagle" in self.model_id.lower():
+
+        if self.model_id == "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8":
+            llm = LLM(
+                model=self.model_id,
+                tensor_parallel_size=self.tensor_parallel_size,
+                enable_attention_dp=False,
+                attn_backend=self.attn_backend,
+                trust_remote_code=True,
+                max_batch_size=8,
+                kv_cache_config=KvCacheConfig(
+                    enable_block_reuse=False,
+                    mamba_ssm_cache_dtype="float32",
+                ),
+            )
+        elif self.model_id == "nvidia/EAGLE3-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16":
+            spec_config = EagleDecodingConfig(
+                max_draft_len=3,
+                speculative_model_dir=self.model_id,
+                eagle3_one_model=self.eagle3_one_model,
+            )
+            llm = LLM(
+                model=self.model_id,
+                tensor_parallel_size=self.tensor_parallel_size,
+                enable_attention_dp=False,
+                attn_backend=self.attn_backend,
+                trust_remote_code=True,
+                max_batch_size=8,
+                speculative_config=spec_config,
+                kv_cache_config=KvCacheConfig(
+                    enable_block_reuse=False,
+                    mamba_ssm_cache_dtype="float32",
+                ),
+            )
+        elif "eagle" in self.model_id.lower():
             spec_config = EagleDecodingConfig(
                 max_draft_len=3,
                 speculative_model_dir=self.model_id,
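Note: the new first branch gives the Nemotron 3 Nano FP8 checkpoint a dedicated TRT-LLM configuration: KV block reuse is turned off and the Mamba SSM state cache is kept in float32, which the mamba_ssm_cache_dtype option suggests is needed because the model mixes Mamba and attention layers. Below is a minimal standalone sketch of the same setup, assuming TensorRT-LLM's Python LLM API behaves as the diff uses it; the attn_backend argument, which the test takes from its fixture, is omitted here.

    # Hedged sketch only: assumes TensorRT-LLM's LLM API accepts the same
    # keyword arguments used in the diff; the prompt is illustrative.
    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import KvCacheConfig

    llm = LLM(
        model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
        tensor_parallel_size=1,
        trust_remote_code=True,
        max_batch_size=8,
        kv_cache_config=KvCacheConfig(
            enable_block_reuse=False,         # no block reuse for the hybrid cache
            mamba_ssm_cache_dtype="float32",  # keep SSM state in full precision
        ),
    )
    for out in llm.generate(["Hello, my name is"]):
        print(out.outputs[0].text)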
@@ -197,6 +230,14 @@ def _deploy_sglang(self):
                 mem_fraction_static=0.7,
                 context_length=1024,
             )
+        elif self.model_id == "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8":
+            llm = sgl.Engine(
+                model_path=self.model_id,
+                quantization=quantization_method,
+                tp_size=self.tensor_parallel_size,
+                trust_remote_code=True,
+                attention_backend="flashinfer",
+            )
         else:
             llm = sgl.Engine(
                 model_path=self.model_id,
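The SGLang branch mirrors the TRT-LLM one: the FP8 checkpoint gets its own sgl.Engine pinned to the FlashInfer attention backend. A hedged standalone sketch follows, assuming SGLang's offline Engine API; "modelopt" is an assumed stand-in for the quantization_method the test resolves elsewhere.

    # Hedged sketch: assumes SGLang's offline Engine API; "modelopt" is an
    # assumed stand-in for the quantization_method from the test utilities.
    import sglang as sgl

    llm = sgl.Engine(
        model_path="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
        quantization="modelopt",         # assumption, see note above
        tp_size=1,
        trust_remote_code=True,
        attention_backend="flashinfer",  # matches the branch in the diff
    )
    print(llm.generate("Hello, my name is", {"max_new_tokens": 32}))
    llm.shutdown()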

tests/examples/llm_ptq/test_deploy.py

Lines changed: 22 additions & 2 deletions
@@ -386,6 +386,13 @@ def test_kimi(command):
            tensor_parallel_size=8,
            mini_sm=89,
        ),
+        *ModelDeployerList(
+            model_id="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+            attn_backend="FLASHINFER",
+        ),
     ],
     ids=idfn,
 )
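ModelDeployerList appears to fan a single model spec out into one parametrized case per backend, so this entry adds three deployments (trtllm, vllm, sglang) of the FP8 checkpoint at tensor-parallel size 1, gated on SM 8.9+ GPUs. A hypothetical sketch of that expansion; the real helper lives in the repo's test utilities and its fields and signature may differ.

    # Hypothetical sketch of a ModelDeployerList-style expansion; not the
    # repo's actual implementation.
    from dataclasses import dataclass


    @dataclass
    class ModelDeployer:  # hypothetical stand-in for the repo's deployer type
        model_id: str
        backend: str
        tensor_parallel_size: int = 1
        mini_sm: int = 0
        attn_backend: str | None = None


    def model_deployer_list(backend, **kwargs):
        # One deployer per backend, sharing all other fields.
        return [ModelDeployer(backend=b, **kwargs) for b in backend]


    cases = model_deployer_list(
        backend=("trtllm", "vllm", "sglang"),
        model_id="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
        tensor_parallel_size=1,
        mini_sm=89,
        attn_backend="FLASHINFER",
    )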
@@ -486,7 +493,8 @@ def test_medusa(command):
        *ModelDeployerList(
            base_model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
            model_id="nvidia/EAGLE3-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
-            backend=("trtllm", "sglang"),
+            backend=("trtllm", "vllm", "sglang"),
+            eagle3_one_model=False,
            tensor_parallel_size=8,
            mini_sm=89,
        ),
@@ -501,4 +509,16 @@ def test_medusa(command):
     ids=idfn,
 )
 def test_eagle(command):
-    command.run()
+    """Skip the test if MODELOPT_LOCAL_MODEL_ROOT is set but the model doesn't
+    exist locally; speculative models should be loaded from a local path."""
+    local_root = os.getenv("MODELOPT_LOCAL_MODEL_ROOT")
+    if not local_root:
+        return
+
+    local_path = os.path.join(local_root, command.model_id)
+    if os.path.isdir(local_path):
+        # Update model_id to use the local path
+        command.model_id = local_path
+        command.run()
+    else:
+        pytest.skip(f"Local model not found: {local_path}")
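The rewritten test_eagle resolves EAGLE3 speculative checkpoints from a local model root before running: with MODELOPT_LOCAL_MODEL_ROOT set, the checkpoint must exist under that root or the test is skipped; without it, the test is a no-op. The same resolution as a standalone helper, a hedged sketch only (resolve_local_model is a hypothetical name, not part of the repo):

    # Hedged sketch: resolve_local_model is a hypothetical helper mirroring
    # the test's logic; it is not part of the repository.
    import os

    import pytest


    def resolve_local_model(model_id: str) -> str | None:
        """Return the local checkpoint path for model_id, or None when no
        local root is configured (the caller then skips running)."""
        local_root = os.getenv("MODELOPT_LOCAL_MODEL_ROOT")
        if not local_root:
            return None
        local_path = os.path.join(local_root, model_id)
        if not os.path.isdir(local_path):
            pytest.skip(f"Local model not found: {local_path}")
        return local_path

Running with, e.g., MODELOPT_LOCAL_MODEL_ROOT=/opt/models would then load /opt/models/nvidia/EAGLE3-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 from disk.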
