
Commit 80f779c

Merge branch 'main' into fix/granite4-example-updates

2 parents: 6e389e2 + 560bb9c
File tree: 7 files changed (+80, -54 lines)

src/llmcompressor/modeling/qwen3_next_moe.py

Lines changed: 7 additions & 3 deletions

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 # coding=utf-8
 # Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
 # All rights reserved.
@@ -13,19 +15,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import TYPE_CHECKING
 
 import torch
 
 from llmcompressor.modeling.moe_context import MoECalibrationModule
 
-
-@MoECalibrationModule.register("Qwen3NextSparseMoeBlock")
-class CalibrationQwen3NextSparseMoeBlock(MoECalibrationModule):
+if TYPE_CHECKING:
     from transformers import Qwen3NextConfig
     from transformers.models.qwen3_next.modeling_qwen3_next import (
         Qwen3NextSparseMoeBlock,
     )
 
+
+@MoECalibrationModule.register("Qwen3NextSparseMoeBlock")
+class CalibrationQwen3NextSparseMoeBlock(MoECalibrationModule):
     """
     Calibration version of Qwen3NextSparseMoeBlock that sends all tokens to all experts.
     """

tests/e2e/e2e_utils.py

Lines changed: 11 additions & 13 deletions

@@ -10,15 +10,12 @@
 from tests.testing_utils import process_dataset
 
 
-@log_time
-def _load_model_and_processor(
-    model: str,
-    model_class: str,
-):
+def load_model(model: str, model_class: str, device_map: str | None = None):
     pretrained_model_class = getattr(transformers, model_class)
-    loaded_model = pretrained_model_class.from_pretrained(model, torch_dtype="auto")
-    processor = AutoProcessor.from_pretrained(model)
-    return loaded_model, processor
+    loaded_model = pretrained_model_class.from_pretrained(
+        model, torch_dtype="auto", device_map=device_map
+    )
+    return loaded_model
 
 
 @log_time
@@ -41,9 +38,8 @@ def run_oneshot_for_e2e_testing(
     # Load model.
     oneshot_kwargs = {}
 
-    loaded_model, processor = _load_model_and_processor(
-        model=model, model_class=model_class
-    )
+    loaded_model = load_model(model=model, model_class=model_class)
+    processor = AutoProcessor.from_pretrained(model)
 
     if dataset_id:
         ds = load_dataset(dataset_id, name=dataset_config, split=dataset_split)
@@ -88,11 +84,13 @@ def data_collator(batch):
             targets="Linear",
             scheme=scheme,
             actorder=None,  # added for consistency with past testing configs
-            ignore=["lm_head"],
+            ignore=["lm_head", "re:.*mlp.gate[.].*"],
        )
    else:
        oneshot_kwargs["recipe"] = QuantizationModifier(
-            targets="Linear", scheme=scheme, ignore=["lm_head"]
+            targets="Linear",
+            scheme=scheme,
+            ignore=["lm_head", "re:.*mlp.gate[.].*"],
        )
 
    # Apply quantization.
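Both recipe branches now ignore `re:.*mlp.gate[.].*` in addition to `lm_head`, keeping MoE router gate layers unquantized; the `re:` prefix marks the remainder of the string as a regular expression over layer names. A standalone illustration of the pattern's mechanics (the names below are made up, and a `re.match`-style check is assumed):

import re

pattern = r".*mlp.gate[.].*"  # the part after the "re:" prefix

names = [
    "model.layers.0.mlp.gate.weight",       # matched: "[.]" finds the literal dot after "gate"
    "model.layers.0.mlp.gate_proj.weight",  # not matched: "_" follows "gate", not a dot
    "lm_head.weight",                       # handled by the plain "lm_head" entry instead
]

for name in names:
    matched = re.match(pattern, name) is not None
    print(f"{name}: {'ignored' if matched else 'quantized'}")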
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: Qwen/Qwen3-30B-A3B
+scheme: NVFP4
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+num_calibration_samples: 20
Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+cadence: "nightly"
+test_type: "regression"
+model: Qwen/Qwen3-30B-A3B
+scheme: FP8_DYNAMIC
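Both new nightly configs exercise Qwen/Qwen3-30B-A3B: the NVFP4 run calibrates on 20 samples of ultrachat_200k, while FP8_DYNAMIC quantizes without calibration data, which is presumably why the second file omits the dataset fields. A sketch of how a test harness might consume such a config (the loading code, filename, and defaults are assumptions, not taken from this repo):

import yaml

with open("qwen3_fp8_dynamic.yaml") as f:  # illustrative path; the real filenames are not shown above
    cfg = yaml.safe_load(f)

model = cfg["model"]                                   # "Qwen/Qwen3-30B-A3B"
scheme = cfg["scheme"]                                 # "NVFP4" or "FP8_DYNAMIC"
dataset_id = cfg.get("dataset_id")                     # None for the data-free FP8_DYNAMIC config
num_samples = cfg.get("num_calibration_samples", 512)  # fallback value is an assumption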

tests/e2e/vLLM/run_vllm.py

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ def parse_args():
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON input: {e}")
 
-    if "W4A16_2of4" in scheme:
+    if scheme is not None and "W4A16_2of4" in scheme:
        # required by the kernel
        llm_kwargs["dtype"] = torch.float16
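The one-line fix guards against `scheme` being None, presumably because it can be absent from the parsed arguments; Python's `in` operator raises on None rather than returning False:

scheme = None
# "W4A16_2of4" in scheme  # would raise TypeError: argument of type 'NoneType' is not iterable
if scheme is not None and "W4A16_2of4" in scheme:  # `and` short-circuits before `in` runs
    ...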

tests/llmcompressor/modeling/test_calib_qwen3_next.py

Lines changed: 17 additions & 5 deletions

@@ -11,9 +11,22 @@
 from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
 from tests.testing_utils import requires_cadence, requires_gpu
 
+try:
+    from transformers import Qwen3NextConfig
+    from transformers.models.qwen3_next.modeling_qwen3_next import (
+        Qwen3NextSparseMoeBlock,
+    )
+except ImportError:
+    Qwen3NextConfig = None
+    Qwen3NextSparseMoeBlock = None
+
 
 @requires_cadence("weekly")
 @pytest.mark.parametrize("model_stub", ["Qwen/Qwen3-Next-80B-A3B-Instruct"])
+@pytest.mark.skipif(
+    Qwen3NextConfig is None,
+    reason="Qwen3Next not available in this version of transformers",
+)
 def test_calib_replace_qwen3moe_all_experts(model_stub):
     with skip_weights_download():
         model = AutoModelForCausalLM.from_pretrained(model_stub)

@@ -60,12 +73,11 @@ def hook_fn(i, module, input, output):
 
 
 @requires_gpu
+@pytest.mark.skipif(
+    Qwen3NextConfig is None,
+    reason="Qwen3Next not available in this version of transformers",
+)
 def test_calib_qwen3_moe_module():
-    from transformers import Qwen3NextConfig
-    from transformers.models.qwen3_next.modeling_qwen3_next import (
-        Qwen3NextSparseMoeBlock,
-    )
-
     config = Qwen3NextConfig()
     with torch.device("cuda"):
         original = Qwen3NextSparseMoeBlock(config).eval()
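Hoisting the imports out of the test body into a guarded try/except lets a module-level sentinel drive `pytest.mark.skipif`, so the file still imports cleanly on transformers versions that predate Qwen3Next. The same pattern in generic form (package and names hypothetical):

import pytest

try:
    from somepackage import OptionalFeature  # hypothetical optional dependency
except ImportError:
    OptionalFeature = None  # sentinel consulted by the skipif marker below


@pytest.mark.skipif(
    OptionalFeature is None,
    reason="OptionalFeature not available in this environment",
)
def test_uses_optional_feature():
    # collected everywhere, executed only where the import succeeded,
    # so the suite passes against older dependency versions
    assert OptionalFeature is not None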

tests/lmeval/test_lmeval.py

Lines changed: 33 additions & 32 deletions

@@ -13,7 +13,7 @@
 from pydantic import BaseModel
 
 from llmcompressor.core import active_session
-from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing
+from tests.e2e.e2e_utils import load_model, run_oneshot_for_e2e_testing
 from tests.test_timer.timer_utils import get_singleton_manager, log_time
 from tests.testing_utils import requires_gpu
 

@@ -35,6 +35,10 @@ class LmEvalConfig(BaseModel):
 
 try:
     import lm_eval
+    import lm_eval.api.registry
+
+    # needed to populate model registry
+    import lm_eval.models  # noqa
 
     lm_eval_installed = True
 except ImportError:

@@ -120,7 +124,7 @@ def test_lm_eval(self, test_data_file: str):
 
         # Always evaluate base model for recovery testing
         logger.info("================= Evaluating BASE model ======================")
-        self.base_results = self._eval_base_model()
+        base_results = self._eval_base_model()
 
         if not self.save_dir:
             self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"

@@ -145,22 +149,41 @@
         self._handle_recipe()
 
         logger.info("================= Running LM Eval on COMPRESSED model ==========")
-        self._run_lm_eval()
+        compressed_results = self._eval_compressed_model()
+
+        # Always use recovery testing
+        self._validate_recovery(base_results, compressed_results)
+
+        # If absolute metrics provided, show warnings (not failures)
+        if self.lmeval.metrics:
+            self._check_absolute_warnings(compressed_results)
 
         self.tear_down()
 
     @log_time
-    def _eval_base_model(self):
+    def _eval_base_model(self) -> dict:
         """Evaluate the base (uncompressed) model."""
-        model_args = {**self.lmeval.model_args, "pretrained": self.model}
+        return self._eval_model(self.model)
+
+    @log_time
+    def _eval_compressed_model(self) -> dict:
+        """Evaluate the compressed model."""
+        return self._eval_model(self.save_dir)
+
+    def _eval_model(self, model: str) -> dict:
+        # NOTE: pass in PreTrainedModel to avoid lm_eval's model-loading logic
+        # https://github.com/EleutherAI/lm-evaluation-harness/pull/3393
+        lm_eval_cls = lm_eval.api.registry.get_model(self.lmeval.model)
 
         results = lm_eval.simple_evaluate(
-            model=self.lmeval.model,
-            model_args=model_args,
+            model=lm_eval_cls(
+                pretrained=load_model(model, self.model_class, device_map="cuda:0"),
+                batch_size=self.lmeval.batch_size,
+                **self.lmeval.model_args,
+            ),
             tasks=[self.lmeval.task],
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,
-            device="cuda:0",
             apply_chat_template=self.lmeval.apply_chat_template,
             batch_size=self.lmeval.batch_size,
         )

@@ -181,31 +204,9 @@ def _handle_recipe(self):
             fp.write(recipe_yaml_str)
         session.reset()
 
-    @log_time
-    def _run_lm_eval(self):
-        model_args = {"pretrained": self.save_dir}
-        model_args.update(self.lmeval.model_args)
-        results = lm_eval.simple_evaluate(
-            model=self.lmeval.model,
-            model_args=model_args,
-            tasks=[self.lmeval.task],
-            num_fewshot=self.lmeval.num_fewshot,
-            limit=self.lmeval.limit,
-            device="cuda:0",
-            apply_chat_template=self.lmeval.apply_chat_template,
-            batch_size=self.lmeval.batch_size,
-        )
-
-        # Always use recovery testing
-        self._validate_recovery(results)
-
-        # If absolute metrics provided, show warnings (not failures)
-        if self.lmeval.metrics:
-            self._check_absolute_warnings(results)
-
-    def _validate_recovery(self, compressed_results):
+    def _validate_recovery(self, base_results, compressed_results):
         """Validate using recovery testing - compare against base model."""
-        base_metrics = self.base_results["results"][self.lmeval.task]
+        base_metrics = base_results["results"][self.lmeval.task]
         compressed_metrics = compressed_results["results"][self.lmeval.task]
         higher_is_better_map = compressed_results.get("higher_is_better", {}).get(
             self.lmeval.task, {}
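This refactor threads `base_results` through explicitly instead of stashing them on `self`, and replaces string-based model loading with an explicitly constructed wrapper: `lm_eval.api.registry.get_model` resolves the registered harness class, which is instantiated around an already-loaded `PreTrainedModel`, so base and compressed evaluations share one code path and recovery testing becomes the only hard gate. A rough sketch of what a recovery check like `_validate_recovery` could do; the 95% threshold and the flat metric dicts are assumptions, not taken from the diff:

def check_recovery(
    base_metrics: dict[str, float],
    compressed_metrics: dict[str, float],
    higher_is_better: dict[str, bool],
    threshold: float = 0.95,  # assumed: retain at least 95% of the base score
) -> None:
    for name, base_value in base_metrics.items():
        if name not in compressed_metrics or base_value == 0:
            continue  # skip metrics missing from either side
        ratio = compressed_metrics[name] / base_value
        if not higher_is_better.get(name, True):
            # for lower-is-better metrics (e.g. perplexity), invert the ratio
            ratio = 1.0 / ratio if ratio else float("inf")
        assert ratio >= threshold, f"{name}: recovery {ratio:.3f} below {threshold}"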
