From 18e9bfc725ebbfdb09e34d7aee11381c9166afdf Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 4 Mar 2026 11:28:09 -0800 Subject: [PATCH 1/2] Bump transformers to 5.0 Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- examples/gpt-oss/requirements.txt | 1 - examples/llm_distill/requirements.txt | 1 - examples/speculative_decoding/requirements.txt | 3 +-- examples/vlm_ptq/requirements-vila.txt | 3 --- modelopt/torch/__init__.py | 2 +- modelopt/torch/speculative/plugins/transformers.py | 14 ++------------ pyproject.toml | 2 +- tests/_test_utils/torch/transformers_models.py | 9 ++------- tests/unit/torch/quantization/plugins/test_peft.py | 4 ---- tox.ini | 2 +- 10 files changed, 8 insertions(+), 33 deletions(-) delete mode 100644 examples/vlm_ptq/requirements-vila.txt diff --git a/examples/gpt-oss/requirements.txt b/examples/gpt-oss/requirements.txt index 368097d33..76c3b0a2e 100644 --- a/examples/gpt-oss/requirements.txt +++ b/examples/gpt-oss/requirements.txt @@ -1,5 +1,4 @@ kernels>=0.9.0 torch>2.7.1 trackio -transformers>=4.55.0 trl>=0.21.0 diff --git a/examples/llm_distill/requirements.txt b/examples/llm_distill/requirements.txt index 91dda9daf..4bcd19083 100644 --- a/examples/llm_distill/requirements.txt +++ b/examples/llm_distill/requirements.txt @@ -1,4 +1,3 @@ pyarrow torchao>=0.14.1 -transformers<5.0 trl>=0.23.0 diff --git a/examples/speculative_decoding/requirements.txt b/examples/speculative_decoding/requirements.txt index 6324bac62..8e50f9c3f 100644 --- a/examples/speculative_decoding/requirements.txt +++ b/examples/speculative_decoding/requirements.txt @@ -1,2 +1 @@ -accelerate==1.12.0 -transformers==5.0.0rc1 +transformers>=5.0 diff --git a/examples/vlm_ptq/requirements-vila.txt b/examples/vlm_ptq/requirements-vila.txt deleted file mode 100644 index 7391a5f26..000000000 --- a/examples/vlm_ptq/requirements-vila.txt +++ /dev/null @@ -1,3 +0,0 @@ -deepspeed>=0.16.0 -git+https://github.com/bfshi/scaling_on_scales.git -transformers<=4.50.0 diff --git a/modelopt/torch/__init__.py b/modelopt/torch/__init__.py index bc42b82a6..be4a336a8 100644 --- a/modelopt/torch/__init__.py +++ b/modelopt/torch/__init__.py @@ -32,7 +32,7 @@ try: from transformers import __version__ as _transformers_version - if not (_Version("4.53") <= _Version(_transformers_version) < _Version("5.0")): + if not (_Version("4.55") <= _Version(_transformers_version)): _warnings.warn( f"transformers version {_transformers_version} is not tested with nvidia-modelopt and may cause issues. " "Please install recommended version with `pip install nvidia-modelopt[hf]` if working with HF models.", diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py index c21594afe..f35b2ca91 100644 --- a/modelopt/torch/speculative/plugins/transformers.py +++ b/modelopt/torch/speculative/plugins/transformers.py @@ -35,8 +35,6 @@ from typing import Any import torch -import transformers -from packaging.version import Version from torch import nn from torch.nn import CrossEntropyLoss from torch.nn.attention.flex_attention import BlockMask, create_block_mask @@ -77,14 +75,6 @@ CACHED_SHARD_TTT_MASKS = {} -def _get_empty_cache(config): - """Return an empty cache. Handle different versions of transformers for unit tests.""" - if Version(transformers.__version__) >= Version("4.54"): - return DynamicCache(config=config) - else: - return DynamicCache() - - @MedusaDMRegistry.register({PreTrainedModel: "hf.PreTrainedModel"}) class HFMedusaModel(MedusaModel): """Medusa Model Class for huggingface models.""" @@ -908,9 +898,9 @@ def forward( ) if not isinstance(past_key_values, Cache): - past_key_values = _get_empty_cache(self._base_llm_config) + past_key_values = DynamicCache(config=self._base_llm_config) if not isinstance(eagle_cache, Cache): - eagle_cache = _get_empty_cache(self.eagle_module.config) + eagle_cache = DynamicCache(config=self.eagle_module.config) past_key_values.eagle_cache = eagle_cache # ====Prepare inputs for the first eagle forward pass==== diff --git a/pyproject.toml b/pyproject.toml index 92ba9cb5e..3d910bb16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,7 @@ hf = [ "nltk", "peft>=0.17.0", "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export - "transformers>=4.53,<5.0", # Should match modelopt/torch/__init__.py and tox.ini + "transformers>=4.55", # Should match modelopt/torch/__init__.py and tox.ini "wonderwords", ] dev-lint = [ diff --git a/tests/_test_utils/torch/transformers_models.py b/tests/_test_utils/torch/transformers_models.py index 927ed0ea4..7812f2bd9 100644 --- a/tests/_test_utils/torch/transformers_models.py +++ b/tests/_test_utils/torch/transformers_models.py @@ -19,13 +19,14 @@ import pytest import torch from _test_utils.torch.misc import set_seed -from packaging.version import Version transformers = pytest.importorskip("transformers") from transformers import ( AutoTokenizer, BertConfig, BertForQuestionAnswering, + GptOssConfig, + GptOssForCausalLM, LlamaConfig, LlamaForCausalLM, Qwen3Config, @@ -37,9 +38,6 @@ T5Tokenizer, ) -if Version(transformers.__version__) >= Version("4.55"): - from transformers import GptOssConfig, GptOssForCausalLM - import modelopt.torch.opt as mto SEED = 1234 @@ -141,9 +139,6 @@ def get_tiny_t5(**config_kwargs) -> T5ForConditionalGeneration: def get_tiny_gpt_oss(**config_kwargs) -> "GptOssForCausalLM": set_seed(SEED) - if Version(transformers.__version__) < Version("4.55"): - pytest.skip("GptOssForCausalLM is not supported in transformers < 4.55") - kwargs = { "num_hidden_layers": 4, "num_local_experts": 8, diff --git a/tests/unit/torch/quantization/plugins/test_peft.py b/tests/unit/torch/quantization/plugins/test_peft.py index 7077801a4..a318de09a 100644 --- a/tests/unit/torch/quantization/plugins/test_peft.py +++ b/tests/unit/torch/quantization/plugins/test_peft.py @@ -16,7 +16,6 @@ import pytest import torch from _test_utils.torch.transformers_models import get_tiny_gpt_oss, get_tiny_llama, tf_output_tester -from packaging.version import Version pytest.importorskip("peft") transformers = pytest.importorskip("transformers") @@ -54,9 +53,6 @@ def test_convert_loralinear(): tf_output_tester(model_ref, model_test) -@pytest.mark.skipif( - Version(transformers.__version__) < Version("4.55"), reason="transformers < 4.55" -) def test_peft_flow(tmp_path): model_original = get_tiny_gpt_oss(num_hidden_layers=1) diff --git a/tox.ini b/tox.ini index e1918b077..a0cd1834d 100644 --- a/tox.ini +++ b/tox.ini @@ -24,7 +24,7 @@ deps = -e .[all,dev-test] # Should match pyproject.toml - tf_min: transformers~=4.53.0 + tf_min: transformers~=4.55.0 commands = python -m pytest tests/unit {env:COV_ARGS:} From 3e28adaab776da7a597a154d8671c21c2fcfb077 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Fri, 6 Mar 2026 08:15:33 -0800 Subject: [PATCH 2/2] Fix Bert Gradnas tracing for transformers 5.0 Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- modelopt/torch/trace/plugins/transformers.py | 58 ++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/modelopt/torch/trace/plugins/transformers.py b/modelopt/torch/trace/plugins/transformers.py index f07a37601..02e70741c 100644 --- a/modelopt/torch/trace/plugins/transformers.py +++ b/modelopt/torch/trace/plugins/transformers.py @@ -15,7 +15,10 @@ """Utilities to describe symbols in the dynamic attention module.""" +import torch +from packaging.version import Version as _Version from torch import nn +from transformers import __version__ as _transformers_version from transformers.models.bert.modeling_bert import BertAttention from transformers.models.gptj.modeling_gptj import GPTJAttention @@ -56,3 +59,58 @@ def get_hf_attn_sym_info_sortable(mod: nn.Module) -> SymInfo: @SymMap.register([GPTJAttention]) def get_hf_attn_sym_info_unsortable(mod: nn.Module) -> SymInfo: return get_hf_attn_sym_info(sortable_attn=True) + + +# In transformers>=5.0, BertLayer.forward uses tuple unpacking on the BertAttention output +# (e.g. `self_attn_out, _ = self.attention(...)`), which FX symbolic tracing cannot handle when +# BertAttention is a registered leaf (the proxy is not iterable). Patch BertLayer.forward to use +# indexing instead, and call feed_forward_chunk directly (equivalent to apply_chunking_to_forward +# with chunk_size=0, which is the default for BERT). +if _Version(_transformers_version) >= _Version("5.0"): + from transformers.models.bert.modeling_bert import BertLayer as _BertLayer + + def _fx_friendly_bert_layer_forward( + self, + hidden_states: torch.Tensor, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + cache_position=None, + **kwargs, + ): + # Use indexing instead of tuple-unpacking so FX can trace through BertLayer + # when BertAttention is a registered leaf (returns an opaque Proxy). + # Accept **kwargs so that a parent trace (e.g. BertEncoder) passing extra kwargs + # like position_ids does not mark BertLayer as failed. However, do NOT forward + # **kwargs into self.attention: FX represents **kwargs as a Proxy(_kwargs), so + # unpacking it with ** would trigger "Proxy cannot be iterated". Additionally, + # BertSelfAttention ignores these kwargs (e.g. position_ids) in practice. + _attn_outputs = self.attention( + hidden_states, + attention_mask, + past_key_values=past_key_values, + cache_position=cache_position, + ) + attention_output = _attn_outputs[0] + + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with" + " cross-attention layers by setting `config.add_cross_attention=True`" + ) + _cross_outputs = self.crossattention( + attention_output, + None, + encoder_hidden_states, + encoder_attention_mask, + past_key_values=past_key_values, + ) + attention_output = _cross_outputs[0] + + # Call feed_forward_chunk directly (equivalent to apply_chunking_to_forward when + # chunk_size_feed_forward=0, which is the BERT default). + return self.feed_forward_chunk(attention_output) + + _BertLayer.forward = _fx_friendly_bert_layer_forward