From 18e9bfc725ebbfdb09e34d7aee11381c9166afdf Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Wed, 4 Mar 2026 11:28:09 -0800
Subject: [PATCH 1/2] Allow transformers 5.x and raise minimum to 4.55

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 examples/gpt-oss/requirements.txt                  |  1 -
 examples/llm_distill/requirements.txt              |  1 -
 examples/speculative_decoding/requirements.txt     |  3 +--
 examples/vlm_ptq/requirements-vila.txt             |  3 ---
 modelopt/torch/__init__.py                         |  2 +-
 modelopt/torch/speculative/plugins/transformers.py | 14 ++------------
 pyproject.toml                                     |  2 +-
 tests/_test_utils/torch/transformers_models.py     |  9 ++-------
 tests/unit/torch/quantization/plugins/test_peft.py |  4 ----
 tox.ini                                            |  2 +-
 10 files changed, 8 insertions(+), 33 deletions(-)
 delete mode 100644 examples/vlm_ptq/requirements-vila.txt

diff --git a/examples/gpt-oss/requirements.txt b/examples/gpt-oss/requirements.txt
index 368097d33..76c3b0a2e 100644
--- a/examples/gpt-oss/requirements.txt
+++ b/examples/gpt-oss/requirements.txt
@@ -1,5 +1,4 @@
 kernels>=0.9.0
 torch>2.7.1
 trackio
-transformers>=4.55.0
 trl>=0.21.0
diff --git a/examples/llm_distill/requirements.txt b/examples/llm_distill/requirements.txt
index 91dda9daf..4bcd19083 100644
--- a/examples/llm_distill/requirements.txt
+++ b/examples/llm_distill/requirements.txt
@@ -1,4 +1,3 @@
 pyarrow
 torchao>=0.14.1
-transformers<5.0
 trl>=0.23.0
diff --git a/examples/speculative_decoding/requirements.txt b/examples/speculative_decoding/requirements.txt
index 6324bac62..8e50f9c3f 100644
--- a/examples/speculative_decoding/requirements.txt
+++ b/examples/speculative_decoding/requirements.txt
@@ -1,2 +1 @@
-accelerate==1.12.0
-transformers==5.0.0rc1
+transformers>=5.0
diff --git a/examples/vlm_ptq/requirements-vila.txt b/examples/vlm_ptq/requirements-vila.txt
deleted file mode 100644
index 7391a5f26..000000000
--- a/examples/vlm_ptq/requirements-vila.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-deepspeed>=0.16.0
-git+https://github.com/bfshi/scaling_on_scales.git
-transformers<=4.50.0
diff --git a/modelopt/torch/__init__.py b/modelopt/torch/__init__.py
index bc42b82a6..be4a336a8 100644
--- a/modelopt/torch/__init__.py
+++ b/modelopt/torch/__init__.py
@@ -32,7 +32,7 @@
 try:
     from transformers import __version__ as _transformers_version
 
-    if not (_Version("4.53") <= _Version(_transformers_version) < _Version("5.0")):
+    if not (_Version("4.55") <= _Version(_transformers_version)):
         _warnings.warn(
             f"transformers version {_transformers_version} is not tested with nvidia-modelopt and may cause issues. "
             "Please install recommended version with `pip install nvidia-modelopt[hf]` if working with HF models.",
diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py
index c21594afe..f35b2ca91 100644
--- a/modelopt/torch/speculative/plugins/transformers.py
+++ b/modelopt/torch/speculative/plugins/transformers.py
@@ -35,8 +35,6 @@
 from typing import Any
 
 import torch
-import transformers
-from packaging.version import Version
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.attention.flex_attention import BlockMask, create_block_mask
@@ -77,14 +75,6 @@
 CACHED_SHARD_TTT_MASKS = {}
 
 
-def _get_empty_cache(config):
-    """Return an empty cache. Handle different versions of transformers for unit tests."""
-    if Version(transformers.__version__) >= Version("4.54"):
-        return DynamicCache(config=config)
-    else:
-        return DynamicCache()
-
-
 @MedusaDMRegistry.register({PreTrainedModel: "hf.PreTrainedModel"})
 class HFMedusaModel(MedusaModel):
     """Medusa Model Class for huggingface models."""
@@ -908,9 +898,9 @@ def forward(
             )
 
         if not isinstance(past_key_values, Cache):
-            past_key_values = _get_empty_cache(self._base_llm_config)
+            past_key_values = DynamicCache(config=self._base_llm_config)
         if not isinstance(eagle_cache, Cache):
-            eagle_cache = _get_empty_cache(self.eagle_module.config)
+            eagle_cache = DynamicCache(config=self.eagle_module.config)
         past_key_values.eagle_cache = eagle_cache
 
         # ====Prepare inputs for the first eagle forward pass====
diff --git a/pyproject.toml b/pyproject.toml
index 92ba9cb5e..3d910bb16 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,7 +75,7 @@ hf = [
     "nltk",
     "peft>=0.17.0",
     "sentencepiece>=0.2.1",                                                           # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export
-    "transformers>=4.53,<5.0",                                                        # Should match modelopt/torch/__init__.py and tox.ini
+    "transformers>=4.55",                                                             # Should match modelopt/torch/__init__.py and tox.ini
     "wonderwords",
 ]
 dev-lint = [
diff --git a/tests/_test_utils/torch/transformers_models.py b/tests/_test_utils/torch/transformers_models.py
index 927ed0ea4..7812f2bd9 100644
--- a/tests/_test_utils/torch/transformers_models.py
+++ b/tests/_test_utils/torch/transformers_models.py
@@ -19,13 +19,14 @@
 import pytest
 import torch
 from _test_utils.torch.misc import set_seed
-from packaging.version import Version
 
 transformers = pytest.importorskip("transformers")
 from transformers import (
     AutoTokenizer,
     BertConfig,
     BertForQuestionAnswering,
+    GptOssConfig,
+    GptOssForCausalLM,
     LlamaConfig,
     LlamaForCausalLM,
     Qwen3Config,
@@ -37,9 +38,6 @@
     T5Tokenizer,
 )
 
-if Version(transformers.__version__) >= Version("4.55"):
-    from transformers import GptOssConfig, GptOssForCausalLM
-
 import modelopt.torch.opt as mto
 
 SEED = 1234
@@ -141,9 +139,6 @@ def get_tiny_t5(**config_kwargs) -> T5ForConditionalGeneration:
 
 def get_tiny_gpt_oss(**config_kwargs) -> "GptOssForCausalLM":
     set_seed(SEED)
-    if Version(transformers.__version__) < Version("4.55"):
-        pytest.skip("GptOssForCausalLM is not supported in transformers < 4.55")
-
     kwargs = {
         "num_hidden_layers": 4,
         "num_local_experts": 8,
diff --git a/tests/unit/torch/quantization/plugins/test_peft.py b/tests/unit/torch/quantization/plugins/test_peft.py
index 7077801a4..a318de09a 100644
--- a/tests/unit/torch/quantization/plugins/test_peft.py
+++ b/tests/unit/torch/quantization/plugins/test_peft.py
@@ -16,7 +16,6 @@
 import pytest
 import torch
 from _test_utils.torch.transformers_models import get_tiny_gpt_oss, get_tiny_llama, tf_output_tester
-from packaging.version import Version
 
 pytest.importorskip("peft")
 transformers = pytest.importorskip("transformers")
@@ -54,9 +53,6 @@ def test_convert_loralinear():
     tf_output_tester(model_ref, model_test)
 
 
-@pytest.mark.skipif(
-    Version(transformers.__version__) < Version("4.55"), reason="transformers < 4.55"
-)
 def test_peft_flow(tmp_path):
     model_original = get_tiny_gpt_oss(num_hidden_layers=1)
 
diff --git a/tox.ini b/tox.ini
index e1918b077..a0cd1834d 100644
--- a/tox.ini
+++ b/tox.ini
@@ -24,7 +24,7 @@ deps =
     -e .[all,dev-test]
 
     # Should match pyproject.toml
-    tf_min: transformers~=4.53.0
+    tf_min: transformers~=4.55.0
 commands =
     python -m pytest tests/unit {env:COV_ARGS:}
 

From 3e28adaab776da7a597a154d8671c21c2fcfb077 Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Fri, 6 Mar 2026 08:15:33 -0800
Subject: [PATCH 2/2] Fix BERT GradNAS tracing for transformers 5.0

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 modelopt/torch/trace/plugins/transformers.py | 58 ++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/modelopt/torch/trace/plugins/transformers.py b/modelopt/torch/trace/plugins/transformers.py
index f07a37601..02e70741c 100644
--- a/modelopt/torch/trace/plugins/transformers.py
+++ b/modelopt/torch/trace/plugins/transformers.py
@@ -15,7 +15,10 @@
 
 """Utilities to describe symbols in the dynamic attention module."""
 
+import torch
+from packaging.version import Version as _Version
 from torch import nn
+from transformers import __version__ as _transformers_version
 from transformers.models.bert.modeling_bert import BertAttention
 from transformers.models.gptj.modeling_gptj import GPTJAttention
 
@@ -56,3 +59,58 @@ def get_hf_attn_sym_info_sortable(mod: nn.Module) -> SymInfo:
 @SymMap.register([GPTJAttention])
 def get_hf_attn_sym_info_unsortable(mod: nn.Module) -> SymInfo:
     return get_hf_attn_sym_info(sortable_attn=True)
+
+
+# In transformers>=5.0, BertLayer.forward uses tuple unpacking on the BertAttention output
+# (e.g. `self_attn_out, _ = self.attention(...)`), which FX symbolic tracing cannot handle when
+# BertAttention is a registered leaf (the proxy is not iterable). Patch BertLayer.forward to use
+# indexing instead, and call feed_forward_chunk directly (equivalent to apply_chunking_to_forward
+# with chunk_size_feed_forward=0, the BERT default; a non-zero chunk size is not honored here).
+if _Version(_transformers_version) >= _Version("5.0"):
+    from transformers.models.bert.modeling_bert import BertLayer as _BertLayer
+
+    def _fx_friendly_bert_layer_forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        cache_position=None,
+        **kwargs,
+    ):
+        # Use indexing instead of tuple-unpacking so FX can trace through BertLayer
+        # when BertAttention is a registered leaf (returns an opaque Proxy).
+        # Accept **kwargs so that a parent trace (e.g. BertEncoder) passing extra kwargs
+        # like position_ids does not mark BertLayer as failed. However, do NOT forward
+        # **kwargs into self.attention: FX represents **kwargs as a Proxy(_kwargs), so
+        # unpacking it with ** would trigger "Proxy cannot be iterated". Additionally,
+        # BertSelfAttention ignores these kwargs (e.g. position_ids) in practice.
+        _attn_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+        )
+        attention_output = _attn_outputs[0]
+
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with"
+                    " cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+            _cross_outputs = self.crossattention(
+                attention_output,
+                None,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                past_key_values=past_key_values,
+            )
+            attention_output = _cross_outputs[0]
+
+        # Call feed_forward_chunk directly (equivalent to apply_chunking_to_forward when
+        # chunk_size_feed_forward=0, which is the BERT default).
+        return self.feed_forward_chunk(attention_output)
+
+    _BertLayer.forward = _fx_friendly_bert_layer_forward