From f2531d1762dc906456d9b10093591973b95e0d03 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz
Date: Thu, 29 Jan 2026 20:34:43 +0000
Subject: [PATCH] Add transformers v5 compatibility

---
 constraints-dev.txt                         |  8 ++++----
 requirements.txt                            |  2 +-
 src/instructlab/training/data_process.py    |  4 ++--
 src/instructlab/training/main_ds.py         |  3 +++
 src/instructlab/training/tokenizer_utils.py | 14 ++++++--------
 5 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/constraints-dev.txt b/constraints-dev.txt
index 33c66013..f45e171a 100644
--- a/constraints-dev.txt
+++ b/constraints-dev.txt
@@ -48,11 +48,11 @@ gitdb==4.0.12             # via gitpython
 gitpython==3.1.45         # via wandb
 grpcio==1.74.0            # via tensorboard
 h11==0.16.0               # via httpcore
-hf-xet==1.1.9             # via huggingface-hub
+hf-xet==1.2.0             # via huggingface-hub
 hjson==3.1.0              # via deepspeed
 httpcore==1.0.9           # via httpx
 httpx==0.28.1             # via jupyterlab
-huggingface-hub==0.34.4   # via accelerate, datasets, peft, tokenizers, transformers, -r requirements-dev.txt
+huggingface-hub==1.3.4    # via accelerate, datasets, peft, tokenizers, transformers, -r requirements-dev.txt
 identify==2.6.13          # via pre-commit
 idna==3.10                # via anyio, httpx, jsonschema, requests, yarl
 iniconfig==2.1.0          # via pytest
@@ -181,7 +181,7 @@ tensorboard==2.20.0       # via -r requirements-dev.txt
 tensorboard-data-server==0.7.2        # via tensorboard
 terminado==0.18.1                     # via jupyter-server, jupyter-server-terminals
 tinycss2==1.4.0                       # via bleach
-tokenizers==0.22.0                    # via transformers
+tokenizers==0.22.2                    # via transformers
 tomlkit==0.13.3                       # via pylint
 torch==2.6.0                          # via accelerate, bitsandbytes, deepspeed, flash-attn, liger-kernel, peft, -c constraints-dev.txt.in, -r requirements.txt
 tornado==6.5.2                        # via ipykernel, jupyter-client, jupyter-server, jupyterlab, notebook, terminado
@@ -189,7 +189,7 @@ tox==4.29.0               # via tox-uv, -r requirements-dev.txt
 tox-uv==1.25.0                        # via -r requirements-dev.txt
 tqdm==4.67.1                          # via datasets, deepspeed, huggingface-hub, peft, transformers
 traitlets==5.14.3                     # via ipykernel, ipython, ipywidgets, jupyter-client, jupyter-console, jupyter-core, jupyter-events, jupyter-server, jupyterlab, matplotlib-inline, nbclient, nbconvert, nbformat
-transformers==4.56.0                  # via peft, trl, -r requirements.txt
+transformers==5.0.0                   # via peft, trl, -r requirements.txt
 triton==3.2.0                         # via liger-kernel, torch
 trl==0.22.1                           # via -r requirements.txt
 types-python-dateutil==2.9.0.20250822 # via arrow
diff --git a/requirements.txt b/requirements.txt
index 721dd914..8769aafe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@
 wheel>=0.43
 pyyaml
 py-cpuinfo
 torch>=2.6.0
-transformers>=4.55.0
+transformers>=5.0.0
 datasets>=2.15.0
 numba>=0.62.0
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index b0eaa268..01266639 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -393,7 +393,7 @@ def process_messages_into_input_ids_with_chat_template(args: DataProcessArgs):
 
     # Adding after tokenizer setup as these are temp tokens, not to be saved
     tokenizer.add_special_tokens(
-        {"additional_special_tokens": ["<|pretrain|>", "<|/pretrain|>", "<|MASK|>"]}
+        {"extra_special_tokens": ["<|pretrain|>", "<|/pretrain|>", "<|MASK|>"]}
     )
 
     try:
@@ -1300,7 +1300,7 @@ def configure_tokenizer(model_path: str) -> PreTrainedTokenizer:
     # Add special tokens for masking
     tokenizer.add_special_tokens(
         {
-            "additional_special_tokens": [
+            "extra_special_tokens": [
                 UNMASK_BEGIN_TOKEN,
                 UNMASK_END_TOKEN,
                 UNMASK_REASONING_BEGIN_TOKEN,
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index ff5cea7d..e05c9eae 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -9,6 +9,9 @@
 import time
 import warnings
 
+# Suppress verbose HTTP request logs from httpx (used by huggingface_hub in transformers v5+)
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
 try:
     # Third Party
     from deepspeed.ops.adam import DeepSpeedCPUAdam
diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py
index 2ef67fb9..ef9a92c7 100644
--- a/src/instructlab/training/tokenizer_utils.py
+++ b/src/instructlab/training/tokenizer_utils.py
@@ -18,19 +18,17 @@ def setup_tokenizer_with_existing_chat_template(
         # we need to set the padding token
         tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
 
-    # ensure the pad token is in the additional special tokens without duplicating anything else
+    # ensure the pad token is in the extra special tokens without duplicating anything else
     new_tokens = []
-    if tokenizer.pad_token not in tokenizer.additional_special_tokens:
+    if tokenizer.pad_token not in tokenizer.extra_special_tokens:
         new_tokens.append(tokenizer.pad_token)
-    if tokenizer.eos_token not in tokenizer.additional_special_tokens:
+    if tokenizer.eos_token not in tokenizer.extra_special_tokens:
         new_tokens.append(tokenizer.eos_token)
 
     # ensure the tokens are being sorted to prevent any issues
     new_tokens = sorted(new_tokens)
-    additional_special_tokens = tokenizer.additional_special_tokens + new_tokens
-    tokenizer.add_special_tokens(
-        {"additional_special_tokens": additional_special_tokens}
-    )
+    extra_special_tokens = tokenizer.extra_special_tokens + new_tokens
+    tokenizer.add_special_tokens({"extra_special_tokens": extra_special_tokens})
 
     # ensure the necessary tokens exist
     assert len(get_sp_token(tokenizer, tokenizer.pad_token)) == 1, (
@@ -57,7 +55,7 @@ def setup_tokenizer_from_new_chat_template(
         }
     )
     tokenizer.add_special_tokens(
-        {"additional_special_tokens": SPECIAL_TOKENS.get_tokens_to_add()}
+        {"extra_special_tokens": SPECIAL_TOKENS.get_tokens_to_add()}
     )
     if getattr(tokenizer, "add_bos_token", False) or getattr(
         tokenizer, "add_eos_token", False
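-- 
Note for reviewers (below the signature delimiter, so git-am ignores it):
every code hunk above is the same mechanical change. This patch targets
transformers v5's rename of the `additional_special_tokens` key and
tokenizer attribute to `extra_special_tokens`; the call shape is otherwise
unchanged. A minimal sketch of the pattern the patch relies on — the model
path and token string here are illustrative placeholders, not values from
the repo:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("path/to/model")
    # v4.x spelling: tokenizer.add_special_tokens({"additional_special_tokens": ["<|MASK|>"]})
    # v5.x spelling of the same operation:
    tokenizer.add_special_tokens({"extra_special_tokens": ["<|MASK|>"]})
    assert "<|MASK|>" in tokenizer.extra_special_tokens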