From f2531d1762dc906456d9b10093591973b95e0d03 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz
Date: Thu, 29 Jan 2026 20:34:43 +0000
Subject: [PATCH] Add transformers v5 compatibility

---
 constraints-dev.txt                         |  8 ++++----
 requirements.txt                            |  2 +-
 src/instructlab/training/data_process.py    |  4 ++--
 src/instructlab/training/main_ds.py         |  3 +++
 src/instructlab/training/tokenizer_utils.py | 14 ++++++--------
 5 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/constraints-dev.txt b/constraints-dev.txt
index 33c66013..f45e171a 100644
--- a/constraints-dev.txt
+++ b/constraints-dev.txt
@@ -48,11 +48,11 @@ gitdb==4.0.12             # via gitpython
 gitpython==3.1.45         # via wandb
 grpcio==1.74.0            # via tensorboard
 h11==0.16.0               # via httpcore
-hf-xet==1.1.9             # via huggingface-hub
+hf-xet==1.2.0             # via huggingface-hub
 hjson==3.1.0              # via deepspeed
 httpcore==1.0.9           # via httpx
 httpx==0.28.1             # via jupyterlab
-huggingface-hub==0.34.4   # via accelerate, datasets, peft, tokenizers, transformers, -r requirements-dev.txt
+huggingface-hub==1.3.4    # via accelerate, datasets, peft, tokenizers, transformers, -r requirements-dev.txt
 identify==2.6.13          # via pre-commit
 idna==3.10                # via anyio, httpx, jsonschema, requests, yarl
 iniconfig==2.1.0          # via pytest
@@ -181,7 +181,7 @@ tensorboard==2.20.0       # via -r requirements-dev.txt
 tensorboard-data-server==0.7.2        # via tensorboard
 terminado==0.18.1                     # via jupyter-server, jupyter-server-terminals
 tinycss2==1.4.0                       # via bleach
-tokenizers==0.22.0                    # via transformers
+tokenizers==0.22.2                    # via transformers
 tomlkit==0.13.3                       # via pylint
 torch==2.6.0                          # via accelerate, bitsandbytes, deepspeed, flash-attn, liger-kernel, peft, -c constraints-dev.txt.in, -r requirements.txt
 tornado==6.5.2                        # via ipykernel, jupyter-client, jupyter-server, jupyterlab, notebook, terminado
@@ -189,7 +189,7 @@ tox==4.29.0               # via tox-uv, -r requirements-dev.txt
 tox-uv==1.25.0                        # via -r requirements-dev.txt
 tqdm==4.67.1                          # via datasets, deepspeed, huggingface-hub, peft, transformers
 traitlets==5.14.3                     # via ipykernel, ipython, ipywidgets, jupyter-client, jupyter-console, jupyter-core, jupyter-events, jupyter-server, jupyterlab, matplotlib-inline, nbclient, nbconvert, nbformat
-transformers==4.56.0                  # via peft, trl, -r requirements.txt
+transformers==5.0.0                   # via peft, trl, -r requirements.txt
 triton==3.2.0                         # via liger-kernel, torch
 trl==0.22.1                           # via -r requirements.txt
 types-python-dateutil==2.9.0.20250822 # via arrow
diff --git a/requirements.txt b/requirements.txt
index 721dd914..8769aafe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@
 wheel>=0.43
 pyyaml
 py-cpuinfo
 torch>=2.6.0
-transformers>=4.55.0
+transformers>=5.0.0
 datasets>=2.15.0
 numba>=0.62.0
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index b0eaa268..01266639 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -393,7 +393,7 @@ def process_messages_into_input_ids_with_chat_template(args: DataProcessArgs):
 
     # Adding after tokenizer setup as these are temp tokens, not to be saved
     tokenizer.add_special_tokens(
-        {"additional_special_tokens": ["<|pretrain|>", "<|/pretrain|>", "<|MASK|>"]}
+        {"extra_special_tokens": ["<|pretrain|>", "<|/pretrain|>", "<|MASK|>"]}
     )
 
     try:
@@ -1300,7 +1300,7 @@ def configure_tokenizer(model_path: str) -> PreTrainedTokenizer:
     # Add special tokens for masking
     tokenizer.add_special_tokens(
         {
-            "additional_special_tokens": [
+            "extra_special_tokens": [
                 UNMASK_BEGIN_TOKEN,
                 UNMASK_END_TOKEN,
                 UNMASK_REASONING_BEGIN_TOKEN,
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index ff5cea7d..e05c9eae 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -9,6 +9,9 @@
 import time
 import warnings
 
+# Suppress verbose HTTP request logs from httpx (used by huggingface_hub in transformers v5+)
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
 try:
     # Third Party
     from deepspeed.ops.adam import DeepSpeedCPUAdam
diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py
index 2ef67fb9..ef9a92c7 100644
--- a/src/instructlab/training/tokenizer_utils.py
+++ b/src/instructlab/training/tokenizer_utils.py
@@ -18,19 +18,17 @@ def setup_tokenizer_with_existing_chat_template(
         # we need to set the padding token
         tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
 
-    # ensure the pad token is in the additional special tokens without duplicating anything else
+    # ensure the pad token is in the extra special tokens without duplicating anything else
     new_tokens = []
-    if tokenizer.pad_token not in tokenizer.additional_special_tokens:
+    if tokenizer.pad_token not in tokenizer.extra_special_tokens:
         new_tokens.append(tokenizer.pad_token)
-    if tokenizer.eos_token not in tokenizer.additional_special_tokens:
+    if tokenizer.eos_token not in tokenizer.extra_special_tokens:
         new_tokens.append(tokenizer.eos_token)
 
     # ensure the tokens are being sorted to prevent any issues
     new_tokens = sorted(new_tokens)
-    additional_special_tokens = tokenizer.additional_special_tokens + new_tokens
-    tokenizer.add_special_tokens(
-        {"additional_special_tokens": additional_special_tokens}
-    )
+    extra_special_tokens = tokenizer.extra_special_tokens + new_tokens
+    tokenizer.add_special_tokens({"extra_special_tokens": extra_special_tokens})
 
     # ensure the necessary tokens exist
     assert len(get_sp_token(tokenizer, tokenizer.pad_token)) == 1, (
@@ -57,7 +55,7 @@ def setup_tokenizer_from_new_chat_template(
         }
     )
     tokenizer.add_special_tokens(
-        {"additional_special_tokens": SPECIAL_TOKENS.get_tokens_to_add()}
+        {"extra_special_tokens": SPECIAL_TOKENS.get_tokens_to_add()}
     )
     if getattr(tokenizer, "add_bos_token", False) or getattr(
         tokenizer, "add_eos_token", False
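-- 
Note for reviewers (below the signature delimiter, so git-am ignores it):
every code hunk above is the same mechanical change. This patch targets
transformers v5's rename of the `additional_special_tokens` key and
tokenizer attribute to `extra_special_tokens`; the call shape is otherwise
unchanged. A minimal sketch of the pattern the patch relies on — the model
path and token string here are illustrative placeholders, not values from
the repo:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("path/to/model")
    # v4.x spelling: tokenizer.add_special_tokens({"additional_special_tokens": ["<|MASK|>"]})
    # v5.x spelling of the same operation:
    tokenizer.add_special_tokens({"extra_special_tokens": ["<|MASK|>"]})
    assert "<|MASK|>" in tokenizer.extra_special_tokens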