From 381ac9dc5158a0bff55857a7074a102a7afe78d1 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng
Date: Tue, 23 Dec 2025 06:08:39 +0000
Subject: [PATCH] patch dsv3 config when needed

Signed-off-by: Zhiyu Cheng
---
 examples/llm_ptq/example_utils.py | 21 +++++++++++++++++++++
 examples/llm_ptq/hf_ptq.py        |  9 +++++++++
 examples/llm_ptq/multinode_ptq.py | 10 +++++++++-
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index 4ac8b7f02..4823ad62e 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -274,6 +274,25 @@ def get_dtype(dtype):
     return dtype
 
 
+def maybe_patch_deepseek_v3_config(hf_config):
+    """Patch DeepSeek V3 config to add missing qk_head_dim attribute if needed.
+
+    Args:
+        hf_config: HuggingFace model config object
+
+    Returns:
+        The patched config object
+    """
+    if hf_config.model_type == "deepseek_v3" and not hasattr(hf_config, "qk_head_dim"):
+        if hasattr(hf_config, "qk_nope_head_dim") and hasattr(hf_config, "qk_rope_head_dim"):
+            hf_config.qk_head_dim = hf_config.qk_nope_head_dim + hf_config.qk_rope_head_dim
+            print(
+                f"Patched DeepSeek V3 config: qk_head_dim = {hf_config.qk_head_dim} "
+                f"(qk_nope_head_dim={hf_config.qk_nope_head_dim} + qk_rope_head_dim={hf_config.qk_rope_head_dim})"
+            )
+    return hf_config
+
+
 def get_model(
     ckpt_path,
     device="cuda",
@@ -301,6 +320,8 @@
     # Load config once and handle VL model detection
     try:
         hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs)
+        hf_config = maybe_patch_deepseek_v3_config(hf_config)
+
         if is_nemotron_vl(hf_config):
             print(
                 "Detected Nemotron VL model from config. "
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index a9862a742..c34af8a17 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -30,6 +30,7 @@
     get_tokenizer,
     is_enc_dec,
     is_nemotron_vl,
+    maybe_patch_deepseek_v3_config,
     run_nemotron_vl_preview,
 )
 from torch.utils.data import DataLoader
@@ -270,12 +271,20 @@ def load_model(args: argparse.Namespace):
         )
 
     # Do not use real quant GEMM so the calibration can be more accurate.
+    # Load and patch config for DeepSeek V3 before initializing the model
+    from transformers import AutoConfig
+
+    config_kwargs = {"trust_remote_code": args.trust_remote_code}
+    hf_config = AutoConfig.from_pretrained(args.pyt_ckpt_path, **config_kwargs)
+    hf_config = maybe_patch_deepseek_v3_config(hf_config)
+
     with init_quantized_weights(
         quant_cfg, gpu_mem_percentage=args.gpu_max_mem_percentage, quant_gemm=False
     ):
         model_kwargs = {"trust_remote_code": args.trust_remote_code}
         if args.attn_implementation is not None:
             model_kwargs["attn_implementation"] = args.attn_implementation
+        model_kwargs["config"] = hf_config
         full_model = AutoModelForCausalLM.from_pretrained(
             args.pyt_ckpt_path,
             **model_kwargs,
diff --git a/examples/llm_ptq/multinode_ptq.py b/examples/llm_ptq/multinode_ptq.py
index 2ae7dde4a..1fb76fd66 100644
--- a/examples/llm_ptq/multinode_ptq.py
+++ b/examples/llm_ptq/multinode_ptq.py
@@ -28,7 +28,7 @@
 import torch
 import torch.nn as nn
 from accelerate import Accelerator
-from example_utils import build_quant_cfg, get_tokenizer
+from example_utils import build_quant_cfg, get_tokenizer, maybe_patch_deepseek_v3_config
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, PreTrainedTokenizer, PreTrainedTokenizerFast
 
@@ -146,10 +146,18 @@ def load_and_prepare_model(
     Returns:
         Tuple of (prepared_model, model_type, original_architectures, calibration_dataloader)
     """
+    # Load and patch config for DeepSeek V3 before initializing the model
+    from transformers import AutoConfig
+
+    config_kwargs = {"trust_remote_code": trust_remote_code}
+    hf_config = AutoConfig.from_pretrained(model_path, **config_kwargs)
+    hf_config = maybe_patch_deepseek_v3_config(hf_config)
+
     model = AutoModelForCausalLM.from_pretrained(
         model_path,
         torch_dtype="auto",
         trust_remote_code=trust_remote_code,
+        config=hf_config,
    )
     model.eval()
     model_type = get_model_type(model)
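
Illustration (not part of the patch): a minimal sketch of what maybe_patch_deepseek_v3_config does, using a SimpleNamespace stand-in rather than a real DeepSeek-V3 checkpoint. The head-dim values below mirror the published DeepSeek-V3 config (qk_nope_head_dim=128, qk_rope_head_dim=64), and the import assumes the snippet is run from examples/llm_ptq so example_utils is importable.

    # Stand-in for an AutoConfig object that lacks qk_head_dim (illustrative values only).
    from types import SimpleNamespace

    from example_utils import maybe_patch_deepseek_v3_config

    cfg = SimpleNamespace(model_type="deepseek_v3", qk_nope_head_dim=128, qk_rope_head_dim=64)

    cfg = maybe_patch_deepseek_v3_config(cfg)  # adds qk_head_dim = 128 + 64 and prints a notice
    assert cfg.qk_head_dim == 192

    # Configs that already define qk_head_dim, or configs whose model_type is not
    # "deepseek_v3", are returned unchanged.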