From 381ac9dc5158a0bff55857a7074a102a7afe78d1 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng
Date: Tue, 23 Dec 2025 06:08:39 +0000
Subject: [PATCH] patch dsv3 config when needed

Signed-off-by: Zhiyu Cheng
---
 examples/llm_ptq/example_utils.py | 21 +++++++++++++++++++++
 examples/llm_ptq/hf_ptq.py        |  9 +++++++++
 examples/llm_ptq/multinode_ptq.py | 10 +++++++++-
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index 4ac8b7f02..4823ad62e 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -274,6 +274,25 @@ def get_dtype(dtype):
     return dtype
 
 
+def maybe_patch_deepseek_v3_config(hf_config):
+    """Patch DeepSeek V3 config to add missing qk_head_dim attribute if needed.
+
+    Args:
+        hf_config: HuggingFace model config object
+
+    Returns:
+        The patched config object
+    """
+    if hf_config.model_type == "deepseek_v3" and not hasattr(hf_config, "qk_head_dim"):
+        if hasattr(hf_config, "qk_nope_head_dim") and hasattr(hf_config, "qk_rope_head_dim"):
+            hf_config.qk_head_dim = hf_config.qk_nope_head_dim + hf_config.qk_rope_head_dim
+            print(
+                f"Patched DeepSeek V3 config: qk_head_dim = {hf_config.qk_head_dim} "
+                f"(qk_nope_head_dim={hf_config.qk_nope_head_dim} + qk_rope_head_dim={hf_config.qk_rope_head_dim})"
+            )
+    return hf_config
+
+
 def get_model(
     ckpt_path,
     device="cuda",
@@ -301,6 +320,8 @@
     # Load config once and handle VL model detection
     try:
         hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs)
+        hf_config = maybe_patch_deepseek_v3_config(hf_config)
+
         if is_nemotron_vl(hf_config):
             print(
                 "Detected Nemotron VL model from config. "
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index a9862a742..c34af8a17 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -30,6 +30,7 @@
     get_tokenizer,
     is_enc_dec,
     is_nemotron_vl,
+    maybe_patch_deepseek_v3_config,
     run_nemotron_vl_preview,
 )
 from torch.utils.data import DataLoader
@@ -270,12 +271,20 @@ def load_model(args: argparse.Namespace):
         )
 
     # Do not use real quant GEMM so the calibration can be more accurate.
+    # Load and patch config for DeepSeek V3 before initializing the model
+    from transformers import AutoConfig
+
+    config_kwargs = {"trust_remote_code": args.trust_remote_code}
+    hf_config = AutoConfig.from_pretrained(args.pyt_ckpt_path, **config_kwargs)
+    hf_config = maybe_patch_deepseek_v3_config(hf_config)
+
     with init_quantized_weights(
         quant_cfg, gpu_mem_percentage=args.gpu_max_mem_percentage, quant_gemm=False
     ):
         model_kwargs = {"trust_remote_code": args.trust_remote_code}
         if args.attn_implementation is not None:
             model_kwargs["attn_implementation"] = args.attn_implementation
+        model_kwargs["config"] = hf_config
         full_model = AutoModelForCausalLM.from_pretrained(
             args.pyt_ckpt_path,
             **model_kwargs,
diff --git a/examples/llm_ptq/multinode_ptq.py b/examples/llm_ptq/multinode_ptq.py
index 2ae7dde4a..1fb76fd66 100644
--- a/examples/llm_ptq/multinode_ptq.py
+++ b/examples/llm_ptq/multinode_ptq.py
@@ -28,7 +28,7 @@
 import torch
 import torch.nn as nn
 from accelerate import Accelerator
-from example_utils import build_quant_cfg, get_tokenizer
+from example_utils import build_quant_cfg, get_tokenizer, maybe_patch_deepseek_v3_config
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, PreTrainedTokenizer, PreTrainedTokenizerFast
 
@@ -146,10 +146,18 @@ def load_and_prepare_model(
     Returns:
         Tuple of (prepared_model, model_type, original_architectures, calibration_dataloader)
     """
+    # Load and patch config for DeepSeek V3 before initializing the model
+    from transformers import AutoConfig
+
+    config_kwargs = {"trust_remote_code": trust_remote_code}
+    hf_config = AutoConfig.from_pretrained(model_path, **config_kwargs)
+    hf_config = maybe_patch_deepseek_v3_config(hf_config)
+
     model = AutoModelForCausalLM.from_pretrained(
         model_path,
         torch_dtype="auto",
         trust_remote_code=trust_remote_code,
+        config=hf_config,
    )
     model.eval()
     model_type = get_model_type(model)
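
Illustration (not part of the patch): a minimal sketch of what maybe_patch_deepseek_v3_config does, using a SimpleNamespace stand-in rather than a real DeepSeek-V3 checkpoint. The head-dim values below mirror the published DeepSeek-V3 config (qk_nope_head_dim=128, qk_rope_head_dim=64), and the import assumes the snippet is run from examples/llm_ptq so example_utils is importable.

    # Stand-in for an AutoConfig object that lacks qk_head_dim (illustrative values only).
    from types import SimpleNamespace

    from example_utils import maybe_patch_deepseek_v3_config

    cfg = SimpleNamespace(model_type="deepseek_v3", qk_nope_head_dim=128, qk_rope_head_dim=64)

    cfg = maybe_patch_deepseek_v3_config(cfg)  # adds qk_head_dim = 128 + 64 and prints a notice
    assert cfg.qk_head_dim == 192

    # Configs that already define qk_head_dim, or configs whose model_type is not
    # "deepseek_v3", are returned unchanged.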