From c52df5dabf9234dc24cd7d49f6d3ca11d1801a1c Mon Sep 17 00:00:00 2001
From: "huanghaian@pjlab.org.cn"
Date: Tue, 27 Jan 2026 10:07:16 +0000
Subject: [PATCH 1/2] qwen3vl support get_model_config_from_hf

---
 tests/model/test_qwen3_vl.py                  | 20 +++++--
 xtuner/v1/model/__init__.py                   |  5 +-
 xtuner/v1/model/base.py                       | 19 ++++++-
 .../compose/intern_s1/intern_s1_config.py     |  3 +-
 xtuner/v1/model/compose/qwen3_vl/__init__.py  |  3 +-
 .../model/compose/qwen3_vl/qwen3_vl_config.py | 45 +++++++++++++--
 xtuner/v1/model/dense/qwen2.py                | 15 ++---
 xtuner/v1/model/dense/qwen3.py                | 26 +++++----
 xtuner/v1/model/dense/qwen3vl_text.py         |  7 ++-
 xtuner/v1/model/moe/deepseek_v3.py            | 13 +++--
 xtuner/v1/model/moe/gpt_oss.py                | 12 ++--
 xtuner/v1/model/moe/qwen3.py                  | 22 +++++---
 xtuner/v1/model/moe/qwen3vl_text.py           |  7 ++-
 xtuner/v1/utils/test_utils.py                 | 56 ++++++++++++++++++-
 14 files changed, 200 insertions(+), 53 deletions(-)

diff --git a/tests/model/test_qwen3_vl.py b/tests/model/test_qwen3_vl.py
index 777974513..016a7e88e 100644
--- a/tests/model/test_qwen3_vl.py
+++ b/tests/model/test_qwen3_vl.py
@@ -21,6 +21,8 @@
     MixedPrecisionPolicy,
     fully_shard,
 )
+from xtuner.v1.model import get_model_config_from_hf
+from xtuner.v1.utils.test_utils import compare_pydantic_models

 QWEN3_VL_MOE_PATH = os.environ["QWEN3_VL_MOE_PATH"]
 QWEN3_VL_DENSE_PATH = os.environ["QWEN3_VL_DENSE_PATH"]
@@ -186,9 +188,14 @@ def test_qwen3vl_run(self, device, sp_size, tol):
             device_map="cpu"
         ).eval()
         patch_hf_rms_norm(hf_model)
-
+
         with torch.device("meta"):
-            model_cfg = Qwen3VLDense4BConfig(compile_cfg=False)
+            model_cfg = get_model_config_from_hf(QWEN3_VL_DENSE_PATH)
+            model_cfg.compile_cfg = False
+
+            model_cfg_origin = Qwen3VLDense4BConfig(compile_cfg=False)
+            self.assertTrue(compare_pydantic_models(model_cfg, model_cfg_origin))
+
             qwen3vl_model = model_cfg.build().to(torch.bfloat16)
             qwen3vl_model.from_hf(QWEN3_VL_DENSE_PATH)
@@ -220,9 +227,12 @@ def test_fsdp_qwen3_run(self, device, sp_size, compile, tol):
         patch_hf_rms_norm(hf_model)

         with torch.device("meta"):
-            model_cfg = Qwen3VLDense4BConfig()
+            model_cfg = get_model_config_from_hf(QWEN3_VL_DENSE_PATH)
+            model_cfg_origin = Qwen3VLDense4BConfig()
             if compile is False:
                 model_cfg.compile_cfg = False
+                model_cfg_origin.compile_cfg = False
+            self.assertTrue(compare_pydantic_models(model_cfg, model_cfg_origin))
             qwen3vl_model = model_cfg.build().to(torch.bfloat16)

         fsdp_config = FSDPConfig(
@@ -265,7 +275,9 @@ def test_save_hf(self, device, tp_size):
         self.create_pg(device)
         with torch.device("meta"):
-            model_cfg = Qwen3VLMoE30BA3Config()
+            model_cfg = get_model_config_from_hf(QWEN3_VL_MOE_PATH)
+            model_cfg_origin = Qwen3VLMoE30BA3Config()
+            self.assertTrue(compare_pydantic_models(model_cfg, model_cfg_origin))
             qwen3vl_model = model_cfg.build().to(torch.bfloat16)

         fsdp_config = FSDPConfig(
diff --git a/xtuner/v1/model/__init__.py b/xtuner/v1/model/__init__.py
index 3f77cd9b8..b94550479 100644
--- a/xtuner/v1/model/__init__.py
+++ b/xtuner/v1/model/__init__.py
@@ -12,13 +12,14 @@
     InternVLBaseConfig,
 )
 from .compose.qwen3_vl import (
+    Qwen3VLBaseConfig,
     Qwen3VLDense4BConfig,
     Qwen3VLDense8BConfig,
     Qwen3VLMoE30BA3Config,
     Qwen3VLMoE235BA22Config,
 )
 from .dense.dense import Dense
-from .dense.qwen2 import Qwen2Dense7BConfig, Qwen2DenseConfig
+from .dense.qwen2 import Qwen2DenseConfig
 from .dense.qwen3 import Qwen3Dense0P6BConfig, Qwen3Dense4BConfig, Qwen3Dense8BConfig, Qwen3DenseConfig
 from .moe.deepseek_v3 import DeepSeekV3Config
 from .moe.gpt_oss import GptOss21BA3P6Config, GptOss117BA5P8Config, GptOssConfig
@@ -61,6 +62,8 @@ def get_model_config_from_hf(model_path: Path):
         return GptOssConfig.from_hf(model_path)
     elif cfg.model_type == "deepseek_v3":
         return DeepSeekV3Config.from_hf(model_path)
+    elif cfg.model_type in ("qwen3_vl_moe", "qwen3_vl"):
+        return Qwen3VLBaseConfig.from_hf(model_path)
     else:
         raise ValueError(f"Unsupported model type: {cfg.model_type}")
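With the dispatch above, a caller can go from a HuggingFace checkpoint directory straight to an XTuner config without naming the concrete class. A minimal usage sketch (the checkpoint path is a placeholder):

    from xtuner.v1.model import get_model_config_from_hf

    # The returned type depends on `model_type` in the checkpoint's config.json,
    # e.g. Qwen3VLBaseConfig for "qwen3_vl" / "qwen3_vl_moe".
    model_cfg = get_model_config_from_hf("/path/to/Qwen3-VL-4B-Instruct")
    model_cfg.compile_cfg = False  # fields can still be overridden before building
    model = model_cfg.build()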
""" raise NotImplementedError - + @property def hf_config(self) -> PretrainedConfig | None: """HuggingFace configuration.""" diff --git a/xtuner/v1/model/compose/intern_s1/intern_s1_config.py b/xtuner/v1/model/compose/intern_s1/intern_s1_config.py index 7cdcf6d11..cb99e0be4 100644 --- a/xtuner/v1/model/compose/intern_s1/intern_s1_config.py +++ b/xtuner/v1/model/compose/intern_s1/intern_s1_config.py @@ -5,6 +5,7 @@ from pydantic import ConfigDict from typing_extensions import Self +from transformers import PretrainedConfig from xtuner.v1.model.base import XTunerBaseModelConfig from xtuner.v1.model.dense.qwen3 import Qwen3Dense8BConfig from xtuner.v1.model.moe.moe import MoEConfig, TransformerConfig @@ -113,7 +114,7 @@ def build(self) -> "InternS1ForConditionalGeneration": return InternS1ForConditionalGeneration(self) @classmethod - def from_hf(cls, hf_path: str | Path) -> Self: + def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self: raise NotImplementedError @property diff --git a/xtuner/v1/model/compose/qwen3_vl/__init__.py b/xtuner/v1/model/compose/qwen3_vl/__init__.py index 5e5011262..664fd7819 100644 --- a/xtuner/v1/model/compose/qwen3_vl/__init__.py +++ b/xtuner/v1/model/compose/qwen3_vl/__init__.py @@ -1,5 +1,5 @@ from .modeling_qwen3_vl import Qwen3VLForConditionalGeneration -from .qwen3_vl_config import Qwen3VLDense4BConfig, Qwen3VLDense8BConfig, Qwen3VLMoE30BA3Config, Qwen3VLMoE235BA22Config +from .qwen3_vl_config import Qwen3VLBaseConfig, Qwen3VLDense4BConfig, Qwen3VLDense8BConfig, Qwen3VLMoE30BA3Config, Qwen3VLMoE235BA22Config __all__ = [ @@ -8,4 +8,5 @@ "Qwen3VLDense4BConfig", "Qwen3VLDense8BConfig", "Qwen3VLMoE235BA22Config", + "Qwen3VLBaseConfig", ] diff --git a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py index 235b65560..ad5e026a2 100644 --- a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py +++ b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py @@ -4,13 +4,16 @@ from mmengine import is_installed from pydantic import ConfigDict from typing_extensions import Self +from pydantic import SerializeAsAny from xtuner.v1.model.base import TransformerConfig, XTunerBaseModelConfig from xtuner.v1.model.dense.qwen3vl_text import Qwen3VLTextDense4BConfig, Qwen3VLTextDense8BConfig -from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config, Qwen3MoE235BA22Config -from xtuner.v1.model.moe.qwen3vl_text import Qwen3VLTextMoE30BA3Config, Qwen3VLTextMoE235BA22Config +from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config, Qwen3MoE235BA22Config, Qwen3MoEConfig +from xtuner.v1.model.dense.qwen3vl_text import Qwen3VLTextBaseConfig +from xtuner.v1.model.moe.qwen3vl_text import Qwen3VLTextMoE30BA3Config, Qwen3VLTextMoE235BA22Config, Qwen3VLTextMoEBaseConfig from xtuner.v1.module.rope import RopeScalingConfig from xtuner.v1.utils import get_device, get_logger +from transformers import AutoConfig, PretrainedConfig from ..base import BaseComposeConfig @@ -78,7 +81,7 @@ class Qwen3VLBaseConfig(BaseComposeConfig): ) vision_config: Qwen3VLVisionConfig projector_config: Qwen3VLProjectorConfig - text_config: TransformerConfig + text_config: SerializeAsAny[TransformerConfig] image_token_id: int = 151655 video_token_id: int = 151656 @@ -95,8 +98,36 @@ def build(self): return Qwen3VLForConditionalGeneration(self) @classmethod - def from_hf(cls, hf_path: str | Path) -> Self: - raise NotImplementedError + def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = 
diff --git a/xtuner/v1/model/compose/intern_s1/intern_s1_config.py b/xtuner/v1/model/compose/intern_s1/intern_s1_config.py
index 7cdcf6d11..cb99e0be4 100644
--- a/xtuner/v1/model/compose/intern_s1/intern_s1_config.py
+++ b/xtuner/v1/model/compose/intern_s1/intern_s1_config.py
@@ -5,6 +5,7 @@
 from pydantic import ConfigDict
 from typing_extensions import Self

+from transformers import PretrainedConfig
 from xtuner.v1.model.base import XTunerBaseModelConfig
 from xtuner.v1.model.dense.qwen3 import Qwen3Dense8BConfig
 from xtuner.v1.model.moe.moe import MoEConfig, TransformerConfig
@@ -113,7 +114,7 @@ def build(self) -> "InternS1ForConditionalGeneration":
         return InternS1ForConditionalGeneration(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
         raise NotImplementedError

     @property
diff --git a/xtuner/v1/model/compose/qwen3_vl/__init__.py b/xtuner/v1/model/compose/qwen3_vl/__init__.py
index 5e5011262..664fd7819 100644
--- a/xtuner/v1/model/compose/qwen3_vl/__init__.py
+++ b/xtuner/v1/model/compose/qwen3_vl/__init__.py
@@ -1,5 +1,5 @@
 from .modeling_qwen3_vl import Qwen3VLForConditionalGeneration
-from .qwen3_vl_config import Qwen3VLDense4BConfig, Qwen3VLDense8BConfig, Qwen3VLMoE30BA3Config, Qwen3VLMoE235BA22Config
+from .qwen3_vl_config import Qwen3VLBaseConfig, Qwen3VLDense4BConfig, Qwen3VLDense8BConfig, Qwen3VLMoE30BA3Config, Qwen3VLMoE235BA22Config


 __all__ = [
@@ -8,4 +8,5 @@
     "Qwen3VLDense4BConfig",
     "Qwen3VLDense8BConfig",
     "Qwen3VLMoE235BA22Config",
+    "Qwen3VLBaseConfig",
 ]
diff --git a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
index 235b65560..ad5e026a2 100644
--- a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
+++ b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
@@ -4,13 +4,16 @@
 from mmengine import is_installed
-from pydantic import ConfigDict
+from pydantic import ConfigDict, SerializeAsAny
 from typing_extensions import Self

 from xtuner.v1.model.base import TransformerConfig, XTunerBaseModelConfig
-from xtuner.v1.model.dense.qwen3vl_text import Qwen3VLTextDense4BConfig, Qwen3VLTextDense8BConfig
-from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config, Qwen3MoE235BA22Config
-from xtuner.v1.model.moe.qwen3vl_text import Qwen3VLTextMoE30BA3Config, Qwen3VLTextMoE235BA22Config
+from xtuner.v1.model.dense.qwen3vl_text import Qwen3VLTextBaseConfig, Qwen3VLTextDense4BConfig, Qwen3VLTextDense8BConfig
+from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config, Qwen3MoE235BA22Config, Qwen3MoEConfig
+from xtuner.v1.model.moe.qwen3vl_text import Qwen3VLTextMoE30BA3Config, Qwen3VLTextMoE235BA22Config, Qwen3VLTextMoEBaseConfig
 from xtuner.v1.module.rope import RopeScalingConfig
 from xtuner.v1.utils import get_device, get_logger
+from transformers import AutoConfig, PretrainedConfig

 from ..base import BaseComposeConfig
@@ -78,7 +81,7 @@ class Qwen3VLBaseConfig(BaseComposeConfig):
     )
     vision_config: Qwen3VLVisionConfig
     projector_config: Qwen3VLProjectorConfig
-    text_config: TransformerConfig
+    text_config: SerializeAsAny[TransformerConfig]

     image_token_id: int = 151655
     video_token_id: int = 151656
@@ -95,8 +98,36 @@ def build(self):
         return Qwen3VLForConditionalGeneration(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
-        raise NotImplementedError
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
+        if hf_config is None:
+            assert hf_path is not None, "Either hf_path or hf_config must be provided."
+            hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
+
+        hf_vision_config = hf_config.vision_config
+        vision_config = Qwen3VLVisionConfig(
+            depth=hf_vision_config.depth,
+            hidden_size=hf_vision_config.hidden_size,
+            intermediate_size=hf_vision_config.intermediate_size,
+            num_attention_heads=hf_vision_config.num_heads,
+            deepstack_visual_indexes=hf_vision_config.deepstack_visual_indexes,
+        )
+        projector_config = Qwen3VLProjectorConfig(
+            vision_hidden_size=hf_vision_config.hidden_size,
+            text_hidden_size=hf_config.text_config.hidden_size,
+            deepstack_visual_indexes=hf_vision_config.deepstack_visual_indexes,
+        )
+
+        if hf_config.model_type == "qwen3_vl_moe":
+            text_config = Qwen3VLTextMoEBaseConfig.from_hf(hf_config=hf_config.text_config)
+        elif hf_config.model_type == "qwen3_vl":
+            text_config = Qwen3VLTextBaseConfig.from_hf(hf_config=hf_config.text_config)
+        else:
+            raise ValueError(f"Unsupported model type: {hf_config.model_type}")
+
+        config = cls(
+            vision_config=vision_config,
+            projector_config=projector_config,
+            text_config=text_config,
+            image_token_id=hf_config.image_token_id,
+            video_token_id=hf_config.video_token_id,
+            vision_start_token_id=hf_config.vision_start_token_id,
+            vision_end_token_id=hf_config.vision_end_token_id,
+        )
+        return config

     @property
     def hf_config(self):
@@ -114,6 +145,7 @@ class Qwen3VLMoE30BA3Config(Qwen3VLBaseConfig):
     vision_config: Qwen3VLVisionConfig = Qwen3VLVisionConfig()
     projector_config: Qwen3VLProjectorConfig = Qwen3VLProjectorConfig()
     text_config: Qwen3MoE30BA3Config = Qwen3VLTextMoE30BA3Config(
+        model_type="qwen3_vl_moe_text",
         max_position_embeddings=262144,
         rope_theta=5000000,
         rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=[24, 20, 20]),
@@ -124,6 +156,7 @@ class Qwen3VLMoE235BA22Config(Qwen3VLBaseConfig):
     vision_config: Qwen3VLVisionConfig = Qwen3VLVisionConfig()
     projector_config: Qwen3VLProjectorConfig = Qwen3VLProjectorConfig(text_hidden_size=4096)
     text_config: Qwen3MoE235BA22Config = Qwen3VLTextMoE235BA22Config(
+        model_type="qwen3_vl_moe_text",
         max_position_embeddings=262144,
         rope_theta=5000000,
         rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=[24, 20, 20]),
@@ -138,6 +171,7 @@ class Qwen3VLDense4BConfig(Qwen3VLBaseConfig):
         vision_hidden_size=1024, text_hidden_size=2560, deepstack_visual_indexes=[5, 11, 17]
     )
     text_config: Qwen3VLTextDense4BConfig = Qwen3VLTextDense4BConfig(
+        model_type="qwen3_vl_text",
         max_position_embeddings=262144,
         rope_theta=5000000,
         rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=[24, 20, 20]),
@@ -148,6 +182,7 @@ class Qwen3VLDense8BConfig(Qwen3VLBaseConfig):
     vision_config: Qwen3VLVisionConfig = Qwen3VLVisionConfig()
     projector_config: Qwen3VLProjectorConfig = Qwen3VLProjectorConfig(text_hidden_size=4096)
     text_config: Qwen3VLTextDense8BConfig = Qwen3VLTextDense8BConfig(
+        model_type="qwen3_vl_text",
         max_position_embeddings=262144,
         rope_theta=5000000,
         rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=[24, 20, 20]),
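The switch to SerializeAsAny is what makes the new comparison helper meaningful: without it, pydantic serializes `text_config` against the declared base type, so fields that only exist on the concrete subclass are dropped from `model_dump()`. A toy illustration (the classes are hypothetical, not from this repo):

    from pydantic import BaseModel, SerializeAsAny

    class Base(BaseModel):
        a: int = 1

    class Child(Base):
        b: int = 2

    class HolderPlain(BaseModel):
        cfg: Base

    class HolderAny(BaseModel):
        cfg: SerializeAsAny[Base]

    print(HolderPlain(cfg=Child()).model_dump())  # {'cfg': {'a': 1}} -- 'b' is lost
    print(HolderAny(cfg=Child()).model_dump())    # {'cfg': {'a': 1, 'b': 2}}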
diff --git a/xtuner/v1/model/dense/qwen2.py b/xtuner/v1/model/dense/qwen2.py
index 07884398b..c1e33768c 100644
--- a/xtuner/v1/model/dense/qwen2.py
+++ b/xtuner/v1/model/dense/qwen2.py
@@ -4,6 +4,7 @@
 import torch
 from typing_extensions import Self

+from transformers import PretrainedConfig
 from transformers.models.qwen2 import Qwen2Config as HFQwen2DenseConfig
 from xtuner.v1.model.base import TransformerConfig
 from xtuner.v1.module.attention import MHAConfig
@@ -36,13 +37,13 @@ def build(self) -> Qwen2Dense:
         return Qwen2Dense(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
-        from transformers import AutoConfig
-        from transformers.models.qwen2 import Qwen2Config as HFConfig
-
-        hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
-
-        assert isinstance(hf_config, HFConfig)
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
+        if hf_path is not None:
+            from transformers import AutoConfig
+            from transformers.models.qwen2 import Qwen2Config as HFConfig
+
+            hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
+            assert isinstance(hf_config, HFConfig)
+        assert hf_config is not None and isinstance(hf_config, PretrainedConfig)

         config = cls(
             vocab_size=hf_config.vocab_size,
diff --git a/xtuner/v1/model/dense/qwen3.py b/xtuner/v1/model/dense/qwen3.py
index 6d663735e..0525ad15d 100644
--- a/xtuner/v1/model/dense/qwen3.py
+++ b/xtuner/v1/model/dense/qwen3.py
@@ -4,9 +4,11 @@
 import torch
 from typing_extensions import Self

+from transformers import PretrainedConfig
 from transformers.models.qwen3 import Qwen3Config as HFQwen3DenseConfig
 from xtuner.v1.model.base import TransformerConfig
 from xtuner.v1.module.attention import MHAConfig
+from xtuner.v1.module.rope.rope import RopeScalingConfig

 from .dense import Dense
@@ -37,13 +39,13 @@ def build(self) -> Qwen3Dense:
         return Qwen3Dense(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
-        from transformers import AutoConfig
-        from transformers.models.qwen3 import Qwen3Config as HFConfig
-
-        hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
-
-        assert isinstance(hf_config, HFConfig)
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
+        if hf_path is not None:
+            from transformers import AutoConfig
+            from transformers.models.qwen3 import Qwen3Config as HFConfig
+
+            hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
+            assert isinstance(hf_config, HFConfig)
+        assert hf_config is not None and isinstance(hf_config, PretrainedConfig)

         config = cls(
             vocab_size=hf_config.vocab_size,
@@ -52,7 +54,7 @@ def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
             bos_token_id=hf_config.bos_token_id,
             eos_token_id=hf_config.eos_token_id,
             num_hidden_layers=hf_config.num_hidden_layers,
-            max_window_layers=hf_config.max_window_layers,
+            max_window_layers=getattr(hf_config, "max_window_layers", hf_config.num_hidden_layers),
             hidden_size=hf_config.hidden_size,
             intermediate_size=hf_config.intermediate_size,
             rms_norm_eps=hf_config.rms_norm_eps,
@@ -62,11 +64,15 @@ def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
                 num_attention_heads=hf_config.num_attention_heads,
                 num_key_value_heads=hf_config.num_key_value_heads,
                 head_dim=hf_config.head_dim,
-                sliding_window=hf_config.sliding_window,
+                sliding_window=getattr(hf_config, "sliding_window", 1024),
                 qk_norm=True,
             ),
-            use_sliding_window=hf_config.use_sliding_window,
+            model_type=hf_config.model_type,
+            use_sliding_window=getattr(hf_config, "use_sliding_window", False),
             tie_word_embeddings=hf_config.tie_word_embeddings,
+            rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=hf_config.rope_scaling["mrope_section"])
+            if getattr(hf_config, "rope_scaling", None) is not None
+            else None,
         )

         return config
diff --git a/xtuner/v1/model/dense/qwen3vl_text.py b/xtuner/v1/model/dense/qwen3vl_text.py
index 15e40c40b..4f214815b 100644
--- a/xtuner/v1/model/dense/qwen3vl_text.py
+++ b/xtuner/v1/model/dense/qwen3vl_text.py
@@ -6,7 +6,7 @@
 from xtuner.v1.loss import CELossContext
 from xtuner.v1.model.base import ModelOutputs

-from .qwen3 import Qwen3Dense, Qwen3Dense4BConfig, Qwen3Dense8BConfig
+from .qwen3 import Qwen3Dense, Qwen3Dense4BConfig, Qwen3Dense8BConfig, Qwen3DenseConfig


 class Qwen3VLTextDense(Qwen3Dense):
@@ -85,6 +85,11 @@ def forward(
         return ModelOutputs(**output)  # type: ignore[typeddict-item]


+class Qwen3VLTextBaseConfig(Qwen3DenseConfig):
+    def build(self) -> Qwen3VLTextDense:
+        return Qwen3VLTextDense(self)
+
+
 class Qwen3VLTextDense4BConfig(Qwen3Dense4BConfig):
     def build(self) -> Qwen3VLTextDense:
         return Qwen3VLTextDense(self)
diff --git a/xtuner/v1/model/moe/deepseek_v3.py b/xtuner/v1/model/moe/deepseek_v3.py
index 4157b3655..42f06f981 100644
--- a/xtuner/v1/model/moe/deepseek_v3.py
+++ b/xtuner/v1/model/moe/deepseek_v3.py
@@ -3,7 +3,7 @@
 import torch
 from typing_extensions import Self

-
+from transformers import PretrainedConfig
 from transformers.models.deepseek_v3 import DeepseekV3Config as HFDeepseekV3Config
 from xtuner.v1.model.moe.moe import BalancingLossConfig, MoEConfig, ZLossConfig
 from xtuner.v1.module.attention import MLAConfig
@@ -103,10 +103,13 @@ def build(self) -> DeepSeekV3:
         return DeepSeekV3(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
-        cfg = HFDeepseekV3Config.from_pretrained(hf_path)
-
-        assert isinstance(cfg, HFDeepseekV3Config)
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
+        if hf_path is not None:
+            cfg = HFDeepseekV3Config.from_pretrained(hf_path)
+            assert isinstance(cfg, HFDeepseekV3Config)
+        else:
+            cfg = hf_config
+        assert cfg is not None and isinstance(cfg, PretrainedConfig)

         config = cls(
             vocab_size=cfg.vocab_size,
diff --git a/xtuner/v1/model/moe/gpt_oss.py b/xtuner/v1/model/moe/gpt_oss.py
index 6de04bf13..c51f569e7 100644
--- a/xtuner/v1/model/moe/gpt_oss.py
+++ b/xtuner/v1/model/moe/gpt_oss.py
@@ -6,6 +6,7 @@
 from pydantic import computed_field
 from typing_extensions import Self

+from transformers import PretrainedConfig
 from transformers.models.gpt_oss import GptOssConfig as HFGptOssConfig
 from xtuner.v1.model.moe.moe import BalancingLossConfig, MoEConfig
 from xtuner.v1.module.attention import MHAConfig
@@ -132,10 +133,13 @@ def build(self) -> GptOss:
         return GptOss(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
-        cfg = HFGptOssConfig.from_pretrained(hf_path)
-
-        assert isinstance(cfg, HFGptOssConfig)
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
+        if hf_path is not None:
+            cfg = HFGptOssConfig.from_pretrained(hf_path)
+            assert isinstance(cfg, HFGptOssConfig)
+        else:
+            cfg = hf_config
+        assert cfg is not None and isinstance(cfg, PretrainedConfig)

         config = cls(
             hf_config=cfg,
diff --git a/xtuner/v1/model/moe/qwen3.py b/xtuner/v1/model/moe/qwen3.py
index 04ca80406..73f2bfd67 100644
--- a/xtuner/v1/model/moe/qwen3.py
+++ b/xtuner/v1/model/moe/qwen3.py
@@ -56,10 +56,11 @@ def build(self) -> Qwen3MoE:
         return Qwen3MoE(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
-        hf_config = HFQwen3MoeConfig.from_pretrained(hf_path)
-
-        assert isinstance(hf_config, HFQwen3MoeConfig)
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
+        if hf_path is not None:
+            hf_config = HFQwen3MoeConfig.from_pretrained(hf_path)
+            assert isinstance(hf_config, HFQwen3MoeConfig)
+        assert hf_config is not None and isinstance(hf_config, PretrainedConfig)

         config = cls(
             vocab_size=hf_config.vocab_size,
@@ -68,7 +69,7 @@ def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
             bos_token_id=hf_config.bos_token_id,
             eos_token_id=hf_config.eos_token_id,
             num_hidden_layers=hf_config.num_hidden_layers,
-            max_window_layers=getattr(hf_config, "max_window_layers"),
+            max_window_layers=getattr(hf_config, "max_window_layers", hf_config.num_hidden_layers),
             hidden_size=hf_config.hidden_size,
             intermediate_size=hf_config.intermediate_size,
             rms_norm_eps=hf_config.rms_norm_eps,
@@ -79,10 +80,10 @@ def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
                 num_attention_heads=hf_config.num_attention_heads,
                 num_key_value_heads=hf_config.num_key_value_heads,
                 head_dim=hf_config.head_dim,
-                sliding_window=hf_config.sliding_window,
+                sliding_window=getattr(hf_config, "sliding_window", 1024),
                 qk_norm=True,
             ),
-            use_sliding_window=hf_config.use_sliding_window,
+            use_sliding_window=getattr(hf_config, "use_sliding_window", False),
             tie_word_embeddings=hf_config.tie_word_embeddings,
             n_routed_experts=hf_config.num_experts,
             n_shared_experts=0,
@@ -93,11 +94,14 @@ def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
                 norm_topk_prob=hf_config.norm_topk_prob,
                 router_scaling_factor=1.0,
             ),
+            rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=hf_config.rope_scaling["mrope_section"])
+            if getattr(hf_config, "rope_scaling", None) is not None
+            else None,
             balancing_loss_cfg=BalancingLossConfig(),
         )

         return config
-
+
     @property
     def hf_config(self) -> HFQwen3MoeConfig:
         """HuggingFace configuration."""
@@ -201,7 +205,7 @@ class Qwen3MoEFoPEConfig(Qwen3MoEConfig):

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
         hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)

         assert isinstance(hf_config, PretrainedConfig) and hf_config.model_type == "qwen3_moe_fope"
diff --git a/xtuner/v1/model/moe/qwen3vl_text.py b/xtuner/v1/model/moe/qwen3vl_text.py
index 1996edcf7..aac1da171 100644
--- a/xtuner/v1/model/moe/qwen3vl_text.py
+++ b/xtuner/v1/model/moe/qwen3vl_text.py
@@ -8,7 +8,7 @@
 from xtuner.v1.utils.activation_offload import async_save_on_cpu

 from .moe import MoEModelOutputs
-from .qwen3 import Qwen3MoE, Qwen3MoE30BA3Config, Qwen3MoE235BA22Config
+from .qwen3 import Qwen3MoE, Qwen3MoE30BA3Config, Qwen3MoE235BA22Config, Qwen3MoEConfig


 class Qwen3VLTextMoE(Qwen3MoE):
@@ -221,6 +221,11 @@ def _forward(
         return MoEModelOutputs(**output)  # type: ignore[typeddict-item]


+class Qwen3VLTextMoEBaseConfig(Qwen3MoEConfig):
+    def build(self) -> Qwen3MoE:
+        return Qwen3VLTextMoE(self)
+
+
 class Qwen3VLTextMoE30BA3Config(Qwen3MoE30BA3Config):
     def build(self) -> Qwen3MoE:
         return Qwen3VLTextMoE(self)
dictionaries.""" + all_keys = set(d1.keys()) | set(d2.keys()) + + for key in all_keys: + current_path = f"{path}.{key}" if path else key + + if key not in d1: + differences.append(f"{current_path}: missing in first model") + elif key not in d2: + differences.append(f"{current_path}: missing in second model") + elif type(d1[key]) != type(d2[key]): + differences.append(f"{current_path}: type mismatch ({type(d1[key]).__name__} vs {type(d2[key]).__name__})") + elif isinstance(d1[key], dict): + _compare_dicts(d1[key], d2[key], current_path, differences) + elif isinstance(d1[key], (list, tuple)): + _compare_sequences(d1[key], d2[key], current_path, differences) + elif d1[key] != d2[key]: + differences.append(f"{current_path}: {d1[key]} != {d2[key]}") + + +def _compare_sequences(seq1, seq2, path, differences): + """Compare two sequences (lists or tuples).""" + if len(seq1) != len(seq2): + differences.append(f"{path}: length mismatch ({len(seq1)} vs {len(seq2)})") + return + + for i, (item1, item2) in enumerate(zip(seq1, seq2)): + current_path = f"{path}[{i}]" + if isinstance(item1, dict) and isinstance(item2, dict): + _compare_dicts(item1, item2, current_path, differences) + elif item1 != item2: + differences.append(f"{current_path}: {item1} != {item2}") + + +def compare_pydantic_models(model1: BaseModel, model2: BaseModel): + """Compare two Pydantic models by their __dict__ attributes.""" + dict1 = model1.model_dump() + dict2 = model2.model_dump() + + # diff = DeepDiff(dict1, dict2, ignore_order=True) + + differences = [] + _compare_dicts(dict1, dict2, "", differences) + + if not differences: + return True + else: + print('Differences found:') + for diff in differences: + print(f" {diff}") + return False From dcb0d3e6120156c2f896996343aba645b83af257 Mon Sep 17 00:00:00 2001 From: "huanghaian@pjlab.org.cn" Date: Wed, 28 Jan 2026 02:10:51 +0000 Subject: [PATCH 2/2] add save_hf --- .../model/compose/qwen3_vl/qwen3_vl_config.py | 57 ++++++++++++++----- xtuner/v1/model/dense/qwen3.py | 5 ++ xtuner/v1/model/moe/qwen3.py | 9 ++- 3 files changed, 56 insertions(+), 15 deletions(-) diff --git a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py index ad5e026a2..c639510de 100644 --- a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py +++ b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py @@ -14,7 +14,8 @@ from xtuner.v1.module.rope import RopeScalingConfig from xtuner.v1.utils import get_device, get_logger from transformers import AutoConfig, PretrainedConfig - +from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig,Qwen3VLTextConfig, Qwen3VLVisionConfig +from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig, Qwen3VLMoeTextConfig, Qwen3VLMoeVisionConfig from ..base import BaseComposeConfig @@ -51,7 +52,8 @@ def build(self): from .modeling_vision import Qwen3VLVisionModel return Qwen3VLVisionModel(self) - + + # Only the outermost module needs to support hf_config, the internal vision and projector modules do not need it. @property def hf_config(self): return None @@ -68,7 +70,8 @@ def build(self): from .modeling_projector import Qwen3VLProjector return Qwen3VLProjector(self) - + + # Only the outermost module needs to support hf_config, the internal vision and projector modules do not need it. 
From dcb0d3e6120156c2f896996343aba645b83af257 Mon Sep 17 00:00:00 2001
From: "huanghaian@pjlab.org.cn"
Date: Wed, 28 Jan 2026 02:10:51 +0000
Subject: [PATCH 2/2] add save_hf

---
 .../model/compose/qwen3_vl/qwen3_vl_config.py | 57 ++++++++++++++-----
 xtuner/v1/model/dense/qwen3.py                |  5 ++
 xtuner/v1/model/moe/qwen3.py                  |  9 ++-
 3 files changed, 56 insertions(+), 15 deletions(-)

diff --git a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
index ad5e026a2..c639510de 100644
--- a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
+++ b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
@@ -14,7 +14,8 @@
 from xtuner.v1.module.rope import RopeScalingConfig
 from xtuner.v1.utils import get_device, get_logger
 from transformers import AutoConfig, PretrainedConfig
-
+from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig as HFQwen3VLConfig
+from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLVisionConfig as HFQwen3VLVisionConfig
+from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig, Qwen3VLMoeVisionConfig

 from ..base import BaseComposeConfig
@@ -51,7 +52,8 @@ def build(self):
         from .modeling_vision import Qwen3VLVisionModel

         return Qwen3VLVisionModel(self)
-
+
+    # Only the outermost compose module needs to support `hf_config`; the internal vision and projector modules do not.
     @property
     def hf_config(self):
         return None
@@ -68,7 +70,8 @@ def build(self):
         from .modeling_projector import Qwen3VLProjector

         return Qwen3VLProjector(self)
-
+
+    # Only the outermost compose module needs to support `hf_config`; the internal vision and projector modules do not.
     @property
     def hf_config(self):
         return None
@@ -130,16 +133,44 @@ def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
         return config

     @property
-    def hf_config(self):
-        # TODO(pppppM) Support saving HuggingFace format config
-        logger.warning(
-            f"{type(self)} does not support conversion to HuggingFace config format. "
-            "Only the original HuggingFace config will be retained in the saved HuggingFace format checkpoint. "
-            f"If you have changed the default values in {type(self)}, it may cause the config in the saved "
-            "HuggingFace format checkpoint to not match the weights."
-        )
-        return None
-
+    def hf_config(self) -> HFQwen3VLConfig | Qwen3VLMoeConfig:
+        text_config = self.text_config.hf_config
+        if isinstance(self.text_config, Qwen3MoEConfig):
+            vision_config = Qwen3VLMoeVisionConfig(
+                depth=self.vision_config.depth,
+                hidden_size=self.vision_config.hidden_size,
+                intermediate_size=self.vision_config.intermediate_size,
+                num_heads=self.vision_config.num_attention_heads,
+                deepstack_visual_indexes=self.vision_config.deepstack_visual_indexes,
+            )
+            return Qwen3VLMoeConfig(
+                architectures=["Qwen3VLMoeForConditionalGeneration"],
+                image_token_id=self.image_token_id,
+                video_token_id=self.video_token_id,
+                vision_start_token_id=self.vision_start_token_id,
+                vision_end_token_id=self.vision_end_token_id,
+                tie_word_embeddings=self.text_config.tie_word_embeddings,
+                text_config=text_config.to_dict(),
+                vision_config=vision_config.to_dict(),
+            )
+        else:
+            vision_config = HFQwen3VLVisionConfig(
+                depth=self.vision_config.depth,
+                hidden_size=self.vision_config.hidden_size,
+                intermediate_size=self.vision_config.intermediate_size,
+                num_heads=self.vision_config.num_attention_heads,
+                deepstack_visual_indexes=self.vision_config.deepstack_visual_indexes,
+            )
+            return HFQwen3VLConfig(
+                architectures=["Qwen3VLForConditionalGeneration"],
+                image_token_id=self.image_token_id,
+                video_token_id=self.video_token_id,
+                vision_start_token_id=self.vision_start_token_id,
+                vision_end_token_id=self.vision_end_token_id,
+                tie_word_embeddings=self.text_config.tie_word_embeddings,
+                text_config=text_config.to_dict(),
+                vision_config=vision_config.to_dict(),
+            )

 class Qwen3VLMoE30BA3Config(Qwen3VLBaseConfig):
     vision_config: Qwen3VLVisionConfig = Qwen3VLVisionConfig()
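With `hf_config` now returning a real composite config, exporting it is a plain transformers round trip. A sketch (paths are placeholders):

    from xtuner.v1.model import get_model_config_from_hf

    model_cfg = get_model_config_from_hf("/path/to/Qwen3-VL-30B-A3B")
    hf_cfg = model_cfg.hf_config           # Qwen3VLMoeConfig for a MoE checkpoint
    hf_cfg.save_pretrained("/tmp/export")  # writes config.json for the exported weights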
diff --git a/xtuner/v1/model/dense/qwen3.py b/xtuner/v1/model/dense/qwen3.py
index 0525ad15d..034b834d5 100644
--- a/xtuner/v1/model/dense/qwen3.py
+++ b/xtuner/v1/model/dense/qwen3.py
@@ -100,6 +100,11 @@ def hf_config(self) -> HFQwen3DenseConfig:
             sliding_window=self.attention.sliding_window,
             use_sliding_window=self.use_sliding_window,
             tie_word_embeddings=self.tie_word_embeddings,
+            rope_scaling={
+                "rope_type": "default",
+                "mrope_section": self.rope_scaling_cfg.mrope_section,
+                "mrope_interleaved": True,
+            } if self.rope_scaling_cfg is not None else None,
             dtype=torch.bfloat16,
         )
diff --git a/xtuner/v1/model/moe/qwen3.py b/xtuner/v1/model/moe/qwen3.py
index 73f2bfd67..f3ac216e3 100644
--- a/xtuner/v1/model/moe/qwen3.py
+++ b/xtuner/v1/model/moe/qwen3.py
@@ -121,7 +122,6 @@ def hf_config(self) -> HFQwen3MoeConfig:
             rms_norm_eps=self.rms_norm_eps,
             model_type=self.model_type,
             rope_theta=self.rope_theta,
-            rope_scaling=self.rope_scaling_cfg.model_dump() if self.rope_scaling_cfg is not None else None,
             hidden_act=self.hidden_act,
             num_attention_heads=self.attention.num_attention_heads,
             num_key_value_heads=self.attention.num_key_value_heads,
@@ -132,6 +132,11 @@ def hf_config(self) -> HFQwen3MoeConfig:
             num_experts=self.n_routed_experts,
             num_experts_per_tok=self.num_experts_per_tok,
             norm_topk_prob=self.router.norm_topk_prob,
+            rope_scaling={
+                "rope_type": "default",
+                "mrope_section": self.rope_scaling_cfg.mrope_section,
+                "mrope_interleaved": True,
+            } if self.rope_scaling_cfg is not None else None,
             torch_dtype=torch.bfloat16,  # TODO: update all outdated hf `dtype` kwarg @jayhenry
         )
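Taken together, the two patches close the loop: a checkpoint can be parsed into an XTuner config, built, and saved back out with a matching config.json. A sketch of the full flow, assuming `save_hf` is the model-side entry point exercised by `test_save_hf` (paths are placeholders):

    import torch

    from xtuner.v1.model import get_model_config_from_hf

    ckpt = "/path/to/Qwen3-VL-4B-Instruct"
    with torch.device("meta"):
        model_cfg = get_model_config_from_hf(ckpt)
        model = model_cfg.build().to(torch.bfloat16)
    model.from_hf(ckpt)           # materialize weights from the checkpoint
    model.save_hf("/tmp/export")  # config now comes from the new hf_config property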