From c52df5dabf9234dc24cd7d49f6d3ca11d1801a1c Mon Sep 17 00:00:00 2001
From: "huanghaian@pjlab.org.cn"
Date: Tue, 27 Jan 2026 10:07:16 +0000
Subject: [PATCH 1/2] qwen3vl support get_model_config_from_hf

---
 tests/model/test_qwen3_vl.py                  | 20 +++++--
 xtuner/v1/model/__init__.py                   |  5 +-
 xtuner/v1/model/base.py                       | 19 ++++++-
 .../compose/intern_s1/intern_s1_config.py     |  3 +-
 xtuner/v1/model/compose/qwen3_vl/__init__.py  |  3 +-
 .../model/compose/qwen3_vl/qwen3_vl_config.py | 45 +++++++++++++--
 xtuner/v1/model/dense/qwen2.py                | 15 ++---
 xtuner/v1/model/dense/qwen3.py                | 26 +++++----
 xtuner/v1/model/dense/qwen3vl_text.py         |  7 ++-
 xtuner/v1/model/moe/deepseek_v3.py            | 13 +++--
 xtuner/v1/model/moe/gpt_oss.py                | 12 ++--
 xtuner/v1/model/moe/qwen3.py                  | 22 +++++---
 xtuner/v1/model/moe/qwen3vl_text.py           |  7 ++-
 xtuner/v1/utils/test_utils.py                 | 56 ++++++++++++++++++-
 14 files changed, 200 insertions(+), 53 deletions(-)

diff --git a/tests/model/test_qwen3_vl.py b/tests/model/test_qwen3_vl.py
index 777974513..016a7e88e 100644
--- a/tests/model/test_qwen3_vl.py
+++ b/tests/model/test_qwen3_vl.py
@@ -21,6 +21,8 @@
     MixedPrecisionPolicy,
     fully_shard,
 )
+from xtuner.v1.model import get_model_config_from_hf
+from xtuner.v1.utils.test_utils import compare_pydantic_models

 QWEN3_VL_MOE_PATH = os.environ["QWEN3_VL_MOE_PATH"]
 QWEN3_VL_DENSE_PATH = os.environ["QWEN3_VL_DENSE_PATH"]
@@ -186,9 +188,14 @@ def test_qwen3vl_run(self, device, sp_size, tol):
             device_map="cpu"
         ).eval()
         patch_hf_rms_norm(hf_model)
-
+
         with torch.device("meta"):
-            model_cfg = Qwen3VLDense4BConfig(compile_cfg=False)
+            model_cfg = get_model_config_from_hf(QWEN3_VL_DENSE_PATH)
+            model_cfg.compile_cfg = False
+
+            model_cfg_origin = Qwen3VLDense4BConfig(compile_cfg=False)
+            self.assertTrue(compare_pydantic_models(model_cfg, model_cfg_origin))
+
             qwen3vl_model = model_cfg.build().to(torch.bfloat16)
             qwen3vl_model.from_hf(QWEN3_VL_DENSE_PATH)
@@ -220,9 +227,12 @@ def test_fsdp_qwen3_run(self, device, sp_size, compile, tol):
         patch_hf_rms_norm(hf_model)

         with torch.device("meta"):
-            model_cfg = Qwen3VLDense4BConfig()
+            model_cfg = get_model_config_from_hf(QWEN3_VL_DENSE_PATH)
+            model_cfg_origin = Qwen3VLDense4BConfig()
             if compile is False:
                 model_cfg.compile_cfg = False
+                model_cfg_origin.compile_cfg = False
+            self.assertTrue(compare_pydantic_models(model_cfg, model_cfg_origin))
             qwen3vl_model = model_cfg.build().to(torch.bfloat16)

         fsdp_config = FSDPConfig(
@@ -265,7 +275,9 @@ def test_save_hf(self, device, tp_size):
         self.create_pg(device)
         with torch.device("meta"):
-            model_cfg = Qwen3VLMoE30BA3Config()
+            model_cfg = get_model_config_from_hf(QWEN3_VL_MOE_PATH)
+            model_cfg_origin = Qwen3VLMoE30BA3Config()
+            self.assertTrue(compare_pydantic_models(model_cfg, model_cfg_origin))
             qwen3vl_model = model_cfg.build().to(torch.bfloat16)

         fsdp_config = FSDPConfig(
diff --git a/xtuner/v1/model/__init__.py b/xtuner/v1/model/__init__.py
index 3f77cd9b8..b94550479 100644
--- a/xtuner/v1/model/__init__.py
+++ b/xtuner/v1/model/__init__.py
@@ -12,13 +12,14 @@
     InternVLBaseConfig,
 )
 from .compose.qwen3_vl import (
+    Qwen3VLBaseConfig,
     Qwen3VLDense4BConfig,
     Qwen3VLDense8BConfig,
     Qwen3VLMoE30BA3Config,
     Qwen3VLMoE235BA22Config,
 )
 from .dense.dense import Dense
-from .dense.qwen2 import Qwen2Dense7BConfig, Qwen2DenseConfig
+from .dense.qwen2 import Qwen2DenseConfig
 from .dense.qwen3 import Qwen3Dense0P6BConfig, Qwen3Dense4BConfig, Qwen3Dense8BConfig, Qwen3DenseConfig
 from .moe.deepseek_v3 import DeepSeekV3Config
 from .moe.gpt_oss import GptOss21BA3P6Config, GptOss117BA5P8Config, GptOssConfig
@@ -61,6 +62,8 @@ def get_model_config_from_hf(model_path: Path):
         return GptOssConfig.from_hf(model_path)
     elif cfg.model_type == "deepseek_v3":
         return DeepSeekV3Config.from_hf(model_path)
+    elif cfg.model_type in ("qwen3_vl_moe", "qwen3_vl"):
+        return Qwen3VLBaseConfig.from_hf(model_path)
     else:
         raise ValueError(f"Unsupported model type: {cfg.model_type}")
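With the dispatch above, a caller can go from a HuggingFace checkpoint directory straight to an XTuner config without naming the concrete class. A minimal usage sketch (the checkpoint path is a placeholder):

    from xtuner.v1.model import get_model_config_from_hf

    # The returned type depends on `model_type` in the checkpoint's config.json,
    # e.g. Qwen3VLBaseConfig for "qwen3_vl" / "qwen3_vl_moe".
    model_cfg = get_model_config_from_hf("/path/to/Qwen3-VL-4B-Instruct")
    model_cfg.compile_cfg = False  # fields can still be overridden before building
    model = model_cfg.build()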
""" raise NotImplementedError - + @property def hf_config(self) -> PretrainedConfig | None: """HuggingFace configuration.""" diff --git a/xtuner/v1/model/compose/intern_s1/intern_s1_config.py b/xtuner/v1/model/compose/intern_s1/intern_s1_config.py index 7cdcf6d11..cb99e0be4 100644 --- a/xtuner/v1/model/compose/intern_s1/intern_s1_config.py +++ b/xtuner/v1/model/compose/intern_s1/intern_s1_config.py @@ -5,6 +5,7 @@ from pydantic import ConfigDict from typing_extensions import Self +from transformers import PretrainedConfig from xtuner.v1.model.base import XTunerBaseModelConfig from xtuner.v1.model.dense.qwen3 import Qwen3Dense8BConfig from xtuner.v1.model.moe.moe import MoEConfig, TransformerConfig @@ -113,7 +114,7 @@ def build(self) -> "InternS1ForConditionalGeneration": return InternS1ForConditionalGeneration(self) @classmethod - def from_hf(cls, hf_path: str | Path) -> Self: + def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self: raise NotImplementedError @property diff --git a/xtuner/v1/model/compose/qwen3_vl/__init__.py b/xtuner/v1/model/compose/qwen3_vl/__init__.py index 5e5011262..664fd7819 100644 --- a/xtuner/v1/model/compose/qwen3_vl/__init__.py +++ b/xtuner/v1/model/compose/qwen3_vl/__init__.py @@ -1,5 +1,5 @@ from .modeling_qwen3_vl import Qwen3VLForConditionalGeneration -from .qwen3_vl_config import Qwen3VLDense4BConfig, Qwen3VLDense8BConfig, Qwen3VLMoE30BA3Config, Qwen3VLMoE235BA22Config +from .qwen3_vl_config import Qwen3VLBaseConfig, Qwen3VLDense4BConfig, Qwen3VLDense8BConfig, Qwen3VLMoE30BA3Config, Qwen3VLMoE235BA22Config __all__ = [ @@ -8,4 +8,5 @@ "Qwen3VLDense4BConfig", "Qwen3VLDense8BConfig", "Qwen3VLMoE235BA22Config", + "Qwen3VLBaseConfig", ] diff --git a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py index 235b65560..ad5e026a2 100644 --- a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py +++ b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py @@ -4,13 +4,16 @@ from mmengine import is_installed from pydantic import ConfigDict from typing_extensions import Self +from pydantic import SerializeAsAny from xtuner.v1.model.base import TransformerConfig, XTunerBaseModelConfig from xtuner.v1.model.dense.qwen3vl_text import Qwen3VLTextDense4BConfig, Qwen3VLTextDense8BConfig -from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config, Qwen3MoE235BA22Config -from xtuner.v1.model.moe.qwen3vl_text import Qwen3VLTextMoE30BA3Config, Qwen3VLTextMoE235BA22Config +from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config, Qwen3MoE235BA22Config, Qwen3MoEConfig +from xtuner.v1.model.dense.qwen3vl_text import Qwen3VLTextBaseConfig +from xtuner.v1.model.moe.qwen3vl_text import Qwen3VLTextMoE30BA3Config, Qwen3VLTextMoE235BA22Config, Qwen3VLTextMoEBaseConfig from xtuner.v1.module.rope import RopeScalingConfig from xtuner.v1.utils import get_device, get_logger +from transformers import AutoConfig, PretrainedConfig from ..base import BaseComposeConfig @@ -78,7 +81,7 @@ class Qwen3VLBaseConfig(BaseComposeConfig): ) vision_config: Qwen3VLVisionConfig projector_config: Qwen3VLProjectorConfig - text_config: TransformerConfig + text_config: SerializeAsAny[TransformerConfig] image_token_id: int = 151655 video_token_id: int = 151656 @@ -95,8 +98,36 @@ def build(self): return Qwen3VLForConditionalGeneration(self) @classmethod - def from_hf(cls, hf_path: str | Path) -> Self: - raise NotImplementedError + def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = 
diff --git a/xtuner/v1/model/compose/intern_s1/intern_s1_config.py b/xtuner/v1/model/compose/intern_s1/intern_s1_config.py
index 7cdcf6d11..cb99e0be4 100644
--- a/xtuner/v1/model/compose/intern_s1/intern_s1_config.py
+++ b/xtuner/v1/model/compose/intern_s1/intern_s1_config.py
@@ -5,6 +5,7 @@
 from pydantic import ConfigDict
 from typing_extensions import Self

+from transformers import PretrainedConfig
 from xtuner.v1.model.base import XTunerBaseModelConfig
 from xtuner.v1.model.dense.qwen3 import Qwen3Dense8BConfig
 from xtuner.v1.model.moe.moe import MoEConfig, TransformerConfig
@@ -113,7 +114,7 @@ def build(self) -> "InternS1ForConditionalGeneration":
         return InternS1ForConditionalGeneration(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
         raise NotImplementedError

     @property
diff --git a/xtuner/v1/model/compose/qwen3_vl/__init__.py b/xtuner/v1/model/compose/qwen3_vl/__init__.py
index 5e5011262..664fd7819 100644
--- a/xtuner/v1/model/compose/qwen3_vl/__init__.py
+++ b/xtuner/v1/model/compose/qwen3_vl/__init__.py
@@ -1,5 +1,5 @@
 from .modeling_qwen3_vl import Qwen3VLForConditionalGeneration
-from .qwen3_vl_config import Qwen3VLDense4BConfig, Qwen3VLDense8BConfig, Qwen3VLMoE30BA3Config, Qwen3VLMoE235BA22Config
+from .qwen3_vl_config import Qwen3VLBaseConfig, Qwen3VLDense4BConfig, Qwen3VLDense8BConfig, Qwen3VLMoE30BA3Config, Qwen3VLMoE235BA22Config


 __all__ = [
@@ -8,4 +8,5 @@
     "Qwen3VLDense4BConfig",
     "Qwen3VLDense8BConfig",
     "Qwen3VLMoE235BA22Config",
+    "Qwen3VLBaseConfig",
 ]
diff --git a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
index 235b65560..ad5e026a2 100644
--- a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
+++ b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
@@ -4,13 +4,16 @@
 from mmengine import is_installed
-from pydantic import ConfigDict
+from pydantic import ConfigDict, SerializeAsAny
 from typing_extensions import Self

 from xtuner.v1.model.base import TransformerConfig, XTunerBaseModelConfig
-from xtuner.v1.model.dense.qwen3vl_text import Qwen3VLTextDense4BConfig, Qwen3VLTextDense8BConfig
-from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config, Qwen3MoE235BA22Config
-from xtuner.v1.model.moe.qwen3vl_text import Qwen3VLTextMoE30BA3Config, Qwen3VLTextMoE235BA22Config
+from xtuner.v1.model.dense.qwen3vl_text import Qwen3VLTextBaseConfig, Qwen3VLTextDense4BConfig, Qwen3VLTextDense8BConfig
+from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config, Qwen3MoE235BA22Config, Qwen3MoEConfig
+from xtuner.v1.model.moe.qwen3vl_text import Qwen3VLTextMoE30BA3Config, Qwen3VLTextMoE235BA22Config, Qwen3VLTextMoEBaseConfig
 from xtuner.v1.module.rope import RopeScalingConfig
 from xtuner.v1.utils import get_device, get_logger
+from transformers import AutoConfig, PretrainedConfig

 from ..base import BaseComposeConfig
@@ -78,7 +81,7 @@ class Qwen3VLBaseConfig(BaseComposeConfig):
     )
     vision_config: Qwen3VLVisionConfig
     projector_config: Qwen3VLProjectorConfig
-    text_config: TransformerConfig
+    text_config: SerializeAsAny[TransformerConfig]

     image_token_id: int = 151655
     video_token_id: int = 151656
@@ -95,8 +98,36 @@ def build(self):
         return Qwen3VLForConditionalGeneration(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
-        raise NotImplementedError
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
+        if hf_config is None:
+            assert hf_path is not None, "Either hf_path or hf_config must be provided."
+            hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
+
+        hf_vision_config = hf_config.vision_config
+        vision_config = Qwen3VLVisionConfig(
+            depth=hf_vision_config.depth,
+            hidden_size=hf_vision_config.hidden_size,
+            intermediate_size=hf_vision_config.intermediate_size,
+            num_attention_heads=hf_vision_config.num_heads,
+            deepstack_visual_indexes=hf_vision_config.deepstack_visual_indexes,
+        )
+        projector_config = Qwen3VLProjectorConfig(
+            vision_hidden_size=hf_vision_config.hidden_size,
+            text_hidden_size=hf_config.text_config.hidden_size,
+            deepstack_visual_indexes=hf_vision_config.deepstack_visual_indexes,
+        )
+
+        if hf_config.model_type == "qwen3_vl_moe":
+            text_config = Qwen3VLTextMoEBaseConfig.from_hf(hf_config=hf_config.text_config)
+        elif hf_config.model_type == "qwen3_vl":
+            text_config = Qwen3VLTextBaseConfig.from_hf(hf_config=hf_config.text_config)
+        else:
+            raise ValueError(f"Unsupported model type: {hf_config.model_type}")
+
+        config = cls(
+            vision_config=vision_config,
+            projector_config=projector_config,
+            text_config=text_config,
+            image_token_id=hf_config.image_token_id,
+            video_token_id=hf_config.video_token_id,
+            vision_start_token_id=hf_config.vision_start_token_id,
+            vision_end_token_id=hf_config.vision_end_token_id,
+        )
+        return config

     @property
     def hf_config(self):
@@ -114,6 +145,7 @@ class Qwen3VLMoE30BA3Config(Qwen3VLBaseConfig):
     vision_config: Qwen3VLVisionConfig = Qwen3VLVisionConfig()
     projector_config: Qwen3VLProjectorConfig = Qwen3VLProjectorConfig()
     text_config: Qwen3MoE30BA3Config = Qwen3VLTextMoE30BA3Config(
+        model_type="qwen3_vl_moe_text",
         max_position_embeddings=262144,
         rope_theta=5000000,
         rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=[24, 20, 20]),
@@ -124,6 +156,7 @@ class Qwen3VLMoE235BA22Config(Qwen3VLBaseConfig):
     vision_config: Qwen3VLVisionConfig = Qwen3VLVisionConfig()
     projector_config: Qwen3VLProjectorConfig = Qwen3VLProjectorConfig(text_hidden_size=4096)
     text_config: Qwen3MoE235BA22Config = Qwen3VLTextMoE235BA22Config(
+        model_type="qwen3_vl_moe_text",
         max_position_embeddings=262144,
         rope_theta=5000000,
         rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=[24, 20, 20]),
@@ -138,6 +171,7 @@ class Qwen3VLDense4BConfig(Qwen3VLBaseConfig):
         vision_hidden_size=1024, text_hidden_size=2560, deepstack_visual_indexes=[5, 11, 17]
     )
     text_config: Qwen3VLTextDense4BConfig = Qwen3VLTextDense4BConfig(
+        model_type="qwen3_vl_text",
         max_position_embeddings=262144,
         rope_theta=5000000,
         rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=[24, 20, 20]),
@@ -148,6 +182,7 @@ class Qwen3VLDense8BConfig(Qwen3VLBaseConfig):
     vision_config: Qwen3VLVisionConfig = Qwen3VLVisionConfig()
     projector_config: Qwen3VLProjectorConfig = Qwen3VLProjectorConfig(text_hidden_size=4096)
     text_config: Qwen3VLTextDense8BConfig = Qwen3VLTextDense8BConfig(
+        model_type="qwen3_vl_text",
         max_position_embeddings=262144,
         rope_theta=5000000,
         rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=[24, 20, 20]),
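The switch to SerializeAsAny is what makes the new comparison helper meaningful: without it, pydantic serializes `text_config` against the declared base type, so fields that only exist on the concrete subclass are dropped from `model_dump()`. A toy illustration (the classes are hypothetical, not from this repo):

    from pydantic import BaseModel, SerializeAsAny

    class Base(BaseModel):
        a: int = 1

    class Child(Base):
        b: int = 2

    class HolderPlain(BaseModel):
        cfg: Base

    class HolderAny(BaseModel):
        cfg: SerializeAsAny[Base]

    print(HolderPlain(cfg=Child()).model_dump())  # {'cfg': {'a': 1}} -- 'b' is lost
    print(HolderAny(cfg=Child()).model_dump())    # {'cfg': {'a': 1, 'b': 2}}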
diff --git a/xtuner/v1/model/dense/qwen2.py b/xtuner/v1/model/dense/qwen2.py
index 07884398b..c1e33768c 100644
--- a/xtuner/v1/model/dense/qwen2.py
+++ b/xtuner/v1/model/dense/qwen2.py
@@ -4,6 +4,7 @@
 import torch
 from typing_extensions import Self

+from transformers import PretrainedConfig
 from transformers.models.qwen2 import Qwen2Config as HFQwen2DenseConfig
 from xtuner.v1.model.base import TransformerConfig
 from xtuner.v1.module.attention import MHAConfig
@@ -36,13 +37,13 @@ def build(self) -> Qwen2Dense:
         return Qwen2Dense(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
-        from transformers import AutoConfig
-        from transformers.models.qwen2 import Qwen2Config as HFConfig
-
-        hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
-
-        assert isinstance(hf_config, HFConfig)
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
+        if hf_path is not None:
+            from transformers import AutoConfig
+            from transformers.models.qwen2 import Qwen2Config as HFConfig
+
+            hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
+            assert isinstance(hf_config, HFConfig)
+        assert hf_config is not None and isinstance(hf_config, PretrainedConfig)

         config = cls(
             vocab_size=hf_config.vocab_size,
diff --git a/xtuner/v1/model/dense/qwen3.py b/xtuner/v1/model/dense/qwen3.py
index 6d663735e..0525ad15d 100644
--- a/xtuner/v1/model/dense/qwen3.py
+++ b/xtuner/v1/model/dense/qwen3.py
@@ -4,9 +4,11 @@
 import torch
 from typing_extensions import Self

+from transformers import PretrainedConfig
 from transformers.models.qwen3 import Qwen3Config as HFQwen3DenseConfig
 from xtuner.v1.model.base import TransformerConfig
 from xtuner.v1.module.attention import MHAConfig
+from xtuner.v1.module.rope.rope import RopeScalingConfig

 from .dense import Dense
@@ -37,13 +39,13 @@ def build(self) -> Qwen3Dense:
         return Qwen3Dense(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
-        from transformers import AutoConfig
-        from transformers.models.qwen3 import Qwen3Config as HFConfig
-
-        hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
-
-        assert isinstance(hf_config, HFConfig)
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
+        if hf_path is not None:
+            from transformers import AutoConfig
+            from transformers.models.qwen3 import Qwen3Config as HFConfig
+
+            hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
+            assert isinstance(hf_config, HFConfig)
+        assert hf_config is not None and isinstance(hf_config, PretrainedConfig)

         config = cls(
             vocab_size=hf_config.vocab_size,
@@ -52,7 +54,7 @@ def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
             bos_token_id=hf_config.bos_token_id,
             eos_token_id=hf_config.eos_token_id,
             num_hidden_layers=hf_config.num_hidden_layers,
-            max_window_layers=hf_config.max_window_layers,
+            max_window_layers=getattr(hf_config, "max_window_layers", hf_config.num_hidden_layers),
             hidden_size=hf_config.hidden_size,
             intermediate_size=hf_config.intermediate_size,
             rms_norm_eps=hf_config.rms_norm_eps,
@@ -62,11 +64,15 @@ def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
                 num_attention_heads=hf_config.num_attention_heads,
                 num_key_value_heads=hf_config.num_key_value_heads,
                 head_dim=hf_config.head_dim,
-                sliding_window=hf_config.sliding_window,
+                sliding_window=getattr(hf_config, "sliding_window", 1024),
                 qk_norm=True,
             ),
-            use_sliding_window=hf_config.use_sliding_window,
+            model_type=hf_config.model_type,
+            use_sliding_window=getattr(hf_config, "use_sliding_window", False),
             tie_word_embeddings=hf_config.tie_word_embeddings,
+            rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=hf_config.rope_scaling["mrope_section"])
+            if getattr(hf_config, "rope_scaling", None) is not None
+            else None,
         )

         return config
diff --git a/xtuner/v1/model/dense/qwen3vl_text.py b/xtuner/v1/model/dense/qwen3vl_text.py
index 15e40c40b..4f214815b 100644
--- a/xtuner/v1/model/dense/qwen3vl_text.py
+++ b/xtuner/v1/model/dense/qwen3vl_text.py
@@ -6,7 +6,7 @@
 from xtuner.v1.loss import CELossContext
 from xtuner.v1.model.base import ModelOutputs

-from .qwen3 import Qwen3Dense, Qwen3Dense4BConfig, Qwen3Dense8BConfig
+from .qwen3 import Qwen3Dense, Qwen3Dense4BConfig, Qwen3Dense8BConfig, Qwen3DenseConfig


 class Qwen3VLTextDense(Qwen3Dense):
@@ -85,6 +85,11 @@ def forward(
         return ModelOutputs(**output)  # type: ignore[typeddict-item]


+class Qwen3VLTextBaseConfig(Qwen3DenseConfig):
+    def build(self) -> Qwen3VLTextDense:
+        return Qwen3VLTextDense(self)
+
+
 class Qwen3VLTextDense4BConfig(Qwen3Dense4BConfig):
     def build(self) -> Qwen3VLTextDense:
         return Qwen3VLTextDense(self)
diff --git a/xtuner/v1/model/moe/deepseek_v3.py b/xtuner/v1/model/moe/deepseek_v3.py
index 4157b3655..42f06f981 100644
--- a/xtuner/v1/model/moe/deepseek_v3.py
+++ b/xtuner/v1/model/moe/deepseek_v3.py
@@ -3,7 +3,7 @@
 import torch
 from typing_extensions import Self

-
+from transformers import PretrainedConfig
 from transformers.models.deepseek_v3 import DeepseekV3Config as HFDeepseekV3Config
 from xtuner.v1.model.moe.moe import BalancingLossConfig, MoEConfig, ZLossConfig
 from xtuner.v1.module.attention import MLAConfig
@@ -103,10 +103,13 @@ def build(self) -> DeepSeekV3:
         return DeepSeekV3(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
-        cfg = HFDeepseekV3Config.from_pretrained(hf_path)
-
-        assert isinstance(cfg, HFDeepseekV3Config)
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
+        if hf_path is not None:
+            cfg = HFDeepseekV3Config.from_pretrained(hf_path)
+            assert isinstance(cfg, HFDeepseekV3Config)
+        else:
+            cfg = hf_config
+        assert cfg is not None and isinstance(cfg, PretrainedConfig)

         config = cls(
             vocab_size=cfg.vocab_size,
diff --git a/xtuner/v1/model/moe/gpt_oss.py b/xtuner/v1/model/moe/gpt_oss.py
index 6de04bf13..c51f569e7 100644
--- a/xtuner/v1/model/moe/gpt_oss.py
+++ b/xtuner/v1/model/moe/gpt_oss.py
@@ -6,6 +6,7 @@
 from pydantic import computed_field
 from typing_extensions import Self

+from transformers import PretrainedConfig
 from transformers.models.gpt_oss import GptOssConfig as HFGptOssConfig
 from xtuner.v1.model.moe.moe import BalancingLossConfig, MoEConfig
 from xtuner.v1.module.attention import MHAConfig
@@ -132,10 +133,13 @@ def build(self) -> GptOss:
         return GptOss(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
-        cfg = HFGptOssConfig.from_pretrained(hf_path)
-
-        assert isinstance(cfg, HFGptOssConfig)
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
+        if hf_path is not None:
+            cfg = HFGptOssConfig.from_pretrained(hf_path)
+            assert isinstance(cfg, HFGptOssConfig)
+        else:
+            cfg = hf_config
+        assert cfg is not None and isinstance(cfg, PretrainedConfig)

         config = cls(
             hf_config=cfg,
diff --git a/xtuner/v1/model/moe/qwen3.py b/xtuner/v1/model/moe/qwen3.py
index 04ca80406..73f2bfd67 100644
--- a/xtuner/v1/model/moe/qwen3.py
+++ b/xtuner/v1/model/moe/qwen3.py
@@ -56,10 +56,11 @@ def build(self) -> Qwen3MoE:
         return Qwen3MoE(self)

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
-        hf_config = HFQwen3MoeConfig.from_pretrained(hf_path)
-
-        assert isinstance(hf_config, HFQwen3MoeConfig)
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
+        if hf_path is not None:
+            hf_config = HFQwen3MoeConfig.from_pretrained(hf_path)
+            assert isinstance(hf_config, HFQwen3MoeConfig)
+        assert hf_config is not None and isinstance(hf_config, PretrainedConfig)

         config = cls(
             vocab_size=hf_config.vocab_size,
@@ -68,7 +69,7 @@ def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
             bos_token_id=hf_config.bos_token_id,
             eos_token_id=hf_config.eos_token_id,
             num_hidden_layers=hf_config.num_hidden_layers,
-            max_window_layers=getattr(hf_config, "max_window_layers"),
+            max_window_layers=getattr(hf_config, "max_window_layers", hf_config.num_hidden_layers),
             hidden_size=hf_config.hidden_size,
             intermediate_size=hf_config.intermediate_size,
             rms_norm_eps=hf_config.rms_norm_eps,
@@ -79,10 +80,10 @@ def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
                 num_attention_heads=hf_config.num_attention_heads,
                 num_key_value_heads=hf_config.num_key_value_heads,
                 head_dim=hf_config.head_dim,
-                sliding_window=hf_config.sliding_window,
+                sliding_window=getattr(hf_config, "sliding_window", 1024),
                 qk_norm=True,
             ),
-            use_sliding_window=hf_config.use_sliding_window,
+            use_sliding_window=getattr(hf_config, "use_sliding_window", False),
             tie_word_embeddings=hf_config.tie_word_embeddings,
             n_routed_experts=hf_config.num_experts,
             n_shared_experts=0,
@@ -93,11 +94,14 @@ def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
                 norm_topk_prob=hf_config.norm_topk_prob,
                 router_scaling_factor=1.0,
             ),
+            rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=hf_config.rope_scaling["mrope_section"])
+            if getattr(hf_config, "rope_scaling", None) is not None
+            else None,
             balancing_loss_cfg=BalancingLossConfig(),
         )

         return config
-
+
     @property
     def hf_config(self) -> HFQwen3MoeConfig:
         """HuggingFace configuration."""
@@ -201,7 +205,7 @@ class Qwen3MoEFoPEConfig(Qwen3MoEConfig):

     @classmethod
-    def from_hf(cls, hf_path: str | Path) -> Self:
+    def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
         hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)

         assert isinstance(hf_config, PretrainedConfig) and hf_config.model_type == "qwen3_moe_fope"
diff --git a/xtuner/v1/model/moe/qwen3vl_text.py b/xtuner/v1/model/moe/qwen3vl_text.py
index 1996edcf7..aac1da171 100644
--- a/xtuner/v1/model/moe/qwen3vl_text.py
+++ b/xtuner/v1/model/moe/qwen3vl_text.py
@@ -8,7 +8,7 @@
 from xtuner.v1.utils.activation_offload import async_save_on_cpu

 from .moe import MoEModelOutputs
-from .qwen3 import Qwen3MoE, Qwen3MoE30BA3Config, Qwen3MoE235BA22Config
+from .qwen3 import Qwen3MoE, Qwen3MoE30BA3Config, Qwen3MoE235BA22Config, Qwen3MoEConfig


 class Qwen3VLTextMoE(Qwen3MoE):
@@ -221,6 +221,11 @@ def _forward(
         return MoEModelOutputs(**output)  # type: ignore[typeddict-item]


+class Qwen3VLTextMoEBaseConfig(Qwen3MoEConfig):
+    def build(self) -> Qwen3MoE:
+        return Qwen3VLTextMoE(self)
+
+
 class Qwen3VLTextMoE30BA3Config(Qwen3MoE30BA3Config):
     def build(self) -> Qwen3MoE:
         return Qwen3VLTextMoE(self)
dictionaries.""" + all_keys = set(d1.keys()) | set(d2.keys()) + + for key in all_keys: + current_path = f"{path}.{key}" if path else key + + if key not in d1: + differences.append(f"{current_path}: missing in first model") + elif key not in d2: + differences.append(f"{current_path}: missing in second model") + elif type(d1[key]) != type(d2[key]): + differences.append(f"{current_path}: type mismatch ({type(d1[key]).__name__} vs {type(d2[key]).__name__})") + elif isinstance(d1[key], dict): + _compare_dicts(d1[key], d2[key], current_path, differences) + elif isinstance(d1[key], (list, tuple)): + _compare_sequences(d1[key], d2[key], current_path, differences) + elif d1[key] != d2[key]: + differences.append(f"{current_path}: {d1[key]} != {d2[key]}") + + +def _compare_sequences(seq1, seq2, path, differences): + """Compare two sequences (lists or tuples).""" + if len(seq1) != len(seq2): + differences.append(f"{path}: length mismatch ({len(seq1)} vs {len(seq2)})") + return + + for i, (item1, item2) in enumerate(zip(seq1, seq2)): + current_path = f"{path}[{i}]" + if isinstance(item1, dict) and isinstance(item2, dict): + _compare_dicts(item1, item2, current_path, differences) + elif item1 != item2: + differences.append(f"{current_path}: {item1} != {item2}") + + +def compare_pydantic_models(model1: BaseModel, model2: BaseModel): + """Compare two Pydantic models by their __dict__ attributes.""" + dict1 = model1.model_dump() + dict2 = model2.model_dump() + + # diff = DeepDiff(dict1, dict2, ignore_order=True) + + differences = [] + _compare_dicts(dict1, dict2, "", differences) + + if not differences: + return True + else: + print('Differences found:') + for diff in differences: + print(f" {diff}") + return False From dcb0d3e6120156c2f896996343aba645b83af257 Mon Sep 17 00:00:00 2001 From: "huanghaian@pjlab.org.cn" Date: Wed, 28 Jan 2026 02:10:51 +0000 Subject: [PATCH 2/2] add save_hf --- .../model/compose/qwen3_vl/qwen3_vl_config.py | 57 ++++++++++++++----- xtuner/v1/model/dense/qwen3.py | 5 ++ xtuner/v1/model/moe/qwen3.py | 9 ++- 3 files changed, 56 insertions(+), 15 deletions(-) diff --git a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py index ad5e026a2..c639510de 100644 --- a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py +++ b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py @@ -14,7 +14,8 @@ from xtuner.v1.module.rope import RopeScalingConfig from xtuner.v1.utils import get_device, get_logger from transformers import AutoConfig, PretrainedConfig - +from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig,Qwen3VLTextConfig, Qwen3VLVisionConfig +from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig, Qwen3VLMoeTextConfig, Qwen3VLMoeVisionConfig from ..base import BaseComposeConfig @@ -51,7 +52,8 @@ def build(self): from .modeling_vision import Qwen3VLVisionModel return Qwen3VLVisionModel(self) - + + # Only the outermost module needs to support hf_config, the internal vision and projector modules do not need it. @property def hf_config(self): return None @@ -68,7 +70,8 @@ def build(self): from .modeling_projector import Qwen3VLProjector return Qwen3VLProjector(self) - + + # Only the outermost module needs to support hf_config, the internal vision and projector modules do not need it. 
From dcb0d3e6120156c2f896996343aba645b83af257 Mon Sep 17 00:00:00 2001
From: "huanghaian@pjlab.org.cn"
Date: Wed, 28 Jan 2026 02:10:51 +0000
Subject: [PATCH 2/2] add save_hf

---
 .../model/compose/qwen3_vl/qwen3_vl_config.py | 57 ++++++++++++++-----
 xtuner/v1/model/dense/qwen3.py                |  5 ++
 xtuner/v1/model/moe/qwen3.py                  |  9 ++-
 3 files changed, 56 insertions(+), 15 deletions(-)

diff --git a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
index ad5e026a2..c639510de 100644
--- a/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
+++ b/xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
@@ -14,7 +14,8 @@
 from xtuner.v1.module.rope import RopeScalingConfig
 from xtuner.v1.utils import get_device, get_logger
 from transformers import AutoConfig, PretrainedConfig
-
+from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig as HFQwen3VLConfig
+from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLVisionConfig as HFQwen3VLVisionConfig
+from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig, Qwen3VLMoeVisionConfig

 from ..base import BaseComposeConfig
@@ -51,7 +52,8 @@ def build(self):
         from .modeling_vision import Qwen3VLVisionModel

         return Qwen3VLVisionModel(self)
-
+
+    # Only the outermost compose module needs to support `hf_config`; the internal vision and projector modules do not.
     @property
     def hf_config(self):
         return None
@@ -68,7 +70,8 @@ def build(self):
         from .modeling_projector import Qwen3VLProjector

         return Qwen3VLProjector(self)
-
+
+    # Only the outermost compose module needs to support `hf_config`; the internal vision and projector modules do not.
     @property
     def hf_config(self):
         return None
@@ -130,16 +133,44 @@ def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
         return config

     @property
-    def hf_config(self):
-        # TODO(pppppM) Support saving HuggingFace format config
-        logger.warning(
-            f"{type(self)} does not support conversion to HuggingFace config format. "
-            "Only the original HuggingFace config will be retained in the saved HuggingFace format checkpoint. "
-            f"If you have changed the default values in {type(self)}, it may cause the config in the saved "
-            "HuggingFace format checkpoint to not match the weights."
-        )
-        return None
-
+    def hf_config(self) -> HFQwen3VLConfig | Qwen3VLMoeConfig:
+        text_config = self.text_config.hf_config
+        if isinstance(self.text_config, Qwen3MoEConfig):
+            vision_config = Qwen3VLMoeVisionConfig(
+                depth=self.vision_config.depth,
+                hidden_size=self.vision_config.hidden_size,
+                intermediate_size=self.vision_config.intermediate_size,
+                num_heads=self.vision_config.num_attention_heads,
+                deepstack_visual_indexes=self.vision_config.deepstack_visual_indexes,
+            )
+            return Qwen3VLMoeConfig(
+                architectures=["Qwen3VLMoeForConditionalGeneration"],
+                image_token_id=self.image_token_id,
+                video_token_id=self.video_token_id,
+                vision_start_token_id=self.vision_start_token_id,
+                vision_end_token_id=self.vision_end_token_id,
+                tie_word_embeddings=self.text_config.tie_word_embeddings,
+                text_config=text_config.to_dict(),
+                vision_config=vision_config.to_dict(),
+            )
+        else:
+            vision_config = HFQwen3VLVisionConfig(
+                depth=self.vision_config.depth,
+                hidden_size=self.vision_config.hidden_size,
+                intermediate_size=self.vision_config.intermediate_size,
+                num_heads=self.vision_config.num_attention_heads,
+                deepstack_visual_indexes=self.vision_config.deepstack_visual_indexes,
+            )
+            return HFQwen3VLConfig(
+                architectures=["Qwen3VLForConditionalGeneration"],
+                image_token_id=self.image_token_id,
+                video_token_id=self.video_token_id,
+                vision_start_token_id=self.vision_start_token_id,
+                vision_end_token_id=self.vision_end_token_id,
+                tie_word_embeddings=self.text_config.tie_word_embeddings,
+                text_config=text_config.to_dict(),
+                vision_config=vision_config.to_dict(),
+            )

 class Qwen3VLMoE30BA3Config(Qwen3VLBaseConfig):
     vision_config: Qwen3VLVisionConfig = Qwen3VLVisionConfig()
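With `hf_config` now returning a real composite config, exporting it is a plain transformers round trip. A sketch (paths are placeholders):

    from xtuner.v1.model import get_model_config_from_hf

    model_cfg = get_model_config_from_hf("/path/to/Qwen3-VL-30B-A3B")
    hf_cfg = model_cfg.hf_config           # Qwen3VLMoeConfig for a MoE checkpoint
    hf_cfg.save_pretrained("/tmp/export")  # writes config.json for the exported weights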
diff --git a/xtuner/v1/model/dense/qwen3.py b/xtuner/v1/model/dense/qwen3.py
index 0525ad15d..034b834d5 100644
--- a/xtuner/v1/model/dense/qwen3.py
+++ b/xtuner/v1/model/dense/qwen3.py
@@ -100,6 +100,11 @@ def hf_config(self) -> HFQwen3DenseConfig:
             sliding_window=self.attention.sliding_window,
             use_sliding_window=self.use_sliding_window,
             tie_word_embeddings=self.tie_word_embeddings,
+            rope_scaling={
+                "rope_type": "default",
+                "mrope_section": self.rope_scaling_cfg.mrope_section,
+                "mrope_interleaved": True,
+            } if self.rope_scaling_cfg is not None else None,
             dtype=torch.bfloat16,
         )
diff --git a/xtuner/v1/model/moe/qwen3.py b/xtuner/v1/model/moe/qwen3.py
index 73f2bfd67..f3ac216e3 100644
--- a/xtuner/v1/model/moe/qwen3.py
+++ b/xtuner/v1/model/moe/qwen3.py
@@ -121,7 +122,6 @@ def hf_config(self) -> HFQwen3MoeConfig:
             rms_norm_eps=self.rms_norm_eps,
             model_type=self.model_type,
             rope_theta=self.rope_theta,
-            rope_scaling=self.rope_scaling_cfg.model_dump() if self.rope_scaling_cfg is not None else None,
             hidden_act=self.hidden_act,
             num_attention_heads=self.attention.num_attention_heads,
             num_key_value_heads=self.attention.num_key_value_heads,
@@ -132,6 +132,11 @@ def hf_config(self) -> HFQwen3MoeConfig:
             num_experts=self.n_routed_experts,
             num_experts_per_tok=self.num_experts_per_tok,
             norm_topk_prob=self.router.norm_topk_prob,
+            rope_scaling={
+                "rope_type": "default",
+                "mrope_section": self.rope_scaling_cfg.mrope_section,
+                "mrope_interleaved": True,
+            } if self.rope_scaling_cfg is not None else None,
             torch_dtype=torch.bfloat16,  # TODO: update all outdated hf `dtype` kwarg @jayhenry
         )
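Taken together, the two patches close the loop: a checkpoint can be parsed into an XTuner config, built, and saved back out with a matching config.json. A sketch of the full flow, assuming `save_hf` is the model-side entry point exercised by `test_save_hf` (paths are placeholders):

    import torch

    from xtuner.v1.model import get_model_config_from_hf

    ckpt = "/path/to/Qwen3-VL-4B-Instruct"
    with torch.device("meta"):
        model_cfg = get_model_config_from_hf(ckpt)
        model = model_cfg.build().to(torch.bfloat16)
    model.from_hf(ckpt)           # materialize weights from the checkpoint
    model.save_hf("/tmp/export")  # config now comes from the new hf_config property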