Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions tests/model/test_qwen3_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
MixedPrecisionPolicy,
fully_shard,
)
from xtuner.v1.model import get_model_config_from_hf
from xtuner.v1.utils.test_utils import compare_pydantic_models

QWEN3_VL_MOE_PATH = os.environ["QWEN3_VL_MOE_PATH"]
QWEN3_VL_DENSE_PATH = os.environ["QWEN3_VL_DENSE_PATH"]
Expand Down Expand Up @@ -186,9 +188,14 @@ def test_qwen3vl_run(self, device, sp_size, tol):
device_map="cpu"
).eval()
patch_hf_rms_norm(hf_model)

with torch.device("meta"):
model_cfg = Qwen3VLDense4BConfig(compile_cfg=False)
model_cfg = get_model_config_from_hf(QWEN3_VL_DENSE_PATH)
model_cfg.compile_cfg = False

model_cfg_origin = Qwen3VLDense4BConfig(compile_cfg=False)
self.assertTrue(compare_pydantic_models(model_cfg, model_cfg_origin))

qwen3vl_model = model_cfg.build().to(torch.bfloat16)

qwen3vl_model.from_hf(QWEN3_VL_DENSE_PATH)
Expand Down Expand Up @@ -220,9 +227,12 @@ def test_fsdp_qwen3_run(self, device, sp_size, compile, tol):
patch_hf_rms_norm(hf_model)

with torch.device("meta"):
model_cfg = Qwen3VLDense4BConfig()
model_cfg = get_model_config_from_hf(QWEN3_VL_DENSE_PATH)
model_cfg_origin = Qwen3VLDense4BConfig()
if compile is False:
model_cfg.compile_cfg = False
model_cfg_origin.compile_cfg = False
self.assertTrue(compare_pydantic_models(model_cfg, model_cfg_origin))
qwen3vl_model = model_cfg.build().to(torch.bfloat16)

fsdp_config = FSDPConfig(
Expand Down Expand Up @@ -265,7 +275,9 @@ def test_fsdp_qwen3_run(self, device, sp_size, compile, tol):
def test_save_hf(self, device, tp_size):
self.create_pg(device)
with torch.device("meta"):
model_cfg = Qwen3VLMoE30BA3Config()
model_cfg = get_model_config_from_hf(QWEN3_VL_MOE_PATH)
model_cfg_origin = Qwen3VLMoE30BA3Config()
self.assertTrue(compare_pydantic_models(model_cfg, model_cfg_origin))
qwen3vl_model = model_cfg.build().to(torch.bfloat16)

fsdp_config = FSDPConfig(
Expand Down
5 changes: 4 additions & 1 deletion xtuner/v1/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@
InternVLBaseConfig,
)
from .compose.qwen3_vl import (
Qwen3VLBaseConfig,
Qwen3VLDense4BConfig,
Qwen3VLDense8BConfig,
Qwen3VLMoE30BA3Config,
Qwen3VLMoE235BA22Config,
)
from .dense.dense import Dense
from .dense.qwen2 import Qwen2Dense7BConfig, Qwen2DenseConfig
from .dense.qwen2 import Qwen2DenseConfig
from .dense.qwen3 import Qwen3Dense0P6BConfig, Qwen3Dense4BConfig, Qwen3Dense8BConfig, Qwen3DenseConfig
from .moe.deepseek_v3 import DeepSeekV3Config
from .moe.gpt_oss import GptOss21BA3P6Config, GptOss117BA5P8Config, GptOssConfig
Expand Down Expand Up @@ -61,6 +62,8 @@ def get_model_config_from_hf(model_path: Path):
return GptOssConfig.from_hf(model_path)
elif cfg.model_type == "deepseek_v3":
return DeepSeekV3Config.from_hf(model_path)
elif cfg.model_type == "qwen3_vl_moe" or cfg.model_type == "qwen3_vl":
return Qwen3VLBaseConfig.from_hf(model_path)
else:
raise ValueError(f"Unsupported model type: {cfg.model_type}")

Expand Down
19 changes: 16 additions & 3 deletions xtuner/v1/model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from shutil import copy, copytree
from typing import Annotated, Generator, Iterable, Literal, Mapping, cast

from typing import overload
import torch
import torch.distributed as dist
import torch.nn as nn
Expand Down Expand Up @@ -157,9 +158,19 @@ def layers_type(self) -> list[Literal["full_attention", "sliding_attention"]]:

def build(self) -> "BaseModel":
raise NotImplementedError

# Typing-only overloads: callers pass EITHER a checkpoint path OR an
# already-loaded HuggingFace config. They have no runtime effect; the
# implementation that follows replaces them.
@overload
@classmethod
def from_hf(cls, hf_path: str | Path) -> Self:
    """Overload: build the config from a HuggingFace model path."""
    ...

@overload
@classmethod
def from_hf(cls, *, hf_config: PretrainedConfig) -> Self:
    """Overload: build the config from a pre-loaded HuggingFace config.

    `hf_config` is keyword-only because the first positional parameter of
    the implementation is `hf_path`.
    """
    ...

@classmethod
def from_hf(cls, hf_path: str | Path) -> Self:
def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
"""Build a `TransformerConfig` from a pre-trained HuggingFace model.

This method creates a configuration object based on a `PretrainedConfig` loaded from the specified HuggingFace model path.
Expand All @@ -168,9 +179,11 @@ def from_hf(cls, hf_path: str | Path) -> Self:
Note:
The `hf_config` field needs to be set to the `PretrainedConfig` object loaded from `hf_path`,
otherwise it cannot be saved in HuggingFace format.
The `hf_config` parameter accepts an already-loaded configuration so that it does not have to be loaded again; it is intended only for compose models.

Args:
hf_path (str | Path): Path to the HuggingFace model.
hf_path (str | Path | None): Path to the HuggingFace model.
hf_config (PretrainedConfig | None): Optional pre-loaded HuggingFace configuration. Intended only for compose models.

Returns:
TransformerConfig: A configuration object populated with values from the pre-trained model.
Expand All @@ -179,7 +192,7 @@ def from_hf(cls, hf_path: str | Path) -> Self:
NotImplementedError: This method must be implemented by subclasses.
"""
raise NotImplementedError

@property
def hf_config(self) -> PretrainedConfig | None:
"""HuggingFace configuration."""
Expand Down
3 changes: 2 additions & 1 deletion xtuner/v1/model/compose/intern_s1/intern_s1_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pydantic import ConfigDict
from typing_extensions import Self

from transformers import PretrainedConfig
from xtuner.v1.model.base import XTunerBaseModelConfig
from xtuner.v1.model.dense.qwen3 import Qwen3Dense8BConfig
from xtuner.v1.model.moe.moe import MoEConfig, TransformerConfig
Expand Down Expand Up @@ -113,7 +114,7 @@ def build(self) -> "InternS1ForConditionalGeneration":
return InternS1ForConditionalGeneration(self)

@classmethod
def from_hf(cls, hf_path: str | Path) -> Self:
def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
raise NotImplementedError

@property
Expand Down
3 changes: 2 additions & 1 deletion xtuner/v1/model/compose/qwen3_vl/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .modeling_qwen3_vl import Qwen3VLForConditionalGeneration
from .qwen3_vl_config import Qwen3VLDense4BConfig, Qwen3VLDense8BConfig, Qwen3VLMoE30BA3Config, Qwen3VLMoE235BA22Config
from .qwen3_vl_config import Qwen3VLBaseConfig, Qwen3VLDense4BConfig, Qwen3VLDense8BConfig, Qwen3VLMoE30BA3Config, Qwen3VLMoE235BA22Config


__all__ = [
Expand All @@ -8,4 +8,5 @@
"Qwen3VLDense4BConfig",
"Qwen3VLDense8BConfig",
"Qwen3VLMoE235BA22Config",
"Qwen3VLBaseConfig",
]
102 changes: 84 additions & 18 deletions xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,18 @@
from mmengine import is_installed
from pydantic import ConfigDict
from typing_extensions import Self
from pydantic import SerializeAsAny

from xtuner.v1.model.base import TransformerConfig, XTunerBaseModelConfig
from xtuner.v1.model.dense.qwen3vl_text import Qwen3VLTextDense4BConfig, Qwen3VLTextDense8BConfig
from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config, Qwen3MoE235BA22Config
from xtuner.v1.model.moe.qwen3vl_text import Qwen3VLTextMoE30BA3Config, Qwen3VLTextMoE235BA22Config
from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config, Qwen3MoE235BA22Config, Qwen3MoEConfig
from xtuner.v1.model.dense.qwen3vl_text import Qwen3VLTextBaseConfig
from xtuner.v1.model.moe.qwen3vl_text import Qwen3VLTextMoE30BA3Config, Qwen3VLTextMoE235BA22Config, Qwen3VLTextMoEBaseConfig
from xtuner.v1.module.rope import RopeScalingConfig
from xtuner.v1.utils import get_device, get_logger

from transformers import AutoConfig, PretrainedConfig
from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig,Qwen3VLTextConfig, Qwen3VLVisionConfig
from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig, Qwen3VLMoeTextConfig, Qwen3VLMoeVisionConfig
from ..base import BaseComposeConfig


Expand Down Expand Up @@ -48,7 +52,8 @@ def build(self):
from .modeling_vision import Qwen3VLVisionModel

return Qwen3VLVisionModel(self)


# Only the outermost module needs to support hf_config; the internal vision and projector modules do not need it.
@property
def hf_config(self):
return None
Expand All @@ -65,7 +70,8 @@ def build(self):
from .modeling_projector import Qwen3VLProjector

return Qwen3VLProjector(self)


# Only the outermost module needs to support hf_config; the internal vision and projector modules do not need it.
@property
def hf_config(self):
return None
Expand All @@ -78,7 +84,7 @@ class Qwen3VLBaseConfig(BaseComposeConfig):
)
vision_config: Qwen3VLVisionConfig
projector_config: Qwen3VLProjectorConfig
text_config: TransformerConfig
text_config: SerializeAsAny[TransformerConfig]

image_token_id: int = 151655
video_token_id: int = 151656
Expand All @@ -95,25 +101,82 @@ def build(self):
return Qwen3VLForConditionalGeneration(self)

@classmethod
def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
    """Build a Qwen3-VL compose config from a HuggingFace checkpoint or config.

    Args:
        hf_path (str | Path | None): Path to the HuggingFace model. When given,
            the configuration is loaded from it and takes precedence over
            `hf_config`.
        hf_config (PretrainedConfig | None): Pre-loaded HuggingFace Qwen3-VL
            configuration, used only when `hf_path` is not given.

    Returns:
        Self: A config whose vision, projector and text sub-configs and
        special token ids are taken from the HuggingFace configuration.

    Raises:
        ValueError: If neither `hf_path` nor `hf_config` is provided, or if
            the HuggingFace `model_type` is not `qwen3_vl` / `qwen3_vl_moe`.
    """
    if hf_path is not None:
        hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
    if hf_config is None:
        raise ValueError("Either `hf_path` or `hf_config` must be provided.")

    hf_vision_config = hf_config.vision_config
    vision_config = Qwen3VLVisionConfig(
        depth=hf_vision_config.depth,
        hidden_size=hf_vision_config.hidden_size,
        intermediate_size=hf_vision_config.intermediate_size,
        num_attention_heads=hf_vision_config.num_heads,
        deepstack_visual_indexes=hf_vision_config.deepstack_visual_indexes,
    )
    projector_config = Qwen3VLProjectorConfig(
        vision_hidden_size=hf_vision_config.hidden_size,
        text_hidden_size=hf_config.text_config.hidden_size,
        deepstack_visual_indexes=hf_vision_config.deepstack_visual_indexes,
    )

    # The text sub-config is delegated to the matching text config class,
    # which receives the already-loaded HuggingFace text config.
    if hf_config.model_type == "qwen3_vl_moe":
        text_config = Qwen3VLTextMoEBaseConfig.from_hf(hf_config=hf_config.text_config)
    elif hf_config.model_type == "qwen3_vl":
        text_config = Qwen3VLTextBaseConfig.from_hf(hf_config=hf_config.text_config)
    else:
        # Without this branch `text_config` would be unbound below.
        raise ValueError(f"Unsupported model type: {hf_config.model_type}")

    return cls(
        vision_config=vision_config,
        projector_config=projector_config,
        text_config=text_config,
        image_token_id=hf_config.image_token_id,
        video_token_id=hf_config.video_token_id,
        vision_start_token_id=hf_config.vision_start_token_id,
        vision_end_token_id=hf_config.vision_end_token_id,
    )

@property
def hf_config(self) -> Qwen3VLConfig | Qwen3VLMoeConfig:
    """Convert this compose config to the equivalent HuggingFace config.

    Returns:
        Qwen3VLMoeConfig when `text_config` is a `Qwen3VLTextMoEBaseConfig`,
        otherwise Qwen3VLConfig. The text part comes from
        `self.text_config.hf_config`; the vision part is rebuilt from
        `self.vision_config`.
    """
    # NOTE(review): the module-level name `Qwen3VLVisionConfig` appears to
    # refer to the XTuner vision config (it is constructed with
    # `num_attention_heads=` elsewhere in this file), so the HuggingFace
    # class is imported under an explicit alias here - confirm.
    from transformers.models.qwen3_vl.configuration_qwen3_vl import (
        Qwen3VLVisionConfig as HFQwen3VLVisionConfig,
    )

    if isinstance(self.text_config, Qwen3VLTextMoEBaseConfig):
        hf_config_cls = Qwen3VLMoeConfig
        hf_vision_cls = Qwen3VLMoeVisionConfig
        architecture = "Qwen3VLMoeForConditionalGeneration"
    else:
        hf_config_cls = Qwen3VLConfig
        hf_vision_cls = HFQwen3VLVisionConfig
        architecture = "Qwen3VLForConditionalGeneration"

    hf_vision_config = hf_vision_cls(
        depth=self.vision_config.depth,
        hidden_size=self.vision_config.hidden_size,
        intermediate_size=self.vision_config.intermediate_size,
        num_heads=self.vision_config.num_attention_heads,
        deepstack_visual_indexes=self.vision_config.deepstack_visual_indexes,
    )
    hf_text_config = self.text_config.hf_config

    # Sub-configs are passed as plain dicts in both branches, as the MoE
    # branch already did, so the two paths behave the same way.
    return hf_config_cls(
        architectures=[architecture],
        image_token_id=self.image_token_id,
        video_token_id=self.video_token_id,
        vision_start_token_id=self.vision_start_token_id,
        vision_end_token_id=self.vision_end_token_id,
        tie_word_embeddings=self.text_config.tie_word_embeddings,
        text_config=hf_text_config.to_dict(),
        vision_config=hf_vision_config.to_dict(),
    )

class Qwen3VLMoE30BA3Config(Qwen3VLBaseConfig):
vision_config: Qwen3VLVisionConfig = Qwen3VLVisionConfig()
projector_config: Qwen3VLProjectorConfig = Qwen3VLProjectorConfig()
text_config: Qwen3MoE30BA3Config = Qwen3VLTextMoE30BA3Config(
model_type="qwen3_vl_moe_text",
max_position_embeddings=262144,
rope_theta=5000000,
rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=[24, 20, 20]),
Expand All @@ -124,6 +187,7 @@ class Qwen3VLMoE235BA22Config(Qwen3VLBaseConfig):
vision_config: Qwen3VLVisionConfig = Qwen3VLVisionConfig()
projector_config: Qwen3VLProjectorConfig = Qwen3VLProjectorConfig(text_hidden_size=4096)
text_config: Qwen3MoE235BA22Config = Qwen3VLTextMoE235BA22Config(
model_type="qwen3_vl_moe_text",
max_position_embeddings=262144,
rope_theta=5000000,
rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=[24, 20, 20]),
Expand All @@ -138,6 +202,7 @@ class Qwen3VLDense4BConfig(Qwen3VLBaseConfig):
vision_hidden_size=1024, text_hidden_size=2560, deepstack_visual_indexes=[5, 11, 17]
)
text_config: Qwen3VLTextDense4BConfig = Qwen3VLTextDense4BConfig(
model_type="qwen3_vl_text",
max_position_embeddings=262144,
rope_theta=5000000,
rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=[24, 20, 20]),
Expand All @@ -148,6 +213,7 @@ class Qwen3VLDense8BConfig(Qwen3VLBaseConfig):
vision_config: Qwen3VLVisionConfig = Qwen3VLVisionConfig()
projector_config: Qwen3VLProjectorConfig = Qwen3VLProjectorConfig(text_hidden_size=4096)
text_config: Qwen3VLTextDense8BConfig = Qwen3VLTextDense8BConfig(
model_type="qwen3_vl_text",
max_position_embeddings=262144,
rope_theta=5000000,
rope_scaling_cfg=RopeScalingConfig(type="qwen3_vl", mrope_section=[24, 20, 20]),
Expand Down
15 changes: 8 additions & 7 deletions xtuner/v1/model/dense/qwen2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import torch
from typing_extensions import Self

from transformers import PretrainedConfig
from transformers.models.qwen2 import Qwen2Config as HFQwen2DenseConfig
from xtuner.v1.model.base import TransformerConfig
from xtuner.v1.module.attention import MHAConfig
Expand Down Expand Up @@ -36,13 +37,13 @@ def build(self) -> Qwen2Dense:
return Qwen2Dense(self)

@classmethod
def from_hf(cls, hf_path: str | Path) -> Self:
from transformers import AutoConfig
from transformers.models.qwen2 import Qwen2Config as HFConfig

hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)

assert isinstance(hf_config, HFConfig)
def from_hf(cls, hf_path: str | Path | None = None, hf_config: PretrainedConfig | None = None) -> Self:
if hf_path is not None:
from transformers import AutoConfig
from transformers.models.qwen2 import Qwen2Config as HFConfig
hf_config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
assert isinstance(hf_config, HFConfig)
assert hf_config is not None and isinstance(hf_config, PretrainedConfig)

config = cls(
vocab_size=hf_config.vocab_size,
Expand Down
Loading
Loading