
Commit e73d386: support qwen1.5
Parent: 8c6f1b6

File tree: 11 files changed, +4787 -8 lines

README.md

Lines changed: 10 additions & 0 deletions
@@ -26,6 +26,9 @@ llm-export is an LLM model export tool that can export LLM models to onnx and mnn
 - [![Download][download-tinyllama-1.1b-chat-onnx]][release-tinyllama-1.1b-chat-onnx]
 - [![Download][download-yi-6b-chat-onnx]][release-yi-6b-chat-onnx]
 - [![Download][download-deepseek-7b-chat-onnx]][release-deepseek-7b-chat-onnx]
+- [![Download][download-qwen1.5-0.5b-chat-onnx]][release-qwen1.5-0.5b-chat-onnx]
+- [![Download][download-qwen1.5-1.8b-chat-onnx]][release-qwen1.5-1.8b-chat-onnx]
+- [![Download][download-qwen1.5-4b-chat-onnx]][release-qwen1.5-4b-chat-onnx]
 
 [download-chatglm-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm-6b-onnx/total
 [download-chatglm2-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm2-6b-onnx/total
@@ -42,6 +45,9 @@ llm-export is an LLM model export tool that can export LLM models to onnx and mnn
 [download-tinyllama-1.1b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/tinyllama-1.1b-chat-onnx/total
 [download-yi-6b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/yi-6b-chat-onnx/total
 [download-deepseek-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/deepseek-7b-chat-onnx/total
+[download-qwen1.5-0.5b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen1.5-0.5b-chat-onnx/total
+[download-qwen1.5-1.8b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen1.5-1.8b-chat-onnx/total
+[download-qwen1.5-4b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen1.5-4b-chat-onnx/total
 [release-chatglm-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm-6b-onnx
 [release-chatglm2-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm2-6b-onnx
 [release-chatglm3-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm3-6b-onnx
@@ -57,6 +63,9 @@ llm-export is an LLM model export tool that can export LLM models to onnx and mnn
 [release-tinyllama-1.1b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/tinyllama-1.1b-chat-onnx
 [release-yi-6b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/yi-6b-chat-onnx
 [release-deepseek-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/deepseek-7b-chat-onnx
+[release-qwen1.5-0.5b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen1.5-0.5b-chat-onnx
+[release-qwen1.5-1.8b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen1.5-1.8b-chat-onnx
+[release-qwen1.5-4b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen1.5-4b-chat-onnx
 
 ## Usage
 1. Clone this project to your local machine
@@ -134,6 +143,7 @@ optional arguments:
   --export_block EXPORT_BLOCK
                         export llm block [id] to an `onnx` model.
   --export_blocks       export llm all blocks to `onnx` models.
+  --embed_bin           export embedding weight as bin file with dtype `bfloat16`
   --embed_bf16          using `bfloat16` replace `float32` in embedding.
   --skip_slim           Whether or not to skip onnx-slim.
 ```
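The new `--embed_bin` option, like `--embed_bf16` and `--skip_slim`, is a plain boolean switch. A minimal argparse sketch of how such flags are typically declared and read (illustrative only, not copied from the actual llm_export.py parser):

```python
import argparse

# Hypothetical parser mirroring the boolean export switches documented above.
parser = argparse.ArgumentParser(description='llm export flags (sketch)')
parser.add_argument('--embed_bin', action='store_true',
                    help='export embedding weight as bin file with dtype `bfloat16`')
parser.add_argument('--embed_bf16', action='store_true',
                    help='using `bfloat16` replace `float32` in embedding.')
parser.add_argument('--skip_slim', action='store_true',
                    help='Whether or not to skip onnx-slim.')

args = parser.parse_args(['--embed_bin', '--skip_slim'])
print(args.embed_bin, args.embed_bf16, args.skip_slim)  # True False True
```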

llm_export.py

Lines changed: 15 additions & 8 deletions
@@ -766,16 +766,17 @@ def visual_embed(self, input_ids):
         return hidden_states.view(-1, 1, self.hidden_size)
 
 class QWEN2Block(torch.nn.Module):
-    def __init__(self, name, block, block_id, hidden_size, final_layernorm = None):
+    def __init__(self, name, block, block_id, hidden_size, head_dim, final_layernorm = None):
         super().__init__()
         self.name = name
         self.block = block
         self.block_id = block_id
         self.final_layernorm = final_layernorm
         self.hidden_size = hidden_size
+        self.head_dim = head_dim
 
     def forward(self, hidden_states, attention_mask, position_ids, past_kv):
-        theta = 1.0 / (10000.0 ** (torch.arange(0, 128, 2, dtype=torch.float32) / 128))
+        theta = 1.0 / (10000.0 ** (torch.arange(0, self.head_dim, 2, dtype=torch.float32) / self.head_dim))
         position_ids = position_ids.float().reshape(-1, 1)
         idx_theta = position_ids * theta
         rotary_pos_emb = torch.cat((idx_theta, idx_theta), dim=-1)
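The hunk above replaces the hard-coded head dimension of 128 with `self.head_dim`, which matters for the smaller Qwen1.5 checkpoints (for example 0.5B: hidden_size 1024 over 16 heads gives head_dim 64). A standalone sketch of the same rotary-angle computation, assuming only torch; note the bundled config.json sets `rope_theta` to 1000000.0, so the base could also be taken from the config rather than the fixed 10000.0 used here:

```python
import torch

def rope_angles(position_ids: torch.Tensor, head_dim: int, base: float = 10000.0) -> torch.Tensor:
    """Rotary position angles for one attention head of width `head_dim`."""
    # One inverse frequency per channel pair, i.e. head_dim // 2 values.
    theta = 1.0 / (base ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
    idx_theta = position_ids.float().reshape(-1, 1) * theta   # [seq_len, head_dim // 2]
    return torch.cat((idx_theta, idx_theta), dim=-1)          # [seq_len, head_dim]

# Qwen1.5-0.5B: hidden_size 1024, 16 heads -> head_dim 64, not the previously fixed 128.
angles = rope_angles(torch.arange(8), head_dim=1024 // 16)
print(angles.shape)  # torch.Size([8, 64])
```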
@@ -792,6 +793,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv):
             hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size)
         if isinstance(presents, tuple):
             presents = torch.stack(presents)
+        # print('###', presents.shape)
         return hidden_states, presents
 
 class Qwen2_Chat(LLM):
@@ -801,8 +803,9 @@ def __init__(self, args):
     def load_model(self, model_path: str):
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float().eval()
+        self.config = model.config
         # Qwen2 models
-        self.model_name = 'Qwen2-7B'
+        self.model_name = 'Qwen2'
         transformer = model.model
         self.lm_ = model.lm_head
         self.embed_ = transformer.embed_tokens
@@ -814,13 +817,14 @@ def load_model(self, model_path: str):
             self.stop_ids.append(self.stop_id)
         for id in model.generation_config.eos_token_id:
             self.stop_ids.append(id)
-        self.block_nums = len(self.blocks_)
-        self.hidden_size = self.embed_.weight.shape[-1]
+        self.block_nums = self.config.num_hidden_layers
+        self.hidden_size = self.config.hidden_size
+        self.num_heads = self.config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
         self.embed = Embedding(self.embed_, self.embed_bf16)
         self.lm = Lm(self.lm_)
-        self.blocks = [QWEN2Block(self.model_name, self.blocks_[i], i, self.hidden_size, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)]
-        # 4b
-        self.past_kv_shape = [self.block_nums, 2, 1, 20, 0, 128]
+        self.past_kv_shape = [self.block_nums, 2, 1, self.num_heads, 0, self.head_dim]
+        self.blocks = [QWEN2Block(self.model_name, self.blocks_[i], i, self.hidden_size, self.head_dim, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)]
         # some config for export
         self.block_dynamic_axes = {
             "inputs_embeds" : { 0: "seq_len" },
@@ -1181,7 +1185,10 @@ def export(self):
     'Qwen-7B-Chat': Qwen_Chat,
     'Qwen-1_8B-Chat': Qwen_Chat,
     'Qwen-VL-Chat': Qwen_Chat,
+    'Qwen1_5-0_5B-Chat': Qwen2_Chat,
+    'Qwen1_5-1_8B-Chat': Qwen2_Chat,
     'Qwen1_5-4B-Chat': Qwen2_Chat,
+    'Qwen1_5-7B-Chat': Qwen2_Chat,
     'Baichuan2-7B-Chat': Llama2_7b_Chat,
     'Llama-2-7b-chat-ms': Llama2_7b_Chat,
     'internlm-chat-7b': Llama2_7b_Chat,
config.json (Qwen1.5-0.5B-Chat)

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
{
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "configuration_qwen2.Qwen2Config",
    "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
  },
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 2816,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 16,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.37.0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
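The values above describe Qwen1.5-0.5B-Chat: 24 layers, 16 heads over a 1024-wide hidden state, and tied input/output embeddings. A quick sketch of pulling the exporter-relevant numbers straight out of such a file (assumes it is saved as `config.json` in the working directory; in a real checkpoint it sits next to the model weights):

```python
import json

with open("config.json") as f:
    cfg = json.load(f)

head_dim = cfg["hidden_size"] // cfg["num_attention_heads"]          # 1024 // 16 = 64
uses_gqa = cfg["num_key_value_heads"] < cfg["num_attention_heads"]   # False: plain multi-head attention
print(head_dim, uses_gqa, cfg["tie_word_embeddings"])                # 64 False True
```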
configuration_qwen2.py

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Qwen2 model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)

QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json",
}


class Qwen2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
    Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Qwen2Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22016):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 32):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import Qwen2Model, Qwen2Config

    >>> # Initializing a Qwen2 style configuration
    >>> configuration = Qwen2Config()

    >>> # Initializing a model from the Qwen2-7B style configuration
    >>> model = Qwen2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
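As a sanity check, the class above can be instantiated with the 0.5B values from the bundled config.json (a sketch; it assumes the file above is importable locally as `configuration_qwen2`, matching its `auto_map` entry):

```python
from configuration_qwen2 import Qwen2Config  # local module added by this commit

config = Qwen2Config(
    vocab_size=151936,
    hidden_size=1024,
    intermediate_size=2816,
    num_hidden_layers=24,
    num_attention_heads=16,
    num_key_value_heads=16,
    max_position_embeddings=32768,
    rope_theta=1000000.0,
    tie_word_embeddings=True,
)
print(config.hidden_size // config.num_attention_heads)  # head_dim = 64
```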
