@@ -14,6 +14,7 @@ class Embedding(torch.nn.Module):
     def __init__(self, embed, using_bf16: bool = False):
         super().__init__()
         self.bf16 = using_bf16
+        self.embed_dim = embed.weight.shape[-1]
         if using_bf16:
             # using bf16 embedding weight
             self.embed = embed.bfloat16()
@@ -24,7 +25,7 @@ def forward(self, input_ids):
         res = self.embed(input_ids)
         if self.bf16:
             res = res.float()
-        return res.view(-1, 1, 4096)
+        return res.view(-1, 1, self.embed_dim)
 
 class Lm(torch.nn.Module):
     def __init__(self, lm):
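The embedding width is now read from the weight tensor instead of being hardcoded to 4096, so the same wrapper serves both Qwen-7B (hidden size 4096) and Qwen-1.8B (hidden size 2048, consistent with the 16 heads × 128 head dim KV shape added later in this patch). A minimal standalone sketch of the idea, with illustrative sizes rather than values taken from the checkpoint:

import torch

embed = torch.nn.Embedding(num_embeddings=151936, embedding_dim=2048)
embed_dim = embed.weight.shape[-1]        # 2048, recovered from the weight itself
res = embed(torch.tensor([[1, 2, 3]]))
print(res.view(-1, 1, embed_dim).shape)   # torch.Size([3, 1, 2048])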
@@ -52,8 +53,9 @@ def __init__(self, args):
             self.sp_model = spm.SentencePieceProcessor(tokenizer_model)
         else:
             self.sp_model = None
-        self.load_model(args.path)
         self.max_length = 1024
+        self.hidden_size = 4096
+        self.load_model(args.path)
 
     def load_model(self, model_path: str):
         raise NotImplementedError
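Reordering __init__ so that load_model runs last is what makes the new default meaningful: the base class seeds hidden_size with 4096, and a subclass's load_model (the Qwen loader below overwrites it with transformer.embed_dim) can replace it before anything is exported. A small sketch of that override pattern, with hypothetical class names:

class BaseExporter:
    def __init__(self):
        self.max_length = 1024
        self.hidden_size = 4096   # default for the 7B-class models
        self.load_model()         # runs last so subclasses can override the default

    def load_model(self):
        raise NotImplementedError

class TinyExporter(BaseExporter):
    def load_model(self):
        self.hidden_size = 2048   # in the real script this comes from the checkpoint

print(TinyExporter().hidden_size)  # 2048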
@@ -131,7 +133,7 @@ def assert_equal(self, torch_outs, onnx_outs):
131133
132134 def export_lm (self ):
133135 model = self .lm
134- hidden_states = torch .randn (1 , 4096 )
136+ hidden_states = torch .randn (1 , self . hidden_size )
135137 onnx_model = f'./{ self .export_path } /lm.onnx'
136138 torch .onnx .export (model , (hidden_states ),
137139 onnx_model ,
@@ -177,7 +179,7 @@ def export_embed(self):
     def export_block(self, block_id: int):
         self.seq_len = 3
         self.token_len = 0
-        inputs_embeds = torch.randn((self.seq_len, 1, 4096))
+        inputs_embeds = torch.randn((self.seq_len, 1, self.hidden_size))
         attention_mask = self.get_attention_mask()
         position_ids = self.get_position_ids()
         past_key_values = torch.zeros(self.past_kv_shape[1:])
@@ -294,7 +296,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv):
                                              use_cache=True)
         if self.final_layernorm is not None:
             hidden_states = self.final_layernorm(hidden_states)
-            hidden_states = hidden_states.view(-1, 4096)[-1].view(1, 1, 4096)
+            hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size)
         if isinstance(presents, tuple):
             presents = torch.stack(presents)
         return hidden_states, presents
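The reshape in the final block keeps only the last token's hidden state, which is all the lm head needs for autoregressive decoding; the change just swaps the hardcoded width for self.hidden_size. A quick shape illustration with arbitrary sizes:

import torch

hidden_size = 2048
hidden_states = torch.randn(3, 1, hidden_size)   # (seq_len, 1, hidden), as in export_block
last = hidden_states.view(-1, hidden_size)[-1].view(1, 1, hidden_size)
print(last.shape)                                # torch.Size([1, 1, 2048])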
@@ -376,7 +378,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv):
                                              rotary_pos_emb=rotary_pos_emb)
         if self.final_layernorm is not None:
             hidden_states = self.final_layernorm(hidden_states)
-            hidden_states = hidden_states.view(-1, 4096)[-1].view(1, 1, 4096)
+            hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size)
         if isinstance(presents, tuple):
             presents = torch.stack(presents)
         return hidden_states, presents
@@ -442,27 +444,55 @@ def build_prompt(self, query):
442444
443445# qwen
444446class QWENBlock (torch .nn .Module ):
445- def __init__ (self , block , block_id , final_layernorm = None ):
447+ def __init__ (self , block , block_id , hidden_size , final_layernorm = None ):
446448 super ().__init__ ()
447449 self .block = block
448450 self .block_id = block_id
449451 self .final_layernorm = final_layernorm
452+ self .hidden_size = hidden_size
450453
451454 def forward (self , hidden_states , attention_mask , position_ids , past_kv ):
452455 theta = 1.0 / (10000.0 ** (torch .arange (0 , 128 , 2 , dtype = torch .float32 ) / 128 ))
453456 position_ids = position_ids .float ().reshape (- 1 , 1 )
454457 idx_theta = position_ids * theta
455458 rotary_pos_emb = torch .cat ((idx_theta , idx_theta ), dim = - 1 )
456459 rotary_pos_emb = rotary_pos_emb .unsqueeze (1 ).unsqueeze (0 )
457- hidden_states = hidden_states .view (1 , - 1 , 4096 )
460+ hidden_states = hidden_states .view (1 , - 1 , self . hidden_size )
458461 hidden_states , presents = self .block (hidden_states ,
459462 past_kv ,
460463 attention_mask ,
461464 rotary_pos_emb ,
462465 use_cache = True )
463466 if self .final_layernorm is not None :
464467 hidden_states = self .final_layernorm (hidden_states )
465- hidden_states = hidden_states .view (- 1 , 4096 )[- 1 ].view (1 , 1 , 4096 )
468+ hidden_states = hidden_states .view (- 1 , self .hidden_size )[- 1 ].view (1 , 1 , self .hidden_size )
469+ if isinstance (presents , tuple ):
470+ presents = torch .stack (presents )
471+ return hidden_states , presents
472+
473+ class QWEN18Block (torch .nn .Module ):
474+ def __init__ (self , block , block_id , hidden_size , final_layernorm = None ):
475+ super ().__init__ ()
476+ self .block = block
477+ self .block_id = block_id
478+ self .final_layernorm = final_layernorm
479+ self .hidden_size = hidden_size
480+
481+ def forward (self , hidden_states , attention_mask , position_ids , past_kv ):
482+ theta = 1.0 / (10000.0 ** (torch .arange (0 , 128 , 2 , dtype = torch .float32 ) / 128 ))
483+ position_ids = position_ids .float ().reshape (- 1 , 1 )
484+ idx_theta = position_ids * theta
485+ rotary_pos_emb = torch .cat ((idx_theta , idx_theta ), dim = - 1 ).unsqueeze (1 ).unsqueeze (0 )
486+ rotary_pos_emb = torch .stack ([torch .cos (rotary_pos_emb ), torch .sin (rotary_pos_emb )])
487+ hidden_states = hidden_states .view (1 , - 1 , self .hidden_size )
488+ hidden_states , presents = self .block (hidden_states ,
489+ rotary_pos_emb ,
490+ past_kv ,
491+ attention_mask ,
492+ use_cache = True )
493+ if self .final_layernorm is not None :
494+ hidden_states = self .final_layernorm (hidden_states )
495+ hidden_states = hidden_states .view (- 1 , self .hidden_size )[- 1 ].view (1 , 1 , self .hidden_size )
466496 if isinstance (presents , tuple ):
467497 presents = torch .stack (presents )
468498 return hidden_states , presents
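QWEN18Block differs from QWENBlock in two ways: the rotary term is passed as a stacked cos/sin tensor rather than raw angles, and the block is called with the rotary embedding before the KV cache and attention mask, presumably to match the argument order of the 1.8B model's layer forward. A standalone sketch of the two rotary layouts for a 3-token prompt:

import torch

position_ids = torch.arange(3, dtype=torch.float32).reshape(-1, 1)
theta = 1.0 / (10000.0 ** (torch.arange(0, 128, 2, dtype=torch.float32) / 128))
idx_theta = position_ids * theta

# QWENBlock: raw angles, shape (1, seq_len, 1, 128)
angles = torch.cat((idx_theta, idx_theta), dim=-1).unsqueeze(1).unsqueeze(0)
print(angles.shape)     # torch.Size([1, 3, 1, 128])

# QWEN18Block: cos and sin stacked on a new leading axis, shape (2, 1, seq_len, 1, 128)
cos_sin = torch.stack([torch.cos(angles), torch.sin(angles)])
print(cos_sin.shape)    # torch.Size([2, 1, 3, 1, 128])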
@@ -482,11 +512,18 @@ def load_model(self, model_path: str):
         # some wrapper
         self.stop_id = self.tokenizer.im_end_id
         self.block_nums = len(self.blocks_)
+        self.hidden_size = transformer.embed_dim
         self.embed = Embedding(self.embed_, self.embed_bf16)
         self.lm = Lm(self.lm_)
-        self.blocks = [QWENBlock(self.blocks_[i], i, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)]
+        if self.block_nums == 32:
+            # qwen-7b
+            self.blocks = [QWENBlock(self.blocks_[i], i, self.hidden_size, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)]
+            self.past_kv_shape = [32, 2, 1, 0, 32, 128]
+        elif self.block_nums == 24:
+            # qwen-1.8b
+            self.blocks = [QWEN18Block(self.blocks_[i], i, self.hidden_size, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)]
+            self.past_kv_shape = [24, 2, 1, 0, 16, 128]
         # some config for export
-        self.past_kv_shape = [32, 2, 1, 0, 32, 128]
         self.block_dynamic_axes = {
             "inputs_embeds" : { 0: "seq_len" },
             "attention_mask" : { 2: "seq_len", 3: "seq_len" },
@@ -594,6 +631,7 @@ def get_position_ids(self) -> torch.Tensor:
     'chatglm3-6b': Chatglm3_6b,
     'codegeex2-6b': Chatglm2_6b,
     'Qwen-7B-Chat': Qwen_7b_Chat,
+    'Qwen-1_8B-Chat': Qwen_7b_Chat,
     'Baichuan2-7B-Chat': Llama2_7b_Chat,
     'Llama-2-7b-chat-ms': Llama2_7b_Chat
 }
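The new table entry maps 'Qwen-1_8B-Chat' onto the existing Qwen_7b_Chat exporter; the block-count check in load_model is what then routes it to QWEN18Block and the smaller KV-cache shape. A toy lookup sketch (model_dict and the stub classes here are stand-ins; the real table and exporter classes live elsewhere in the script):

class Qwen_7b_Chat: pass
class Llama2_7b_Chat: pass

model_dict = {
    'Qwen-7B-Chat': Qwen_7b_Chat,
    'Qwen-1_8B-Chat': Qwen_7b_Chat,   # new entry reuses the 7B exporter class
    'Llama-2-7b-chat-ms': Llama2_7b_Chat,
}
print(model_dict['Qwen-1_8B-Chat'].__name__)   # Qwen_7b_Chat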