wangzhaode
diff --git a/README.md b/README.md
Lines changed: 10 additions & 0 deletions
diff --git a/llm_export.py b/llm_export.py
Lines changed: 15 additions & 8 deletions
diff --git a/llm_models/Qwen1_5-0_5B-Chat/config.json b/llm_models/Qwen1_5-0_5B-Chat/config.json
Lines changed: 32 additions & 0 deletions
diff --git a/llm_models/Qwen1_5-0_5B-Chat/configuration_qwen2.py b/llm_models/Qwen1_5-0_5B-Chat/configuration_qwen2.py
Lines changed: 144 additions & 0 deletions
@@ -26,6 +26,9 @@ llm-export是一个llm模型导出工具，能够将llm模型导出为onnx和mnn
 - [![Download][download-tinyllama-1.1b-chat-onnx]][release-tinyllama-1.1b-chat-onnx]
 - [![Download][download-yi-6b-chat-onnx]][release-yi-6b-chat-onnx]
 - [![Download][download-deepseek-7b-chat-onnx]][release-deepseek-7b-chat-onnx]
+- [![Download][download-qwen1.5-0.5b-chat-onnx]][release-qwen1.5-0.5b-chat-onnx]
+- [![Download][download-qwen1.5-1.8b-chat-onnx]][release-qwen1.5-1.8b-chat-onnx]
+- [![Download][download-qwen1.5-4b-chat-onnx]][release-qwen1.5-4b-chat-onnx]
 
 [download-chatglm-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm-6b-onnx/total
 [download-chatglm2-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm2-6b-onnx/total
@@ -42,6 +45,9 @@ llm-export是一个llm模型导出工具，能够将llm模型导出为onnx和mnn
 [download-tinyllama-1.1b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/tinyllama-1.1b-chat-onnx/total
 [download-yi-6b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/yi-6b-chat-onnx/total
 [download-deepseek-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/deepseek-7b-chat-onnx/total
+[download-qwen1.5-0.5b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen1.5-0.5b-chat-onnx/total
+[download-qwen1.5-1.8b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen1.5-1.8b-chat-onnx/total
+[download-qwen1.5-4b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen1.5-4b-chat-onnx/total
 [release-chatglm-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm-6b-onnx
 [release-chatglm2-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm2-6b-onnx
 [release-chatglm3-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm3-6b-onnx
@@ -57,6 +63,9 @@ llm-export是一个llm模型导出工具，能够将llm模型导出为onnx和mnn
 [release-tinyllama-1.1b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/tinyllama-1.1b-chat-onnx
 [release-yi-6b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/yi-6b-chat-onnx
 [release-deepseek-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/deepseek-7b-chat-onnx
+[release-qwen1.5-0.5b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen1.5-0.5b-chat-onnx
+[release-qwen1.5-1.8b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen1.5-1.8b-chat-onnx
+[release-qwen1.5-4b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen1.5-4b-chat-onnx
 
 ## 用法
 1. 将该项目clone到本地
@@ -134,6 +143,7 @@ optional arguments:
   --export_block EXPORT_BLOCK
                         export llm block [id] to an `onnx` model.
   --export_blocks       export llm all blocks to `onnx` models.
+  --embed_bin           export embedding weight as bin file with dtype `bfloat16`
   --embed_bf16          using `bfloat16` replace `float32` in embedding.
   --skip_slim           Whether or not to skip onnx-slim.
 ```
@@ -766,16 +766,17 @@ def visual_embed(self, input_ids):
         return hidden_states.view(-1, 1, self.hidden_size)
 
 class QWEN2Block(torch.nn.Module):
-    def __init__(self, name, block, block_id, hidden_size, final_layernorm = None):
+    def __init__(self, name, block, block_id, hidden_size, head_dim, final_layernorm = None):
         super().__init__()
         self.name = name
         self.block = block
         self.block_id = block_id
         self.final_layernorm = final_layernorm
         self.hidden_size = hidden_size
+        self.head_dim = head_dim
 
     def forward(self, hidden_states, attention_mask, position_ids, past_kv):
-        theta = 1.0 / (10000.0 ** (torch.arange(0, 128, 2, dtype=torch.float32) / 128))
+        theta = 1.0 / (10000.0 ** (torch.arange(0, self.head_dim, 2, dtype=torch.float32) / self.head_dim))
         position_ids = position_ids.float().reshape(-1, 1)
         idx_theta = position_ids * theta
         rotary_pos_emb = torch.cat((idx_theta, idx_theta), dim=-1)
@@ -792,6 +793,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv):
             hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size)
         if isinstance(presents, tuple):
             presents = torch.stack(presents)
+        # print('###', presents.shape)
         return hidden_states, presents
 
 class Qwen2_Chat(LLM):
@@ -801,8 +803,9 @@ def __init__(self, args):
     def load_model(self, model_path: str):
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float().eval()
+        self.config = model.config
         # Qwen2 models
-        self.model_name = 'Qwen2-7B'
+        self.model_name = 'Qwen2'
         transformer = model.model
         self.lm_ = model.lm_head
         self.embed_ = transformer.embed_tokens
@@ -814,13 +817,14 @@ def load_model(self, model_path: str):
             self.stop_ids.append(self.stop_id)
             for id in model.generation_config.eos_token_id:
                 self.stop_ids.append(id)
-        self.block_nums = len(self.blocks_)
-        self.hidden_size = self.embed_.weight.shape[-1]
+        self.block_nums = self.config.num_hidden_layers
+        self.hidden_size = self.config.hidden_size
+        self.num_heads = self.config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
         self.embed = Embedding(self.embed_, self.embed_bf16)
         self.lm = Lm(self.lm_)
-        self.blocks = [QWEN2Block(self.model_name, self.blocks_[i], i, self.hidden_size, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)]
-        # 4b
-        self.past_kv_shape = [self.block_nums, 2, 1, 20, 0, 128]
+        self.past_kv_shape = [self.block_nums, 2, 1, self.num_heads, 0, self.head_dim]
+        self.blocks = [QWEN2Block(self.model_name, self.blocks_[i], i, self.hidden_size, self.head_dim, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)]
         # some config for export
         self.block_dynamic_axes = {
             "inputs_embeds" : { 0: "seq_len" },
@@ -1181,7 +1185,10 @@ def export(self):
         'Qwen-7B-Chat': Qwen_Chat,
         'Qwen-1_8B-Chat': Qwen_Chat,
         'Qwen-VL-Chat': Qwen_Chat,
+        'Qwen1_5-0_5B-Chat': Qwen2_Chat,
+        'Qwen1_5-1_8B-Chat': Qwen2_Chat,
         'Qwen1_5-4B-Chat': Qwen2_Chat,
+        'Qwen1_5-7B-Chat': Qwen2_Chat,
         'Baichuan2-7B-Chat': Llama2_7b_Chat,
         'Llama-2-7b-chat-ms': Llama2_7b_Chat,
         'internlm-chat-7b': Llama2_7b_Chat,
 
@@ -0,0 +1,32 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_qwen2.Qwen2Config",
+    "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
+  },
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 2816,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 16,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.37.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+
@@ -0,0 +1,144 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Qwen2 model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+# Logger obtained from the transformers logging utilities, named after this module.
+logger = logging.get_logger(__name__)
+
+# Maps a pretrained checkpoint identifier to the URL of its config.json.
+QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json",
+}
+
+
+class Qwen2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
+    Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Qwen2Model`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22016):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by mean-pooling all the original heads within that group. For more details check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        use_sliding_window (`bool`, *optional*, defaults to `False`):
+            Whether to use sliding window attention.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
+        max_window_layers (`int`, *optional*, defaults to 28):
+            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    ```python
+    >>> from transformers import Qwen2Model, Qwen2Config
+
+    >>> # Initializing a Qwen2 style configuration
+    >>> configuration = Qwen2Config()
+
+    >>> # Initializing a model from the Qwen2-7B style configuration
+    >>> model = Qwen2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    # Same value as the "model_type" entry of the config.json added alongside this file.
+    model_type = "qwen2"
+    # NOTE(review): not read anywhere in this file; presumably consumed by the transformers base class - confirm.
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=28,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        """Store the given hyperparameters as attributes of this configuration.
+
+        Every named argument except `tie_word_embeddings` is saved on `self` under the same name.
+        `tie_word_embeddings` and any extra keyword arguments are forwarded to the
+        `PretrainedConfig` constructor. See the class docstring for the meaning of each argument.
+        """
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window
+        self.max_window_layers = max_window_layers
+
+        # for backward compatibility: an explicit `None` falls back to the number of attention heads
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+
+        # `tie_word_embeddings` is not stored here; the base class receives it with the remaining kwargs.
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )