
Commit a667117: open slim default.

Parent: 3bc99d7

4 files changed: 26 additions, 23 deletions


README.md (8 additions, 4 deletions)
@@ -7,6 +7,7 @@ llm-export is a tool for exporting llm models, capable of exporting llm models to onnx and mnn
 - 🚀 All passed `onnxruntime` correctness tests
 - 🚀 Optimized the original code to support dynamic shapes
 - 🚀 Optimized the original code to reduce the constant portion
+- 🚀 Use [OnnxSlim](https://github.com/WeLoveAI/OnnxSlim) to optimize the onnx model, improving performance by about 5%; by [@inisis](https://github.com/inisis)
 
 
 ## Model Support and Downloads
@@ -94,12 +95,14 @@ python llm_export.py \
 - Supports exporting the tokenizer as a text file, use `--export_token`
 - Supports converting the exported onnx model to an mnn model, defaulting to asymmetric 4-bit quantization, use `--export_mnn`
 - Specify export paths with `--onnx_path` and `--mnn_path`
+- By default, onnx-slim optimizes the onnx model; skip this step with `--skip_slim`
 
 ## Arguments
 ```
-usage: llm_export.py [-h] --path PATH [--type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-VL-Chat,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,internlm-chat-7b}] [--onnx_path ONNX_PATH]
-                     [--mnn_path MNN_PATH] [--export_mnn] [--export_verbose] [--export_test] [--test TEST] [--export] [--export_split] [--export_token] [--export_embed] [--export_visual] [--export_lm]
-                     [--export_block EXPORT_BLOCK] [--export_blocks] [--embed_bf16]
+usage: llm_export.py [-h] --path PATH
+                     [--type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-VL-Chat,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh}]
+                     [--onnx_path ONNX_PATH] [--mnn_path MNN_PATH] [--export_mnn] [--export_verbose] [--export_test] [--test TEST] [--export] [--export_split] [--export_token] [--export_embed] [--export_visual] [--export_lm]
+                     [--export_block EXPORT_BLOCK] [--export_blocks] [--embed_bf16] [--skip_slim]
 
 llm_exporter
 
@@ -109,7 +112,7 @@ optional arguments:
                         Can be either:
                         - A string, the *model id* of a pretrained model like `THUDM/chatglm-6b`. [TODO]
                         - A path to a *directory* clone from repo like `../chatglm-6b`.
-  --type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-VL-Chat,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,internlm-chat-7b}
+  --type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-VL-Chat,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh}
                         type(`str`, *optional*):
                         The pretrain llm model type.
   --onnx_path ONNX_PATH
@@ -132,4 +135,5 @@ optional arguments:
                         export llm block [id] to an `onnx` model.
   --export_blocks       export llm all blocks to `onnx` models.
   --embed_bf16          using `bfloat16` replace `float32` in embedding.
+  --skip_slim           Whether or not to skip onnx-slim.
 ```
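
With this commit the slim pass becomes opt-out rather than opt-in. For illustration, a hypothetical pair of invocations (the `../chatglm-6b` path and `chatglm-6b` type mirror the README's own examples):

```
# onnx-slim runs automatically after export
python llm_export.py --path ../chatglm-6b --type chatglm-6b --export

# same export, but skipping the slim pass
python llm_export.py --path ../chatglm-6b --type chatglm-6b --export --skip_slim
```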

README_en.md (8 additions, 4 deletions)
@@ -6,6 +6,7 @@ llm-export is a tool for exporting llm models, capable of converting llm models
 - 🚀 All passed `onnxruntime` correctness tests
 - 🚀 Optimized the original code to support dynamic shapes
 - 🚀 Optimized the original code to reduce the constant portion
+- 🚀 Use [OnnxSlim](https://github.com/WeLoveAI/OnnxSlim) to slim the onnx model, for roughly a 5% speedup; by [@inisis](https://github.com/inisis)
 
 
 ## Model Support and Downloads
@@ -47,12 +48,14 @@ python llm_export.py \
 - Supports exporting the tokenizer as a text file, use --export_token
 - Supports converting the exported ONNX model to an MNN model, with default conversion to non-symmetric 4bit quantization, use --export_mnn
 - Specify export paths using --onnx_path and --mnn_path
+- onnx-slim is used by default; skip it with --skip_slim
 
 ## Command Args
 ```
-usage: llm_export.py [-h] --path PATH [--type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-VL-Chat,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,internlm-chat-7b}] [--onnx_path ONNX_PATH]
-                     [--mnn_path MNN_PATH] [--export_mnn] [--export_verbose] [--export_test] [--test TEST] [--export] [--export_split] [--export_token] [--export_embed] [--export_visual] [--export_lm]
-                     [--export_block EXPORT_BLOCK] [--export_blocks] [--embed_bf16]
+usage: llm_export.py [-h] --path PATH
+                     [--type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-VL-Chat,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh}]
+                     [--onnx_path ONNX_PATH] [--mnn_path MNN_PATH] [--export_mnn] [--export_verbose] [--export_test] [--test TEST] [--export] [--export_split] [--export_token] [--export_embed] [--export_visual] [--export_lm]
+                     [--export_block EXPORT_BLOCK] [--export_blocks] [--embed_bf16] [--skip_slim]
 
 llm_exporter
 
@@ -62,7 +65,7 @@ optional arguments:
                         Can be either:
                         - A string, the *model id* of a pretrained model like `THUDM/chatglm-6b`. [TODO]
                         - A path to a *directory* clone from repo like `../chatglm-6b`.
-  --type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-VL-Chat,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,internlm-chat-7b}
+  --type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-VL-Chat,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh}
                         type(`str`, *optional*):
                         The pretrain llm model type.
   --onnx_path ONNX_PATH
@@ -85,4 +88,5 @@ optional arguments:
                         export llm block [id] to an `onnx` model.
   --export_blocks       export llm all blocks to `onnx` models.
   --embed_bf16          using `bfloat16` replace `float32` in embedding.
+  --skip_slim           Whether or not to skip onnx-slim.
 ```

llm_export.py (9 additions, 14 deletions)
@@ -5,6 +5,7 @@
 import argparse
 import torch
 import numpy as np
+from onnxslim import slim
 import onnxruntime as ort
 import _tools as MNNTools
 import sentencepiece as spm
@@ -83,7 +84,7 @@ def __init__(self, args):
         self.export_verbose = args.export_verbose
         self.export_test = args.export_test
         self.embed_bf16 = args.embed_bf16
-        self.slim = args.slim
+        self.skip_slim = args.skip_slim
         tokenizer_model = os.path.join(args.path, 'tokenizer.model')
         if os.path.exists(tokenizer_model):
             self.sp_model = spm.SentencePieceProcessor(tokenizer_model)
@@ -186,8 +187,7 @@ def export_lm(self):
                           output_names=['token_id'],
                           do_constant_folding=True,
                           opset_version=15)
-        if self.slim:
-            from onnxslim import slim
+        if not self.skip_slim:
             slim(onnx_model, output_model=onnx_model)
         # test lm
         if self.export_test:
@@ -217,8 +217,7 @@ def export_visual(self):
                           }},
                           do_constant_folding=True,
                           opset_version=15)
-        if self.slim:
-            from onnxslim import slim
+        if not self.skip_slim:
             slim(onnx_model, output_model=onnx_model)
         # test
         if self.export_test:
@@ -246,8 +245,7 @@ def export_embed(self):
                           }},
                           do_constant_folding=True,
                           opset_version=15)
-        if self.slim:
-            from onnxslim import slim
+        if not self.skip_slim:
             slim(onnx_model, output_model=onnx_model)
         # test
         if self.export_test:
@@ -281,8 +279,7 @@ def export_block(self, block_id: int):
                           dynamic_axes=self.block_dynamic_axes,
                           do_constant_folding=True,
                           opset_version=15)
-        if self.slim:
-            from onnxslim import slim
+        if not self.skip_slim:
             slim(onnx_model, output_model=onnx_model)
         if self.export_test:
             original_outs = model(inputs_embeds, attention_mask, position_ids, past_key_values)
@@ -322,8 +319,7 @@ def export(self):
                           dynamic_axes=self.model_dynamic_axes,
                           do_constant_folding=True,
                           opset_version=15)
-        if self.slim:
-            from onnxslim import slim
+        if not self.skip_slim:
             slim(onnx_model, output_model=onnx_model)
         if self.export_test:
             # test
@@ -961,8 +957,7 @@ def export(self):
                           dynamic_axes=self.model_dynamic_axes,
                           do_constant_folding=True,
                           opset_version=15)
-        if self.slim:
-            from onnxslim import slim
+        if not self.skip_slim:
             slim(onnx_model, output_model=onnx_model)
         if self.export_test:
             self.seq_len = 4
@@ -1042,7 +1037,7 @@ def get_attention_mask(self) -> torch.Tensor:
 parser.add_argument('--export_block', type=int, help='export llm block [id] to an `onnx` model.')
 parser.add_argument('--export_blocks', action='store_true', help='export llm all blocks to `onnx` models.')
 parser.add_argument('--embed_bf16', action='store_true', help='using `bfloat16` replace `float32` in embedding.')
-parser.add_argument('--slim', action='store_true', help='Whether or not to slim the exported onnx model.')
+parser.add_argument('--skip_slim', action='store_true', help='Whether or not to skip onnx-slim.')
 
 
 args = parser.parse_args()
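
Every hunk above applies the same pattern: the lazy `from onnxslim import slim` inside each opt-in `if self.slim:` moves to an unconditional top-level import, and each condition flips to the opt-out `if not self.skip_slim:`. A minimal self-contained sketch of that pattern; `export_onnx` and its parameters are illustrative placeholders rather than names from llm_export.py, while the `slim(onnx_model, output_model=onnx_model)` call mirrors the repository code:

```python
import torch
from onnxslim import slim  # now imported unconditionally, as in this commit


def export_onnx(model, example_inputs, onnx_model, skip_slim=False):
    # Hypothetical helper: trace the module to an onnx file the way
    # llm_export.py does (constant folding on, opset 15).
    torch.onnx.export(model, example_inputs, onnx_model,
                      do_constant_folding=True,
                      opset_version=15)
    # Slimming is opt-out after this commit: it runs by default,
    # and passing --skip_slim disables it.
    if not skip_slim:
        slim(onnx_model, output_model=onnx_model)
```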

requirements.txt (1 addition, 1 deletion)
@@ -1,4 +1,4 @@
-MNN==2.8.0
+MNN==2.8.1
 numpy==1.25.2
 onnxruntime==1.15.1
 torch==2.0.1
