15 changes: 11 additions & 4 deletions demos/common/export_models/export_model.py
@@ -436,11 +436,13 @@ def export_text_generation_model(model_repository_path, source_model, model_name
print("Using default quantization parameters for NPU: --sym --ratio 1.0 --group-size -1")
task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1"
optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], llm_model_path)
print('Running command: ', optimum_command) # for debug purposes
if os.system(optimum_command):
raise ValueError("Failed to export llm model", source_model)
if not (os.path.isfile(os.path.join(llm_model_path, 'openvino_detokenizer.xml'))):
print("Tokenizer and detokenizer not found in the exported model. Exporting tokenizer and detokenizer from HF model")
convert_tokenizer_command = f"convert_tokenizer --with-detokenizer --trust-remote-code -o {llm_model_path} {source_model}"
print('Running command: ', convert_tokenizer_command) # for debug purposes
if os.system(convert_tokenizer_command):
raise ValueError("Failed to export tokenizer and detokenizer", source_model)
### Export draft model for speculative decoding
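For reference, the new debug lines print the assembled commands before execution. With the example values from the demo README below (`Qwen/Qwen3-8B`, `int4` weights, no extra quantization parameters, a `models/` repository path; a hypothetical instantiation, since the exact strings depend on the script's arguments), the two commands would look roughly like:

```console
optimum-cli export openvino --model Qwen/Qwen3-8B --weight-format int4 --trust-remote-code models/Qwen/Qwen3-8B
convert_tokenizer --with-detokenizer --trust-remote-code -o models/Qwen/Qwen3-8B Qwen/Qwen3-8B
```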
@@ -463,8 +465,9 @@ def export_text_generation_model(model_repository_path, source_model, model_name
    additional_options = ""
    if args["draft_eagle3_mode"]:
        print("Using eagle3 option for the draft model export")
-        additional_options += " --eagle3 --task text-generation-with-past"
+        additional_options += " --task text-generation-with-past"
    optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {} {}".format(draft_source_model, precision, additional_options, draft_llm_model_path)
+    print('Running command: ', optimum_command) # for debug purposes
    if os.system(optimum_command):
        raise ValueError("Failed to export llm model", source_model)
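With the demo's draft model and `--draft_eagle3_mode` set, the draft export printed by the new debug line would be along these lines (hypothetical values taken from the README below; the destination directory is derived from the main model's path):

```console
optimum-cli export openvino --model AngelSlim/Qwen3-8B_eagle3 --weight-format int4 --trust-remote-code --task text-generation-with-past models/Qwen/Qwen3-8B/AngelSlim-Qwen3-8B_eagle3
```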

@@ -515,12 +518,12 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name,
print("Exporting embeddings model to ",destination_path)
if not os.path.isdir(destination_path) or args['overwrite_models']:
optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path)
print('Running command:', optimum_command) # for debug purposes
print('Running command: ', optimum_command) # for debug purposes
if os.system(optimum_command):
raise ValueError("Failed to export embeddings model", source_model)
print("Exporting tokenizer to ", destination_path)
convert_tokenizer_command = "convert_tokenizer -o {} {} {}".format(destination_path, source_model, set_max_context_length)
print('Running command:', convert_tokenizer_command) # for debug purposes
print('Running command: ', convert_tokenizer_command) # for debug purposes
if (os.system(convert_tokenizer_command)):
raise ValueError("Failed to export tokenizer model", source_model)
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(embedding_graph_ov_template)
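The `print` / `os.system` / `raise ValueError` triple now recurs in every export function. A minimal sketch of a shared helper (hypothetical, not part of this change; `subprocess` stands in for `os.system` purely to illustrate the idea):

```python
import subprocess


def run_command(command: str, what: str, source_model: str) -> None:
    # Same pattern as the repeated blocks above: log the command for
    # debugging, run it through the shell, and fail loudly on a
    # non-zero exit code.
    print('Running command: ', command)  # for debug purposes
    if subprocess.run(command, shell=True).returncode != 0:
        raise ValueError(f"Failed to export {what} model", source_model)
```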
@@ -535,6 +538,7 @@ def export_text2speech_model(model_repository_path, source_model, model_name, pr
print("Exporting text2speech model to ",destination_path)
if not os.path.isdir(destination_path) or args['overwrite_models']:
optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path)
print('Running command: ', optimum_command) # for debug purposes
if os.system(optimum_command):
raise ValueError("Failed to export text2speech model", source_model)
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(t2s_graph_template)
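The escaping in `--model-kwargs` is easier to verify once rendered: the Python literal `\"{{\\\"vocoder\\\": \\\"{}\\\"}}\"` formats into a shell-quoted JSON object. With hypothetical model and vocoder names (not taken from this diff), the printed command would look like:

```console
optimum-cli export openvino --model microsoft/speecht5_tts --weight-format fp16 --trust-remote-code --model-kwargs "{\"vocoder\": \"microsoft/speecht5_hifigan\"}" models/microsoft/speecht5_tts
```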
@@ -549,6 +553,7 @@ def export_speech2text_model(model_repository_path, source_model, model_name, pr
print("Exporting speech2text model to ",destination_path)
if not os.path.isdir(destination_path) or args['overwrite_models']:
optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(source_model, precision, destination_path)
print('Running command: ', optimum_command) # for debug purposes
if os.system(optimum_command):
raise ValueError("Failed to export speech2text model", source_model)
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(s2t_graph_template)
@@ -563,6 +568,7 @@ def export_rerank_model_ov(model_repository_path, source_model, model_name, prec
print("Exporting rerank model to ",destination_path)
if not os.path.isdir(destination_path) or args['overwrite_models']:
optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task text-classification --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path)
print('Running command: ', optimum_command) # for debug purposes
if os.system(optimum_command):
raise ValueError("Failed to export rerank model", source_model)
print("Exporting tokenizer to ", destination_path)
@@ -589,6 +595,7 @@ def export_rerank_model(model_repository_path, source_model, model_name, precisi
print("Exporting rerank model to ",embeddings_path)
if not os.path.isdir(embeddings_path) or args['overwrite_models']:
optimum_command = "optimum-cli export openvino --disable-convert-tokenizer --model {} --task text-classification --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], tmpdirname)
print('Running command: ', optimum_command) # for debug purposes
if os.system(optimum_command):
raise ValueError("Failed to export rerank model", source_model)
set_rt_info(tmpdirname, 'openvino_model.xml', 'config.json')
@@ -625,7 +632,7 @@ def export_image_generation_model(model_repository_path, source_model, model_nam
print("Model index file already exists. Skipping conversion, re-generating graph only.")
else:
optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} {}".format(source_model, precision, task_parameters['extra_quantization_params'], target_path)
print(f'optimum cli command: {optimum_command}')
print('Running command: ', optimum_command) # for debug purposes
if os.system(optimum_command):
raise ValueError("Failed to export image generation model", source_model)

19 changes: 10 additions & 9 deletions demos/continuous_batching/speculative_decoding/README.md
@@ -34,16 +34,13 @@ Python environment setup:
curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py
pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt

-# Override optimum-intel with version supporting eagle3
-python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@xufang/add_eagle3_draft_model_conversion

mkdir models
```

Run the `export_model.py` script to download and quantize the model:

```console
-python export_model.py text_generation --source_model Qwen/Qwen3-8B --draft_source_model Tengyunw/qwen3_8b_eagle3 --draft_eagle3_mode --weight-format int4 --config_file_path models/config.json --model_repository_path models
+python export_model.py text_generation --source_model Qwen/Qwen3-8B --draft_source_model AngelSlim/Qwen3-8B_eagle3 --draft_eagle3_mode --weight-format int4 --config_file_path models/config.json --model_repository_path models
```

The draft model inherits all scheduler properties from the main model.
@@ -55,6 +52,12 @@ models
└── Qwen
    └── Qwen3-8B
        ├── added_tokens.json
+        ├── AngelSlim-Qwen3-8B_eagle3
+        │   ├── config.json
+        │   ├── generation_config.json
+        │   ├── openvino_config.json
+        │   ├── openvino_model.bin
+        │   └── openvino_model.xml
        ├── chat_template.jinja
        ├── config.json
        ├── generation_config.json
@@ -68,14 +71,10 @@ models
        ├── openvino_tokenizer.bin
        ├── openvino_tokenizer.xml
        ├── special_tokens_map.json
-        ├── Tengyunw-qwen3_8b_eagle3
-        │   ├── config.json
-        │   ├── generation_config.json
-        │   ├── openvino_model.bin
-        │   └── openvino_model.xml
        ├── tokenizer_config.json
        ├── tokenizer.json
        └── vocab.json

```

## Server Deployment
@@ -316,6 +315,8 @@ for chunk in stream:
```

Output:

```
if len(numbers) <= 1:
    return numbers
else:
```
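For context, the client snippet truncated above follows the standard OpenAI-compatible streaming pattern. A minimal self-contained sketch, assuming the OpenAI Python client and the demo's server on `localhost` (the port, endpoint path, and prompt are assumptions, not taken from this diff):

```python
from openai import OpenAI

# Hypothetical endpoint and prompt; match them to your deployment.
client = OpenAI(base_url="http://localhost:8000/v3", api_key="unused")

stream = client.chat.completions.create(
    model="Qwen/Qwen3-8B",
    messages=[{"role": "user", "content": "Write a sorting function in Python."}],
    stream=True,
)
for chunk in stream:
    # Each chunk carries an incremental piece of the generated text.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```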