From 68a025b3f7e1ef04366d4a5d4b14b54387b665da Mon Sep 17 00:00:00 2001 From: mzegla Date: Thu, 7 May 2026 14:50:36 +0200 Subject: [PATCH 1/3] remove custom optimum reinstall --- demos/continuous_batching/speculative_decoding/README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/demos/continuous_batching/speculative_decoding/README.md b/demos/continuous_batching/speculative_decoding/README.md index 9b3398677e..30ef9d0392 100644 --- a/demos/continuous_batching/speculative_decoding/README.md +++ b/demos/continuous_batching/speculative_decoding/README.md @@ -34,9 +34,6 @@ Python environment setup: curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt -# Override optimum-intel with version supporting eagle3 -python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@xufang/add_eagle3_draft_model_conversion - mkdir models ``` From 83509a923ce88e54626a24c7d3ad2aa72fe816f3 Mon Sep 17 00:00:00 2001 From: mzegla Date: Mon, 11 May 2026 11:47:11 +0200 Subject: [PATCH 2/3] remove --eagle3 from optimum command and switch model --- demos/common/export_models/export_model.py | 2 +- .../speculative_decoding/README.md | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 1c0e2e87b3..11c059778d 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -463,7 +463,7 @@ def export_text_generation_model(model_repository_path, source_model, model_name additional_options = "" if args["draft_eagle3_mode"]: print("Using eagle3 option for the draft model export") - additional_options += " --eagle3 --task text-generation-with-past" + additional_options += " --task text-generation-with-past" optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {} {}".format(draft_source_model, precision, additional_options, draft_llm_model_path) if os.system(optimum_command): raise ValueError("Failed to export llm model", source_model) diff --git a/demos/continuous_batching/speculative_decoding/README.md b/demos/continuous_batching/speculative_decoding/README.md index 30ef9d0392..0e3371cd69 100644 --- a/demos/continuous_batching/speculative_decoding/README.md +++ b/demos/continuous_batching/speculative_decoding/README.md @@ -40,7 +40,7 @@ mkdir models Run `export_model.py` script to download and quantize the model: ```console -python export_model.py text_generation --source_model Qwen/Qwen3-8B --draft_source_model Tengyunw/qwen3_8b_eagle3 --draft_eagle3_mode --weight-format int4 --config_file_path models/config.json --model_repository_path models +python export_model.py text_generation --source_model Qwen/Qwen3-8B --draft_source_model AngelSlim/Qwen3-8B_eagle3 --draft_eagle3_mode --weight-format int4 --config_file_path models/config.json --model_repository_path models ``` Draft model inherits all scheduler properties from the main model. @@ -52,6 +52,12 @@ models └── Qwen └── Qwen3-8B ├── added_tokens.json + ├── AngelSlim-Qwen3-8B_eagle3 + │   ├── config.json + │   ├── generation_config.json + │   ├── openvino_config.json + │   ├── openvino_model.bin + │   └── openvino_model.xml ├── chat_template.jinja ├── config.json ├── generation_config.json @@ -65,14 +71,10 @@ models ├── openvino_tokenizer.bin ├── openvino_tokenizer.xml ├── special_tokens_map.json - ├── Tengyunw-qwen3_8b_eagle3 - │   ├── config.json - │   ├── generation_config.json - │   ├── openvino_model.bin - │   └── openvino_model.xml ├── tokenizer_config.json ├── tokenizer.json └── vocab.json + ``` ## Server Deployment @@ -313,6 +315,8 @@ for chunk in stream: ``` Output: + +``` if len(numbers) <= 1: return numbers else: From 405b6ee277fa8f14d0ede218354d336baa397e5e Mon Sep 17 00:00:00 2001 From: mzegla Date: Mon, 11 May 2026 16:07:32 +0200 Subject: [PATCH 3/3] more printing --- demos/common/export_models/export_model.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 11c059778d..d4cb3a9e27 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -436,11 +436,13 @@ def export_text_generation_model(model_repository_path, source_model, model_name print("Using default quantization parameters for NPU: --sym --ratio 1.0 --group-size -1") task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1" optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], llm_model_path) + print('Running command: ', optimum_command) # for debug purposes if os.system(optimum_command): raise ValueError("Failed to export llm model", source_model) if not (os.path.isfile(os.path.join(llm_model_path, 'openvino_detokenizer.xml'))): print("Tokenizer and detokenizer not found in the exported model. Exporting tokenizer and detokenizer from HF model") convert_tokenizer_command = f"convert_tokenizer --with-detokenizer --trust-remote-code -o {llm_model_path} {source_model}" + print('Running command: ', convert_tokenizer_command) # for debug purposes if os.system(convert_tokenizer_command): raise ValueError("Failed to export tokenizer and detokenizer", source_model) ### Export draft model for speculative decoding @@ -465,6 +467,7 @@ def export_text_generation_model(model_repository_path, source_model, model_name print("Using eagle3 option for the draft model export") additional_options += " --task text-generation-with-past" optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {} {}".format(draft_source_model, precision, additional_options, draft_llm_model_path) + print('Running command: ', optimum_command) # for debug purposes if os.system(optimum_command): raise ValueError("Failed to export llm model", source_model) @@ -515,12 +518,12 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name, print("Exporting embeddings model to ",destination_path) if not os.path.isdir(destination_path) or args['overwrite_models']: optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path) - print('Running command:', optimum_command) # for debug purposes + print('Running command: ', optimum_command) # for debug purposes if os.system(optimum_command): raise ValueError("Failed to export embeddings model", source_model) print("Exporting tokenizer to ", destination_path) convert_tokenizer_command = "convert_tokenizer -o {} {} {}".format(destination_path, source_model, set_max_context_length) - print('Running command:', convert_tokenizer_command) # for debug purposes + print('Running command: ', convert_tokenizer_command) # for debug purposes if (os.system(convert_tokenizer_command)): raise ValueError("Failed to export tokenizer model", source_model) gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(embedding_graph_ov_template) @@ -535,6 +538,7 @@ def export_text2speech_model(model_repository_path, source_model, model_name, pr print("Exporting text2speech model to ",destination_path) if not os.path.isdir(destination_path) or args['overwrite_models']: optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path) + print('Running command: ', optimum_command) # for debug purposes if os.system(optimum_command): raise ValueError("Failed to export text2speech model", source_model) gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(t2s_graph_template) @@ -549,6 +553,7 @@ def export_speech2text_model(model_repository_path, source_model, model_name, pr print("Exporting speech2text model to ",destination_path) if not os.path.isdir(destination_path) or args['overwrite_models']: optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(source_model, precision, destination_path) + print('Running command: ', optimum_command) # for debug purposes if os.system(optimum_command): raise ValueError("Failed to export speech2text model", source_model) gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(s2t_graph_template) @@ -563,6 +568,7 @@ def export_rerank_model_ov(model_repository_path, source_model, model_name, prec print("Exporting rerank model to ",destination_path) if not os.path.isdir(destination_path) or args['overwrite_models']: optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task text-classification --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path) + print('Running command: ', optimum_command) # for debug purposes if os.system(optimum_command): raise ValueError("Failed to export rerank model", source_model) print("Exporting tokenizer to ", destination_path) @@ -589,6 +595,7 @@ def export_rerank_model(model_repository_path, source_model, model_name, precisi print("Exporting rerank model to ",embeddings_path) if not os.path.isdir(embeddings_path) or args['overwrite_models']: optimum_command = "optimum-cli export openvino --disable-convert-tokenizer --model {} --task text-classification --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], tmpdirname) + print('Running command: ', optimum_command) # for debug purposes if os.system(optimum_command): raise ValueError("Failed to export rerank model", source_model) set_rt_info(tmpdirname, 'openvino_model.xml', 'config.json') @@ -625,7 +632,7 @@ def export_image_generation_model(model_repository_path, source_model, model_nam print("Model index file already exists. Skipping conversion, re-generating graph only.") else: optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} {}".format(source_model, precision, task_parameters['extra_quantization_params'], target_path) - print(f'optimum cli command: {optimum_command}') + print('Running command: ', optimum_command) # for debug purposes if os.system(optimum_command): raise ValueError("Failed to export image generation model", source_model)