From 68a025b3f7e1ef04366d4a5d4b14b54387b665da Mon Sep 17 00:00:00 2001
From: mzegla <milosz.zeglarski@intel.com>
Date: Thu, 7 May 2026 14:50:36 +0200
Subject: [PATCH 1/3] remove custom optimum reinstall

---
 demos/continuous_batching/speculative_decoding/README.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/demos/continuous_batching/speculative_decoding/README.md b/demos/continuous_batching/speculative_decoding/README.md
index 9b3398677e..30ef9d0392 100644
--- a/demos/continuous_batching/speculative_decoding/README.md
+++ b/demos/continuous_batching/speculative_decoding/README.md
@@ -34,9 +34,6 @@ Python environment setup:
 curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py
 pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt
 
-# Override optimum-intel with version supporting eagle3
-python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@xufang/add_eagle3_draft_model_conversion
-
 mkdir models
 ```
 

From 83509a923ce88e54626a24c7d3ad2aa72fe816f3 Mon Sep 17 00:00:00 2001
From: mzegla <milosz.zeglarski@intel.com>
Date: Mon, 11 May 2026 11:47:11 +0200
Subject: [PATCH 2/3] remove --eagle3 from optimum command and switch model

---
 demos/common/export_models/export_model.py       |  2 +-
 .../speculative_decoding/README.md               | 16 ++++++++++------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py
index 1c0e2e87b3..11c059778d 100644
--- a/demos/common/export_models/export_model.py
+++ b/demos/common/export_models/export_model.py
@@ -463,7 +463,7 @@ def export_text_generation_model(model_repository_path, source_model, model_name
                 additional_options = ""
                 if args["draft_eagle3_mode"]:
                     print("Using eagle3 option for the draft model export")
-                    additional_options += " --eagle3  --task text-generation-with-past"
+                    additional_options += " --task text-generation-with-past"
                 optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {} {}".format(draft_source_model, precision, additional_options, draft_llm_model_path)
                 if os.system(optimum_command):
                     raise ValueError("Failed to export llm model", source_model)
diff --git a/demos/continuous_batching/speculative_decoding/README.md b/demos/continuous_batching/speculative_decoding/README.md
index 30ef9d0392..0e3371cd69 100644
--- a/demos/continuous_batching/speculative_decoding/README.md
+++ b/demos/continuous_batching/speculative_decoding/README.md
@@ -40,7 +40,7 @@ mkdir models
 Run `export_model.py` script to download and quantize the model:
 
 ```console
-python export_model.py text_generation --source_model Qwen/Qwen3-8B --draft_source_model Tengyunw/qwen3_8b_eagle3 --draft_eagle3_mode --weight-format int4 --config_file_path models/config.json --model_repository_path models
+python export_model.py text_generation --source_model Qwen/Qwen3-8B --draft_source_model AngelSlim/Qwen3-8B_eagle3 --draft_eagle3_mode --weight-format int4 --config_file_path models/config.json --model_repository_path models
 ```
 
 Draft model inherits all scheduler properties from the main model.
@@ -52,6 +52,12 @@ models
 └── Qwen
     └── Qwen3-8B
         ├── added_tokens.json
+        ├── AngelSlim-Qwen3-8B_eagle3
+        │   ├── config.json
+        │   ├── generation_config.json
+        │   ├── openvino_config.json
+        │   ├── openvino_model.bin
+        │   └── openvino_model.xml
         ├── chat_template.jinja
         ├── config.json
         ├── generation_config.json
@@ -65,14 +71,10 @@ models
         ├── openvino_tokenizer.bin
         ├── openvino_tokenizer.xml
         ├── special_tokens_map.json
-        ├── Tengyunw-qwen3_8b_eagle3
-        │   ├── config.json
-        │   ├── generation_config.json
-        │   ├── openvino_model.bin
-        │   └── openvino_model.xml
         ├── tokenizer_config.json
         ├── tokenizer.json
         └── vocab.json
+
 ```
 
 ## Server Deployment
@@ -313,6 +315,8 @@ for chunk in stream:
 ```
 
 Output:
+
+```
 if len(numbers) <= 1:
   return numbers
 else:

From 405b6ee277fa8f14d0ede218354d336baa397e5e Mon Sep 17 00:00:00 2001
From: mzegla <milosz.zeglarski@intel.com>
Date: Mon, 11 May 2026 16:07:32 +0200
Subject: [PATCH 3/3] print export commands before running them

---
 demos/common/export_models/export_model.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py
index 11c059778d..d4cb3a9e27 100644
--- a/demos/common/export_models/export_model.py
+++ b/demos/common/export_models/export_model.py
@@ -436,11 +436,13 @@ def export_text_generation_model(model_repository_path, source_model, model_name
                     print("Using default quantization parameters for NPU: --sym --ratio 1.0 --group-size -1")
                     task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1"
             optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], llm_model_path)
+            print('Running command: ', optimum_command)  # for debug purposes
             if os.system(optimum_command):
                 raise ValueError("Failed to export llm model", source_model)
             if not (os.path.isfile(os.path.join(llm_model_path, 'openvino_detokenizer.xml'))):
                 print("Tokenizer and detokenizer not found in the exported model. Exporting tokenizer and detokenizer from HF model")
                 convert_tokenizer_command = f"convert_tokenizer --with-detokenizer --trust-remote-code -o {llm_model_path} {source_model}"
+                print('Running command: ', convert_tokenizer_command)  # for debug purposes
                 if os.system(convert_tokenizer_command):
                     raise ValueError("Failed to export tokenizer and detokenizer", source_model)
     ### Export draft model for speculative decoding 
@@ -465,6 +467,7 @@ def export_text_generation_model(model_repository_path, source_model, model_name
                     print("Using eagle3 option for the draft model export")
                     additional_options += " --task text-generation-with-past"
                 optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {} {}".format(draft_source_model, precision, additional_options, draft_llm_model_path)
+                print('Running command: ', optimum_command)  # for debug purposes
                 if os.system(optimum_command):
                     raise ValueError("Failed to export llm model", source_model)
 
@@ -515,12 +518,12 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name,
     print("Exporting embeddings model to ",destination_path)
     if not os.path.isdir(destination_path) or args['overwrite_models']:
         optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path)
-        print('Running command:', optimum_command)  # for debug purposes
+        print('Running command: ', optimum_command)  # for debug purposes
         if os.system(optimum_command):
             raise ValueError("Failed to export embeddings model", source_model)
         print("Exporting tokenizer to ", destination_path)
         convert_tokenizer_command = "convert_tokenizer -o {} {} {}".format(destination_path, source_model, set_max_context_length) 
-        print('Running command:', convert_tokenizer_command)  # for debug purposes
+        print('Running command: ', convert_tokenizer_command)  # for debug purposes
         if (os.system(convert_tokenizer_command)):
             raise ValueError("Failed to export tokenizer model", source_model)
     gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(embedding_graph_ov_template)
@@ -535,6 +538,7 @@ def export_text2speech_model(model_repository_path, source_model, model_name, pr
     print("Exporting text2speech model to ",destination_path)
     if not os.path.isdir(destination_path) or args['overwrite_models']:
         optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path)
+        print('Running command: ', optimum_command)  # for debug purposes
         if os.system(optimum_command):
             raise ValueError("Failed to export text2speech model", source_model)
     gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(t2s_graph_template)
@@ -549,6 +553,7 @@ def export_speech2text_model(model_repository_path, source_model, model_name, pr
     print("Exporting speech2text model to ",destination_path)
     if not os.path.isdir(destination_path) or args['overwrite_models']:
         optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(source_model, precision, destination_path)
+        print('Running command: ', optimum_command)  # for debug purposes
         if os.system(optimum_command):
             raise ValueError("Failed to export speech2text model", source_model)
     gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(s2t_graph_template)
@@ -563,6 +568,7 @@ def export_rerank_model_ov(model_repository_path, source_model, model_name, prec
     print("Exporting rerank model to ",destination_path)
     if not os.path.isdir(destination_path) or args['overwrite_models']:
         optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task text-classification --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path)
+        print('Running command: ', optimum_command)  # for debug purposes
         if os.system(optimum_command):
             raise ValueError("Failed to export rerank model", source_model)
         print("Exporting tokenizer to ", destination_path)
@@ -589,6 +595,7 @@ def export_rerank_model(model_repository_path, source_model, model_name, precisi
             print("Exporting rerank model to ",embeddings_path)
             if not os.path.isdir(embeddings_path) or args['overwrite_models']:
                 optimum_command = "optimum-cli export openvino --disable-convert-tokenizer --model {} --task text-classification --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], tmpdirname)
+                print('Running command: ', optimum_command)  # for debug purposes
                 if os.system(optimum_command):
                     raise ValueError("Failed to export rerank model", source_model)
                 set_rt_info(tmpdirname, 'openvino_model.xml', 'config.json')
@@ -625,7 +632,7 @@ def export_image_generation_model(model_repository_path, source_model, model_nam
         print("Model index file already exists. Skipping conversion, re-generating graph only.")
     else:
         optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} {}".format(source_model, precision, task_parameters['extra_quantization_params'], target_path)
-        print(f'optimum cli command: {optimum_command}')
+        print('Running command: ', optimum_command)  # for debug purposes
         if os.system(optimum_command):
             raise ValueError("Failed to export image generation model", source_model)