From 926575fe06bd80aa3b2fbcb94f058b3e35a258ba Mon Sep 17 00:00:00 2001
From: lilyz-ai
Date: Fri, 3 Apr 2026 23:33:41 +0000
Subject: [PATCH 1/2] feat(vllm_batch): add reasoning_parser support for batch inference

- Add `reasoning_parser` field to VLLMModelConfig
- Pass it to `structured_outputs_config` in init_engine (async path)
- Filter it from AsyncEngineArgs (it's a serving-layer arg, not an engine arg)
- init_vllm subprocess path already handles it via CLI arg conversion

Co-Authored-By: Claude Sonnet 4.6
---
 .../model_engine_server/common/dtos/llms/vllm.py |  5 +++++
 .../inference/vllm/vllm_batch.py                 | 14 ++++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/model-engine/model_engine_server/common/dtos/llms/vllm.py b/model-engine/model_engine_server/common/dtos/llms/vllm.py
index 473af057a..26c7decf3 100644
--- a/model-engine/model_engine_server/common/dtos/llms/vllm.py
+++ b/model-engine/model_engine_server/common/dtos/llms/vllm.py
@@ -73,6 +73,11 @@ class VLLMModelConfig(BaseModel):
         description="Enable auto tool choice",
     )
 
+    reasoning_parser: Optional[str] = Field(
+        None,
+        description="Reasoning parser (e.g. 'nemotron_v3', 'deepseek_r1')",
+    )
+
     load_format: Optional[str] = Field(
         None,
         description="The format of the model weights to load.\n\n"
diff --git a/model-engine/model_engine_server/inference/vllm/vllm_batch.py b/model-engine/model_engine_server/inference/vllm/vllm_batch.py
index d33a4a66e..76f1b0f28 100644
--- a/model-engine/model_engine_server/inference/vllm/vllm_batch.py
+++ b/model-engine/model_engine_server/inference/vllm/vllm_batch.py
@@ -290,7 +290,15 @@ async def init_engine(
         seed=request.model_cfg.seed or 0,
         gpu_memory_utilization=request.max_gpu_memory_utilization or 0.9,
     )
-    engine_args_dict = {**default_engine_args_dict, **parsed_configs.model_dump(exclude_none=True)}
+    _serving_only_keys = {"reasoning_parser"}
+    engine_args_dict = {
+        k: v
+        for k, v in {
+            **default_engine_args_dict,
+            **parsed_configs.model_dump(exclude_none=True),
+        }.items()
+        if k not in _serving_only_keys
+    }
 
     print("vLLM engine args:", engine_args_dict, flush=True)
     engine_args = AsyncEngineArgs(**engine_args_dict)
@@ -319,7 +327,9 @@ async def init_engine(
         return_tokens_as_token_ids=False,
         enable_auto_tool_choice=False,
         tool_call_parser=None,
-        structured_outputs_config=argparse.Namespace(reasoning_parser=None),
+        structured_outputs_config=argparse.Namespace(
+            reasoning_parser=parsed_configs.reasoning_parser
+        ),
         enable_prompt_tokens_details=False,
         enable_force_include_usage=False,
         enable_log_outputs=False,
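A quick usage sketch of the new field from PATCH 1/2. This is illustrative
only: the import path mirrors the file touched above, and it assumes the
rest of VLLMModelConfig's fields are optional, as the ones visible in the
hunk are.

    # Hypothetical usage of the new VLLMModelConfig.reasoning_parser field.
    # Assumes all other fields default to None, as those shown in the hunk do.
    from model_engine_server.common.dtos.llms.vllm import VLLMModelConfig

    cfg = VLLMModelConfig(reasoning_parser="deepseek_r1")

    # init_engine merges exactly this dump into the engine kwargs, which is
    # why PATCH 2/2 below widens the filter before AsyncEngineArgs is built.
    print(cfg.model_dump(exclude_none=True))  # {'reasoning_parser': 'deepseek_r1'}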
From 0924eeb29ef22d4ac276771ce036b77ec36e034b Mon Sep 17 00:00:00 2001
From: lilyz-ai
Date: Fri, 3 Apr 2026 23:37:36 +0000
Subject: [PATCH 2/2] fix(vllm_batch): filter serving-only args from AsyncEngineArgs

tool_call_parser, enable_auto_tool_choice, chat_template, and
reasoning_parser are app-state args, not AsyncEngineArgs params.
Pass them via app_state_args only.

Co-Authored-By: Claude Sonnet 4.6
---
 model-engine/model_engine_server/inference/vllm/vllm_batch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model-engine/model_engine_server/inference/vllm/vllm_batch.py b/model-engine/model_engine_server/inference/vllm/vllm_batch.py
index 76f1b0f28..fa1c4b275 100644
--- a/model-engine/model_engine_server/inference/vllm/vllm_batch.py
+++ b/model-engine/model_engine_server/inference/vllm/vllm_batch.py
@@ -290,7 +290,7 @@ async def init_engine(
         seed=request.model_cfg.seed or 0,
         gpu_memory_utilization=request.max_gpu_memory_utilization or 0.9,
     )
-    _serving_only_keys = {"reasoning_parser"}
+    _serving_only_keys = {"reasoning_parser", "tool_call_parser", "enable_auto_tool_choice", "chat_template"}
     engine_args_dict = {
         k: v
         for k, v in {
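For reference, a minimal self-contained sketch of the filtering pattern both
patches converge on. `build_engine_args` and the placeholder model name are
illustrative, not code from the repo; the vLLM assumption is only that
AsyncEngineArgs is a dataclass and rejects unknown keyword arguments.

    # Sketch of the serving-arg filtering applied in the hunks above.
    # build_engine_args is an illustrative helper, not a repo function.
    from vllm.engine.arg_utils import AsyncEngineArgs

    # Keys consumed by the OpenAI-compatible serving layer; AsyncEngineArgs
    # is a dataclass and raises TypeError if any of these reach it.
    _SERVING_ONLY_KEYS = {
        "reasoning_parser",
        "tool_call_parser",
        "enable_auto_tool_choice",
        "chat_template",
    }

    def build_engine_args(defaults: dict, overrides: dict) -> AsyncEngineArgs:
        # Per-model overrides win over defaults; serving-only keys are
        # dropped so they never reach the engine constructor.
        merged = {**defaults, **overrides}
        return AsyncEngineArgs(
            **{k: v for k, v in merged.items() if k not in _SERVING_ONLY_KEYS}
        )

    engine_args = build_engine_args(
        {"model": "some-org/some-model", "gpu_memory_utilization": 0.9},
        {"reasoning_parser": "deepseek_r1", "max_model_len": 8192},
    )
    # The dropped values are threaded through serving state instead, e.g.
    # structured_outputs_config=argparse.Namespace(
    #     reasoning_parser=overrides.get("reasoning_parser")
    # )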