From 6bb8bd4f6b574803e2e4decee36930d0475d3ec4 Mon Sep 17 00:00:00 2001 From: exzile Date: Fri, 26 Jun 2026 15:54:45 -0400 Subject: [PATCH] Support add_generation_prompt request parameter for chat completions The chat template was always rendered with add_generation_prompt=true, hardcoded in every servable. This exposes an optional add_generation_prompt field (bool, default true) on the /v3/chat/completions request, matching HF transformers and vLLM. When false, the trailing generation prompt is omitted, which is the building block for assistant prefill. - Parse add_generation_prompt in the request (openai_api_handler.cpp) and store it on the request struct (openai_request.hpp). - Honor it in all chat-template application sites: the MINJA path (LLM and VLM continuous batching, legacy) and the Python-Jinja path (read from the request body and pass into the template render). - Add tests covering default (generation prompt added) and false (generation prompt omitted, assistant message preserved). Verified end-to-end on the MINJA path with HuggingFaceTB/SmolLM2-360M-Instruct: default renders a trailing "<|im_start|>assistant", add_generation_prompt=false omits it. Note: true assistant prefill (continue_final_message - continuing from the final assistant message without closing it) is a separate control and is left as a follow-up. 
Implements #3877 Co-Authored-By: Claude Opus 4.8 --- docs/model_server_rest_api_chat.md | 1 + src/llm/apis/openai_api_handler.cpp | 11 +++++++ src/llm/apis/openai_request.hpp | 3 ++ src/llm/py_jinja_template_processor.cpp | 8 +++-- src/llm/servable.cpp | 4 +-- .../continuous_batching/servable.cpp | 2 +- .../visual_language_model/legacy/servable.cpp | 2 +- src/test/llm/llmtemplate_test.cpp | 32 +++++++++++++++++++ 8 files changed, 57 insertions(+), 6 deletions(-) diff --git a/docs/model_server_rest_api_chat.md b/docs/model_server_rest_api_chat.md index 951694f8f4..332fb330a2 100644 --- a/docs/model_server_rest_api_chat.md +++ b/docs/model_server_rest_api_chat.md @@ -222,6 +222,7 @@ Some parameters, especially related to sampling (like `temperature`, `top_p` etc | response_format | ✅ | ✅ | ✅ | object | An object specifying the format that the model must output. Setting to `{ "type": "json_schema", "json_schema": {...} }` enables Structured Outputs which ensures the model will match your supplied JSON schema according to [OpenAI reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format). Learn more in the [Structured Outputs demo](../demos/continuous_batching/structured_output/README.md). Additionally, `response_format` can accept [XGrammar structural tags format](https://github.com/mlc-ai/xgrammar/blob/v0.1.26/docs/tutorials/structural_tag.md#format-types) (not part of OpenAI API). For example: `{ "type": "const_string", "value": "Hello World!" }`. **Note** that if model server fails to process the format, the request will still be processed, but the format will not be imposed. | | chat_template_kwargs | ✅ | ❌ | ✅ | object | Enables passing additional parameters to chat template engine. Example `{"enable_thinking": false}`. Note that values like `messages`, `eos_token`, `bos_token` etc. are provided natively to the template engine, so including them in `chat_template_kwargs` will cause error. 
| | skip_special_tokens | ✅ | ❌ | ✅ | bool (default: `true`) | Whether to remove special tokens (e.g. `<\|endoftext\|>`, `<\|im_end\|>`) from the generated output. Set to `false` to include them, which is useful when the model uses special tokens to encode structured information (e.g. bounding boxes, reasoning markers). When `false`, any tool or reasoning parser configured on the endpoint is silently disabled for the request, so the raw token stream is returned. This option works with most detokenizers exported with OpenVINO Tokenizers 2024.5 or later, unless they are based on custom ops. | +| add_generation_prompt | ✅ | ❌ | ✅ | bool (default: `true`) | Whether to append the chat template's generation prompt (the marker that signals the model to start a new assistant turn). Set to `false` to render the conversation without a trailing generation prompt — a building block for assistant prefill. Note that this flag alone does not keep the final `assistant` message open: the chat template may still close it with its end-of-turn marker, and continuing an unfinished final message (`continue_final_message` in HF transformers and vLLM) is a separate control that is not supported yet. Applies to both the Python-Jinja and MINJA chat template paths. | #### Beam search sampling specific | Param | OpenVINO Model Server | OpenAI /chat/completions API | vLLM Serving Sampling Params | Type | Description | diff --git a/src/llm/apis/openai_api_handler.cpp b/src/llm/apis/openai_api_handler.cpp index 0e96e9b335..101271bb7c 100644 --- a/src/llm/apis/openai_api_handler.cpp +++ b/src/llm/apis/openai_api_handler.cpp @@ -664,6 +664,17 @@ absl::Status OpenAIApiHandler::parseCommonPart(std::optional maxTokens request.ignoreEOS = it->value.GetBool(); } + // add_generation_prompt: bool; optional - defaults to true + // Extension, unsupported by OpenAI API, however supported by HF transformers and vLLM. + // When false, the chat template is rendered without a trailing generation prompt + // so a final assistant message can be continued as a prefix (assistant prefill). 
+ it = doc.FindMember("add_generation_prompt"); + if (it != doc.MemberEnd() && !it->value.IsNull()) { + if (!it->value.IsBool()) + return absl::InvalidArgumentError("add_generation_prompt accepts values true or false"); + request.addGenerationPrompt = it->value.GetBool(); + } + // max_tokens: uint; optional // Common part checked here, specific parts are checked in parseCompletionsPart and parseChatCompletionsPart // TODO: Deprecated - this will need to be removed in the future diff --git a/src/llm/apis/openai_request.hpp b/src/llm/apis/openai_request.hpp index 24327be44f..fe3f3764f5 100644 --- a/src/llm/apis/openai_request.hpp +++ b/src/llm/apis/openai_request.hpp @@ -52,6 +52,9 @@ struct OpenAIRequest { int logprobschat{0}; bool echo{false}; std::optional ignoreEOS{std::nullopt}; + // When false, the chat template is rendered without a trailing generation prompt + // (e.g. for assistant prefill). Defaults to true. Extension supported by HF/vLLM. + std::optional addGenerationPrompt{std::nullopt}; std::optional> stop{std::nullopt}; std::optional includeStopStrInOutput{std::nullopt}; std::optional numReturnSequences{std::nullopt}; // effective for beam search and multinomial decoding diff --git a/src/llm/py_jinja_template_processor.cpp b/src/llm/py_jinja_template_processor.cpp index 188a3c0daa..f895aa0c48 100644 --- a/src/llm/py_jinja_template_processor.cpp +++ b/src/llm/py_jinja_template_processor.cpp @@ -58,11 +58,15 @@ bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templ elif not isinstance(chat_template_kwargs, dict): raise Exception("chat_template_kwargs must be an object") + add_generation_prompt = request_json.get("add_generation_prompt", True) + if not isinstance(add_generation_prompt, bool): + raise Exception("add_generation_prompt accepts values true or false") + tools = request_json["tools"] if "tools" in request_json else None if tools is None: - output = chat_template.render(messages=messages, bos_token=bos_token, 
eos_token=eos_token, add_generation_prompt=True, **chat_template_kwargs) + output = chat_template.render(messages=messages, bos_token=bos_token, eos_token=eos_token, add_generation_prompt=add_generation_prompt, **chat_template_kwargs) else: - output = tool_chat_template.render(messages=messages, tools=tools, bos_token=bos_token, eos_token=eos_token, add_generation_prompt=True, **chat_template_kwargs) + output = tool_chat_template.render(messages=messages, tools=tools, bos_token=bos_token, eos_token=eos_token, add_generation_prompt=add_generation_prompt, **chat_template_kwargs) except Exception as e: error = str(e) )", diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 17d74e688a..74c44bb265 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -202,7 +202,7 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrapiHandler->getChatHistory(); - constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded + bool addGenerationPrompt = executionContext->apiHandler->getRequest().addGenerationPrompt.value_or(true); auto toolParsingResult = executionContext->apiHandler->parseToolsToJsonContainer(); if (!toolParsingResult.ok()) { return toolParsingResult.status(); @@ -240,7 +240,7 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrapiHandler->getChatHistory(); - constexpr bool addGenerationPrompt = true; + bool addGenerationPrompt = executionContext->apiHandler->getRequest().addGenerationPrompt.value_or(true); auto toolParsingResult = executionContext->apiHandler->parseToolsToJsonContainer(); if (!toolParsingResult.ok()) { return toolParsingResult.status(); diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 8b65ac7fe0..c1a19e8ccd 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -139,7 +139,7 @@ absl::Status 
VisualLanguageModelServable::prepareInputs(std::shared_ptrapiHandler->getRequest().addGenerationPrompt.value_or(true); auto toolParsingResult = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); if (!toolParsingResult.ok()) { return toolParsingResult.status(); diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index aba0361182..e09379c139 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -392,7 +392,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrapiHandler->getRequest().addGenerationPrompt.value_or(true); auto toolParsingResult = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); if (!toolParsingResult.ok()) { return toolParsingResult.status(); diff --git a/src/test/llm/llmtemplate_test.cpp b/src/test/llm/llmtemplate_test.cpp index e34b4b22a5..27784fbc76 100644 --- a/src/test/llm/llmtemplate_test.cpp +++ b/src/test/llm/llmtemplate_test.cpp @@ -167,6 +167,38 @@ TEST_F(LLMChatTemplateTest, ChatTemplateDefault) { ASSERT_EQ(finalPrompt, expectedOutput); } +// add_generation_prompt request field controls whether the trailing generation +// prompt is rendered (assistant prefill support, issue #3877). 
+TEST_F(LLMChatTemplateTest, ChatTemplateAddGenerationPromptDefaultsTrue) { + std::string jinja = "{% for message in messages %}{{ message['role'] }}: {{ message['content'] }}{% endfor %}{% if add_generation_prompt %}<|GEN|>{% endif %}"; + ASSERT_TRUE(CreateJinjaConfig(jinja)); + LoadTemplateProcessor(); + std::string finalPrompt = ""; + std::string payloadBody = R"( + { + "messages": [{ "role": "user", "content": "hi" }] + } + )"; + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_NE(finalPrompt.find("<|GEN|>"), std::string::npos) << "default should add generation prompt, got: " << finalPrompt; +} + +TEST_F(LLMChatTemplateTest, ChatTemplateAddGenerationPromptFalse) { + std::string jinja = "{% for message in messages %}{{ message['role'] }}: {{ message['content'] }}{% endfor %}{% if add_generation_prompt %}<|GEN|>{% endif %}"; + ASSERT_TRUE(CreateJinjaConfig(jinja)); + LoadTemplateProcessor(); + std::string finalPrompt = ""; + std::string payloadBody = R"( + { + "messages": [{ "role": "user", "content": "hi" }, { "role": "assistant", "content": "partial" }], + "add_generation_prompt": false + } + )"; + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(finalPrompt.find("<|GEN|>"), std::string::npos) << "add_generation_prompt=false should omit generation prompt, got: " << finalPrompt; + ASSERT_NE(finalPrompt.find("partial"), std::string::npos) << "assistant prefill content should be present, got: " << finalPrompt; +} + TEST_F(LLMChatTemplateTest, ChatTemplateMultiMessage) { CopyDefaultChatTemplate(); LoadTemplateProcessor();