Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/model_server_rest_api_chat.md
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ Some parameters, especially related to sampling (like `temperature`, `top_p` etc
| response_format | ✅ | ✅ | ✅ | object | An object specifying the format that the model must output. Setting it to `{ "type": "json_schema", "json_schema": {...} }` enables Structured Outputs, which ensures the model output matches your supplied JSON schema according to the [OpenAI reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format). Learn more in the [Structured Outputs demo](../demos/continuous_batching/structured_output/README.md). Additionally, `response_format` can accept the [XGrammar structural tags format](https://github.com/mlc-ai/xgrammar/blob/v0.1.26/docs/tutorials/structural_tag.md#format-types) (not part of the OpenAI API). For example: `{ "type": "const_string", "value": "Hello World!" }`. **Note** that if the model server fails to process the format, the request will still be processed, but the format will not be imposed. |
| chat_template_kwargs | ✅ | ❌ | ✅ | object | Enables passing additional parameters to the chat template engine. Example: `{"enable_thinking": false}`. Note that values like `messages`, `eos_token`, `bos_token` etc. are provided natively to the template engine, so including them in `chat_template_kwargs` will cause an error. |
| skip_special_tokens | ✅ | ❌ | ✅ | bool (default: `true`) | Whether to remove special tokens (e.g. `<\|endoftext\|>`, `<\|im_end\|>`) from the generated output. Set to `false` to include them, which is useful when the model uses special tokens to encode structured information (e.g. bounding boxes, reasoning markers). When `false`, any tool or reasoning parser configured on the endpoint is silently disabled for the request, so the raw token stream is returned. This option works with most detokenizers exported with OpenVINO Tokenizers 2024.5 or later, unless they are based on custom ops. |
| add_generation_prompt | ✅ | ❌ | ✅ | bool (default: `true`) | Whether to append the chat template's generation prompt (the marker that signals the model to start a new assistant turn). Set to `false` to render the conversation without a trailing generation prompt — useful for assistant prefill where the final `assistant` message should be continued rather than treated as a completed turn. Applies to both the Python-Jinja and MINJA chat template paths. |

#### Beam search sampling specific
| Param | OpenVINO Model Server | OpenAI /chat/completions API | vLLM Serving Sampling Params | Type | Description |
Expand Down
11 changes: 11 additions & 0 deletions src/llm/apis/openai_api_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,17 @@ absl::Status OpenAIApiHandler::parseCommonPart(std::optional<uint32_t> maxTokens
request.ignoreEOS = it->value.GetBool();
}

// add_generation_prompt: bool; optional - defaults to true
// Extension: not part of the OpenAI API, but supported by HF transformers and vLLM.
// When false, the chat template is rendered without a trailing generation prompt
// so a final assistant message can be continued as a prefix (assistant prefill).
it = doc.FindMember("add_generation_prompt");
if (it != doc.MemberEnd() && !it->value.IsNull()) {
if (!it->value.IsBool())
return absl::InvalidArgumentError("add_generation_prompt accepts values true or false");
request.addGenerationPrompt = it->value.GetBool();
}

// max_tokens: uint; optional
// Common part checked here, specific parts are checked in parseCompletionsPart and parseChatCompletionsPart
// TODO: Deprecated - this will need to be removed in the future
Expand Down
3 changes: 3 additions & 0 deletions src/llm/apis/openai_request.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ struct OpenAIRequest {
int logprobschat{0};
bool echo{false};
std::optional<bool> ignoreEOS{std::nullopt};
// When false, the chat template is rendered without a trailing generation prompt
// (e.g. for assistant prefill). Defaults to true. Extension supported by HF/vLLM.
std::optional<bool> addGenerationPrompt{std::nullopt};
std::optional<std::set<std::string>> stop{std::nullopt};
std::optional<bool> includeStopStrInOutput{std::nullopt};
std::optional<int> numReturnSequences{std::nullopt}; // effective for beam search and multinomial decoding
Expand Down
8 changes: 6 additions & 2 deletions src/llm/py_jinja_template_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,15 @@ bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templ
elif not isinstance(chat_template_kwargs, dict):
raise Exception("chat_template_kwargs must be an object")

add_generation_prompt = request_json.get("add_generation_prompt", True)
if not isinstance(add_generation_prompt, bool):
raise Exception("add_generation_prompt accepts values true or false")

tools = request_json["tools"] if "tools" in request_json else None
if tools is None:
output = chat_template.render(messages=messages, bos_token=bos_token, eos_token=eos_token, add_generation_prompt=True, **chat_template_kwargs)
output = chat_template.render(messages=messages, bos_token=bos_token, eos_token=eos_token, add_generation_prompt=add_generation_prompt, **chat_template_kwargs)
else:
output = tool_chat_template.render(messages=messages, tools=tools, bos_token=bos_token, eos_token=eos_token, add_generation_prompt=True, **chat_template_kwargs)
output = tool_chat_template.render(messages=messages, tools=tools, bos_token=bos_token, eos_token=eos_token, add_generation_prompt=add_generation_prompt, **chat_template_kwargs)
except Exception as e:
error = str(e)
)",
Expand Down
4 changes: 2 additions & 2 deletions src/llm/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
#endif
{
ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded
bool addGenerationPrompt = executionContext->apiHandler->getRequest().addGenerationPrompt.value_or(true);
auto toolParsingResult = executionContext->apiHandler->parseToolsToJsonContainer();
if (!toolParsingResult.ok()) {
return toolParsingResult.status();
Expand Down Expand Up @@ -240,7 +240,7 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
#endif
{
ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
constexpr bool addGenerationPrompt = true;
bool addGenerationPrompt = executionContext->apiHandler->getRequest().addGenerationPrompt.value_or(true);
auto toolParsingResult = executionContext->apiHandler->parseToolsToJsonContainer();
if (!toolParsingResult.ok()) {
return toolParsingResult.status();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer
} else // NOLINT(readability/braces)
#endif
{
constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded
bool addGenerationPrompt = vlmExecutionContext->apiHandler->getRequest().addGenerationPrompt.value_or(true);
auto toolParsingResult = vlmExecutionContext->apiHandler->parseToolsToJsonContainer();
if (!toolParsingResult.ok()) {
return toolParsingResult.status();
Expand Down
2 changes: 1 addition & 1 deletion src/llm/visual_language_model/legacy/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge
} else // NOLINT(readability/braces)
#endif
{
constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded
bool addGenerationPrompt = vlmExecutionContext->apiHandler->getRequest().addGenerationPrompt.value_or(true);
auto toolParsingResult = vlmExecutionContext->apiHandler->parseToolsToJsonContainer();
if (!toolParsingResult.ok()) {
return toolParsingResult.status();
Expand Down
32 changes: 32 additions & 0 deletions src/test/llm/llmtemplate_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,38 @@ TEST_F(LLMChatTemplateTest, ChatTemplateDefault) {
ASSERT_EQ(finalPrompt, expectedOutput);
}

// add_generation_prompt request field controls whether the trailing generation
// prompt is rendered (assistant prefill support, issue #3877).
TEST_F(LLMChatTemplateTest, ChatTemplateAddGenerationPromptDefaultsTrue) {
    // Minimal template: prints every message as "role: content" and emits the
    // <|GEN|> marker only when add_generation_prompt evaluates to true.
    std::string templateSource = "{% for message in messages %}{{ message['role'] }}: {{ message['content'] }}{% endfor %}{% if add_generation_prompt %}<|GEN|>{% endif %}";
    ASSERT_TRUE(CreateJinjaConfig(templateSource));
    LoadTemplateProcessor();

    // The request deliberately carries no add_generation_prompt field.
    std::string requestBody = R"(
{
"messages": [{ "role": "user", "content": "hi" }]
}
)";
    std::string renderedPrompt;
    auto&& properties = servable->getProperties();
    const bool applied = PyJinjaTemplateProcessor::applyChatTemplate(properties->templateProcessor, properties->modelsPath, requestBody, renderedPrompt);
    ASSERT_EQ(applied, true);

    // With the field absent, the marker is expected in the rendered prompt.
    const bool markerPresent = renderedPrompt.find("<|GEN|>") != std::string::npos;
    ASSERT_TRUE(markerPresent) << "default should add generation prompt, got: " << renderedPrompt;
}

TEST_F(LLMChatTemplateTest, ChatTemplateAddGenerationPromptFalse) {
    // Minimal template: prints every message as "role: content" and emits the
    // <|GEN|> marker only when add_generation_prompt evaluates to true.
    std::string templateSource = "{% for message in messages %}{{ message['role'] }}: {{ message['content'] }}{% endfor %}{% if add_generation_prompt %}<|GEN|>{% endif %}";
    ASSERT_TRUE(CreateJinjaConfig(templateSource));
    LoadTemplateProcessor();

    // The request ends with an assistant message and explicitly sets
    // add_generation_prompt to false.
    std::string requestBody = R"(
{
"messages": [{ "role": "user", "content": "hi" }, { "role": "assistant", "content": "partial" }],
"add_generation_prompt": false
}
)";
    std::string renderedPrompt;
    auto&& properties = servable->getProperties();
    const bool applied = PyJinjaTemplateProcessor::applyChatTemplate(properties->templateProcessor, properties->modelsPath, requestBody, renderedPrompt);
    ASSERT_EQ(applied, true);

    // The marker must be absent, while the assistant content must be present.
    const bool markerPresent = renderedPrompt.find("<|GEN|>") != std::string::npos;
    ASSERT_FALSE(markerPresent) << "add_generation_prompt=false should omit generation prompt, got: " << renderedPrompt;
    const bool prefillPresent = renderedPrompt.find("partial") != std::string::npos;
    ASSERT_TRUE(prefillPresent) << "assistant prefill content should be present, got: " << renderedPrompt;
}

TEST_F(LLMChatTemplateTest, ChatTemplateMultiMessage) {
CopyDefaultChatTemplate();
LoadTemplateProcessor();
Expand Down