diff --git a/docs/parameters.md b/docs/parameters.md index edfddafa7e..ccee89f772 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -137,6 +137,7 @@ Task specific parameters for different tasks (text generation/image generation/e | `--reasoning_parser` | `string` | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3, gptoss, gemma4] | | `--tool_parser` | `string` | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, phi4, hermes3, mistral, qwen3coder, gptoss, devstral, lfm2, gemma4] | | `--enable_tool_guided_generation` | `bool` | Enables enforcing tool schema during generation. Requires setting response parser. Default: false. | +| `--cache_interval_multiplier` | `integer` | Multiplier for the KV cache block interval. Controls the granularity of cache allocation. Applicable only for models with linear attention. Default: 64. | ### Image generation | option | Value format | Description | diff --git a/src/graph_export/graph_cli_parser.cpp b/src/graph_export/graph_cli_parser.cpp index 2a8e281c94..79b6d53993 100644 --- a/src/graph_export/graph_cli_parser.cpp +++ b/src/graph_export/graph_cli_parser.cpp @@ -82,8 +82,8 @@ void GraphCLIParser::createOptions() { cxxopts::value()->default_value("false"), "ENABLE_TOOL_GUIDED_GENERATION") ("cache_interval_multiplier", - "Multiplier for the KV cache block interval. Controls the granularity of cache allocation. Default: unset.", - cxxopts::value(), + "Multiplier for the KV cache block interval. Controls the granularity of cache allocation. Applicable only for models with linear attention. Default: 64.", + cxxopts::value()->default_value("64"), "CACHE_INTERVAL_MULTIPLIER"); options->add_options("plugin config") diff --git a/src/llm/language_model/continuous_batching/servable_initializer.cpp b/src/llm/language_model/continuous_batching/servable_initializer.cpp index 1aaff99844..66c3813177 100644 --- a/src/llm/language_model/continuous_batching/servable_initializer.cpp +++ b/src/llm/language_model/continuous_batching/servable_initializer.cpp @@ -171,9 +171,7 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptrschedulerConfig.dynamic_split_fuse = nodeOptions.dynamic_split_fuse(); properties->schedulerConfig.max_num_seqs = nodeOptions.max_num_seqs(); properties->schedulerConfig.enable_prefix_caching = nodeOptions.enable_prefix_caching(); - if (nodeOptions.has_cache_interval_multiplier()) { - properties->schedulerConfig.cache_interval_multiplier = nodeOptions.cache_interval_multiplier(); - } + properties->schedulerConfig.cache_interval_multiplier = nodeOptions.cache_interval_multiplier(); if (nodeOptions.has_cache_eviction_config()) { properties->schedulerConfig.cache_eviction_config = prepareCacheEvictionConfig(nodeOptions); diff --git a/src/llm/llm_calculator.proto b/src/llm/llm_calculator.proto index ce252ea899..75fc8c654b 100644 --- a/src/llm/llm_calculator.proto +++ b/src/llm/llm_calculator.proto @@ -136,7 +136,8 @@ message LLMCalculatorOptions { optional SparseAttentionConfig sparse_attention_config = 24; - optional uint64 cache_interval_multiplier = 25; + // Applicable only for models with linear attention. + optional uint64 cache_interval_multiplier = 25 [default = 64]; enum ChatTemplateMode { // Use GenAI's apply_chat_template (minja-based). diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp index e13cf29919..528b65e81a 100644 --- a/src/test/llm/llmnode_test.cpp +++ b/src/test/llm/llmnode_test.cpp @@ -4249,6 +4249,8 @@ void TestLLMNodeOptionsCheckDefault(std::string& modelsPath) { ASSERT_EQ(properties->schedulerConfig.dynamic_split_fuse, true); ASSERT_EQ(properties->schedulerConfig.max_num_seqs, 256); ASSERT_EQ(properties->schedulerConfig.enable_prefix_caching, false); + ASSERT_TRUE(properties->schedulerConfig.cache_interval_multiplier.has_value()); + ASSERT_EQ(properties->schedulerConfig.cache_interval_multiplier.value(), 64); ASSERT_EQ(properties->device, "CPU"); // CPU default properties (inference_num_threads, enable_cpu_pinning) are automatically // added to pluginConfig for CPU device; verify no user-specified entries are present.