From 96fb9e454541d88680fbddedbdd834ba93482515 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 22 Jun 2026 23:54:23 +0200 Subject: [PATCH 1/2] change default cache_interval_multiplier [test_doc_files_windows=demos/continuous_batching/agentic_ai/README.md] --- docs/parameters.md | 1 + src/graph_export/graph_cli_parser.cpp | 4 ++-- src/llm/llm_calculator.proto | 3 ++- src/test/llm/llmnode_test.cpp | 2 ++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/parameters.md b/docs/parameters.md index edfddafa7e..ccee89f772 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -137,6 +137,7 @@ Task specific parameters for different tasks (text generation/image generation/e | `--reasoning_parser` | `string` | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3, gptoss, gemma4] | | `--tool_parser` | `string` | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, phi4, hermes3, mistral, qwen3coder, gptoss, devstral, lfm2, gemma4] | | `--enable_tool_guided_generation` | `bool` | Enables enforcing tool schema during generation. Requires setting response parser. Default: false. | +| `--cache_interval_multiplier` | `integer` | Multiplier for the KV cache block interval. Controls the granularity of cache allocation. Applicable only for models with linear attention. Default: 64. | ### Image generation | option | Value format | Description | diff --git a/src/graph_export/graph_cli_parser.cpp b/src/graph_export/graph_cli_parser.cpp index b8909bd3fe..63f46cd7e1 100644 --- a/src/graph_export/graph_cli_parser.cpp +++ b/src/graph_export/graph_cli_parser.cpp @@ -81,8 +81,8 @@ void GraphCLIParser::createOptions() { cxxopts::value()->default_value("false"), "ENABLE_TOOL_GUIDED_GENERATION") ("cache_interval_multiplier", - "Multiplier for the KV cache block interval. Controls the granularity of cache allocation. 
Default: unset.", - cxxopts::value(), + "Multiplier for the KV cache block interval. Controls the granularity of cache allocation. Applicable only for models with linear attention. Default: 64.", + cxxopts::value()->default_value("64"), "CACHE_INTERVAL_MULTIPLIER"); options->add_options("plugin config") diff --git a/src/llm/llm_calculator.proto b/src/llm/llm_calculator.proto index e727fcbb4b..23e36a31e0 100644 --- a/src/llm/llm_calculator.proto +++ b/src/llm/llm_calculator.proto @@ -136,5 +136,6 @@ message LLMCalculatorOptions { optional SparseAttentionConfig sparse_attention_config = 24; - optional uint64 cache_interval_multiplier = 25; + // Applicable only for models with linear attention. + optional uint64 cache_interval_multiplier = 25 [default = 64]; } diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp index e13cf29919..528b65e81a 100644 --- a/src/test/llm/llmnode_test.cpp +++ b/src/test/llm/llmnode_test.cpp @@ -4249,6 +4249,8 @@ void TestLLMNodeOptionsCheckDefault(std::string& modelsPath) { ASSERT_EQ(properties->schedulerConfig.dynamic_split_fuse, true); ASSERT_EQ(properties->schedulerConfig.max_num_seqs, 256); ASSERT_EQ(properties->schedulerConfig.enable_prefix_caching, false); + ASSERT_TRUE(properties->schedulerConfig.cache_interval_multiplier.has_value()); + ASSERT_EQ(properties->schedulerConfig.cache_interval_multiplier.value(), 64); ASSERT_EQ(properties->device, "CPU"); // CPU default properties (inference_num_threads, enable_cpu_pinning) are automatically // added to pluginConfig for CPU device; verify no user-specified entries are present. 
From 91ad990803490f2644ba7b2895a2c7a944611eb6 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 23 Jun 2026 01:02:00 +0200 Subject: [PATCH 2/2] fix tests --- .../continuous_batching/servable_initializer.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llm/language_model/continuous_batching/servable_initializer.cpp b/src/llm/language_model/continuous_batching/servable_initializer.cpp index 418dce9656..0c93020ea7 100644 --- a/src/llm/language_model/continuous_batching/servable_initializer.cpp +++ b/src/llm/language_model/continuous_batching/servable_initializer.cpp @@ -155,9 +155,7 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptrschedulerConfig.dynamic_split_fuse = nodeOptions.dynamic_split_fuse(); properties->schedulerConfig.max_num_seqs = nodeOptions.max_num_seqs(); properties->schedulerConfig.enable_prefix_caching = nodeOptions.enable_prefix_caching(); - if (nodeOptions.has_cache_interval_multiplier()) { - properties->schedulerConfig.cache_interval_multiplier = nodeOptions.cache_interval_multiplier(); - } + properties->schedulerConfig.cache_interval_multiplier = nodeOptions.cache_interval_multiplier(); if (nodeOptions.has_cache_eviction_config()) { properties->schedulerConfig.cache_eviction_config = prepareCacheEvictionConfig(nodeOptions);