Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/model_cache.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ Alternatively the location of the cache storage can be set using the parameter `

The model server security context must have read-write access to the cache storage path.

`--cache_dir` also applies to LLM text-generation servables that use the continuous batching pipeline (GPU). When it is set, the compiled-model (blob) cache is persisted across restarts, so a model that has already been compiled (including one that was unloaded while idle) is reloaded from the cache instead of being recompiled. An explicit `CACHE_DIR` in a node's `plugin_config` takes precedence over the global `--cache_dir`.

When using Model Server with configuration file, it is possible to serve more than one model. In such case, model cache is applied to all the models, with an exception to:
- Models with custom loader (for security reasons explained earlier)
- Models configured to shape `auto` or batch_size `auto`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,21 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
return status;
}

// This initializer constructs the continuous batching pipeline directly instead of
// going through the non-CB path (ModelInstance::setCacheOptions), so the global
// --cache_dir (ServerSettings) would otherwise never reach the pipeline. Copy it
// into the plugin config here, unless the node's plugin_config already carries an
// explicit CACHE_DIR - that user-provided value stays authoritative.
const std::string& globalCacheDir = Config::instance().cacheDir();
if (!globalCacheDir.empty()) {
    const bool nodeDefinesCacheDir = properties->pluginConfig.count(ov::cache_dir.name()) != 0;
    if (nodeDefinesCacheDir) {
        SPDLOG_DEBUG("CACHE_DIR set explicitly in node plugin_config; keeping user value over global cache_dir");
    } else {
        properties->pluginConfig[ov::cache_dir.name()] = globalCacheDir;
        SPDLOG_DEBUG("Applying global cache_dir to continuous batching pipeline: {}", globalCacheDir);
    }
}

if (properties->device == "CPU") {
status = applyDefaultCpuProperties(properties->pluginConfig);
if (!status.ok()) {
Expand Down
120 changes: 120 additions & 0 deletions src/test/llm/llmnode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#endif

#include "../../http_rest_api_handler.hpp"
#include "../../config.hpp"
#include "../../http_status_code.hpp"
#include "../../json_parser.hpp"
#include "../../llm/apis/openai_completions.hpp"
Expand Down Expand Up @@ -4425,6 +4426,125 @@ TEST_F(LLMVLMOptionsHttpTest, LLMVLMNodeOptionsCheckPluginConfig) {
LLMNodeOptionsCheckPluginConfig(modelsPath);
}

// Verifies that the global --cache_dir (ServerSettings) is propagated into the
// continuous batching pipeline plugin config, and that an explicit CACHE_DIR in
// the node's plugin_config takes precedence over the global value.
// Regression test for openvinotoolkit/model_server#4230.
//
// modelsPath is spliced into the graph config as the node's models_path.
void LLMNodeOptionsCacheDirPropagation(std::string& modelsPath) {
    // ASSERT_* macros return from this function on failure, so restoring the
    // Config singleton must not depend on reaching the end of the body. The guard
    // re-parses a command line without --cache_dir on every exit path so the
    // global value does not leak into other tests.
    struct GlobalConfigRestorer {
        ~GlobalConfigRestorer() {
            char* reset_argv[] = {(char*)"ovms", (char*)"--model_path", (char*)"/path/to/model", (char*)"--model_name", (char*)"some_name", (char*)"--rest_port", (char*)"8080"};
            ovms::Config::instance().parse(static_cast<int>(sizeof(reset_argv) / sizeof(reset_argv[0])), reset_argv);
        }
    } globalConfigRestorer;

    // Seed the global cache_dir via the CLI parser (same path used in production).
    char* n_argv[] = {(char*)"ovms", (char*)"--model_path", (char*)"/path/to/model", (char*)"--model_name", (char*)"some_name", (char*)"--rest_port", (char*)"8080", (char*)"--cache_dir", (char*)"/tmp/ovms_global_cache"};
    // Derive argc from the array so it cannot drift when arguments are edited.
    const int arg_count = static_cast<int>(sizeof(n_argv) / sizeof(n_argv[0]));
    ovms::Config::instance().parse(arg_count, n_argv);
    ASSERT_EQ(ovms::Config::instance().cacheDir(), "/tmp/ovms_global_cache");

    // Builds the single-node LLM graph config shared by both cases.
    // extraNodeOptions is inserted verbatim right after models_path inside
    // LLMCalculatorOptions; pass an empty string for no extra options.
    const auto buildGraphPbtxt = [&modelsPath](const std::string& extraNodeOptions) -> std::string {
        return R"(
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"

node: {
name: "llmNode"
calculator: "HttpLLMCalculator"
input_stream: "LOOPBACK:loopback"
input_stream: "HTTP_REQUEST_PAYLOAD:input"
input_side_packet: "LLM_NODE_RESOURCES:llm"
output_stream: "LOOPBACK:loopback"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
input_stream_info: {
tag_index: 'LOOPBACK:0',
back_edge: true
}
node_options: {
[type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
models_path: ")" +
               modelsPath + R"("
)" + extraNodeOptions +
               R"(}
}
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler",
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "LOOPBACK:0"
}
}
}
}
}
)";
    };

    // Case 1: no CACHE_DIR in node plugin_config -> global value is applied.
    {
        std::string testPbtxt = buildGraphPbtxt("");
        adjustConfigForTargetPlatform(testPbtxt);
        ::mediapipe::CalculatorGraphConfig config;
        ASSERT_TRUE(::google::protobuf::TextFormat::ParseFromString(testPbtxt, &config));
        std::shared_ptr<GenAiServable> servable;
        ASSERT_EQ(initializeGenAiServable(servable, config.node(0), ""), StatusCode::OK);
        auto properties = std::static_pointer_cast<ContinuousBatchingServableProperties>(servable->getProperties());
        // count() returns an unsigned size; compare against an unsigned literal.
        ASSERT_EQ(properties->pluginConfig.count("CACHE_DIR"), 1u);
        ASSERT_EQ(properties->pluginConfig["CACHE_DIR"].as<std::string>(), "/tmp/ovms_global_cache");
    }

    // Case 2: explicit CACHE_DIR in node plugin_config wins over the global value.
    {
        std::string testPbtxt = buildGraphPbtxt(R"(plugin_config: '{"CACHE_DIR": "/tmp/ovms_node_cache"}'
)");
        adjustConfigForTargetPlatform(testPbtxt);
        ::mediapipe::CalculatorGraphConfig config;
        ASSERT_TRUE(::google::protobuf::TextFormat::ParseFromString(testPbtxt, &config));
        std::shared_ptr<GenAiServable> servable;
        ASSERT_EQ(initializeGenAiServable(servable, config.node(0), ""), StatusCode::OK);
        auto properties = std::static_pointer_cast<ContinuousBatchingServableProperties>(servable->getProperties());
        ASSERT_EQ(properties->pluginConfig.count("CACHE_DIR"), 1u);
        // The test harness may rewrite the path for the target platform, so match
        // on substrings: the explicit node value must win over the global one.
        std::string nodeCacheDir = properties->pluginConfig["CACHE_DIR"].as<std::string>();
        ASSERT_NE(nodeCacheDir.find("ovms_node_cache"), std::string::npos) << "Explicit node CACHE_DIR should be used, got: " << nodeCacheDir;
        ASSERT_EQ(nodeCacheDir.find("ovms_global_cache"), std::string::npos) << "Global cache_dir must not override explicit node CACHE_DIR, got: " << nodeCacheDir;
    }
}
// Runs the global cache_dir propagation check using the modelsPath provided by
// the LLMOptionsHttpTest fixture.
TEST_F(LLMOptionsHttpTest, LLMNodeOptionsCacheDirPropagation) {
LLMNodeOptionsCacheDirPropagation(modelsPath);
}
// Runs the same global cache_dir propagation check using the modelsPath provided
// by the LLMVLMOptionsHttpTest fixture.
TEST_F(LLMVLMOptionsHttpTest, LLMVLMNodeOptionsCacheDirPropagation) {
LLMNodeOptionsCacheDirPropagation(modelsPath);
}

void LLMNodeOptionsCheckNonDefault(std::string& modelsPath) {
std::string testPbtxt = R"(
input_stream: "HTTP_REQUEST_PAYLOAD:input"
Expand Down