Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/llm/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,16 @@ The calculator supports the following `node_options` for tuning the pipeline con
- `optional string tool_parser` - name of the parser to use for tool calls extraction from model output before creating a response;
- `optional bool enable_tool_guided_generation` - enable enforcing tool schema during generation. Requires setting response parser. [default = false];
- `optional SparseAttentionConfig sparse_attention_config` - Sparse attention configuration. Disabled if not specified.
- `optional int64 idle_unload_timeout_seconds` - unload the graph's model resources after this many seconds with no inference requests, freeing GPU/CPU memory; the model is reloaded automatically on the next request. `0` disables the feature [default = 0]. See [Idle model unload](#idle-model-unload).

### Idle model unload
When `idle_unload_timeout_seconds` is set to a positive value, the model server unloads the LLM graph's heavy resources (the continuous batching pipeline) after the configured period without any inference requests, freeing GPU VRAM / host memory so that the GPU can be used by other workloads while the model is idle. The first request after an unload transparently reloads the model and is served once the model is ready.

Notes:
- Only inference requests reset the idle timer; status/metrics/health endpoints do not keep a model loaded.
- The first request after an idle unload pays the reload latency. Combine with [model caching](../model_cache.md) (`--cache_dir`) so the reload is a fast cache import rather than a full recompile.
- The graph is still reported as `AVAILABLE` while it is idle-unloaded, because it reloads automatically on demand. The `ovms_graph_loaded` metric reports `1` when the graph is loaded and `0` when it is idle-unloaded.
- Supported for LLM continuous-batching graphs. Graphs containing Python nodes are not supported with this setting.

### Caching settings
The value of `cache_size` might have performance and stability implications. It is used for storing LLM model KV cache data. Adjust it based on your environment capabilities, model size and expected level of concurrency.
Expand Down
56 changes: 55 additions & 1 deletion src/dags/pipelinedefinitionstatus.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ const std::string& pipelineDefinitionStateCodeToString(PipelineDefinitionStateCo
{PipelineDefinitionStateCode::LOADING_PRECONDITION_FAILED_REQUIRED_REVALIDATION, "LOADING_PRECONDITION_FAILED_REQUIRED_REVALIDATION"},
{PipelineDefinitionStateCode::AVAILABLE_REQUIRED_REVALIDATION, "AVAILABLE_REQUIRED_REVALIDATION"},
{PipelineDefinitionStateCode::AVAILABLE, "AVAILABLE"},
{PipelineDefinitionStateCode::RETIRED, "RETIRED"}};
{PipelineDefinitionStateCode::RETIRED, "RETIRED"},
{PipelineDefinitionStateCode::UNLOADED, "UNLOADED"}};
return names.at(code);
}

Expand All @@ -62,6 +63,9 @@ StateKeeper BeginState::handle(const RetireEvent& e) const {
throw std::logic_error(INVALID_TRANSITION_MESSAGE);
return {};
}
// Unload request before anything was loaded: nothing to do, the state is kept.
StateKeeper BeginState::handle(const UnloadEvent&) const {
    return StateKeeper{};
}

PipelineDefinitionStateCode ReloadState::getStateCode() const {
return code;
Expand All @@ -84,6 +88,9 @@ StateKeeper ReloadState::handle(const RetireEvent& e) const {
throw std::logic_error(INVALID_TRANSITION_MESSAGE);
return {};
}
// Unload request while a reload is in progress: ignored, the state is kept.
StateKeeper ReloadState::handle(const UnloadEvent&) const {
    return StateKeeper{};
}

PipelineDefinitionStateCode AvailableState::getStateCode() const {
return code;
Expand All @@ -105,6 +112,9 @@ StateChanger<AvailableRequiredRevalidation> AvailableState::handle(const UsedMod
// Retire event: the return type selects RetiredState as the next state.
StateChanger<RetiredState> AvailableState::handle(const RetireEvent&) const {
    return StateChanger<RetiredState>{};
}
// Unload event: the return type selects UnloadedState as the next state.
StateChanger<UnloadedState> AvailableState::handle(const UnloadEvent&) const {
    return StateChanger<UnloadedState>{};
}

PipelineDefinitionStateCode AvailableRequiredRevalidation::getStateCode() const {
return code;
Expand All @@ -124,6 +134,9 @@ StateKeeper AvailableRequiredRevalidation::handle(const UsedModelChangedEvent& e
// Retire event: the return type selects RetiredState as the next state.
StateChanger<RetiredState> AvailableRequiredRevalidation::handle(const RetireEvent&) const {
    return StateChanger<RetiredState>{};
}
// Unload request in AVAILABLE_REQUIRED_REVALIDATION: ignored, the state is kept.
StateKeeper AvailableRequiredRevalidation::handle(const UnloadEvent&) const {
    return StateKeeper{};
}

PipelineDefinitionStateCode LoadingPreconditionFailedState::getStateCode() const {
return code;
Expand All @@ -145,6 +158,10 @@ StateChanger<LoadingFailedLastValidationRequiredRevalidation> LoadingPreconditio
// Retire event: the return type selects RetiredState as the next state.
StateChanger<RetiredState> LoadingPreconditionFailedState::handle(const RetireEvent&) const {
    return StateChanger<RetiredState>{};
}
// Unload event after a failed wake-up reload: fall back to UnloadedState so that
// the next request can retry the reload.
StateChanger<UnloadedState> LoadingPreconditionFailedState::handle(const UnloadEvent&) const {
    return StateChanger<UnloadedState>{};
}

PipelineDefinitionStateCode LoadingFailedLastValidationRequiredRevalidation::getStateCode() const {
return code;
Expand All @@ -164,6 +181,9 @@ StateKeeper LoadingFailedLastValidationRequiredRevalidation::handle(const UsedMo
// Retire event: the return type selects RetiredState as the next state.
StateChanger<RetiredState> LoadingFailedLastValidationRequiredRevalidation::handle(const RetireEvent&) const {
    return StateChanger<RetiredState>{};
}
// Unload request after loading has already failed: ignored, the state is kept.
StateKeeper LoadingFailedLastValidationRequiredRevalidation::handle(const UnloadEvent&) const {
    return StateKeeper{};
}

PipelineDefinitionStateCode RetiredState::getStateCode() const {
return code;
Expand All @@ -187,6 +207,31 @@ StateKeeper RetiredState::handle(const RetireEvent& e) const {
throw std::logic_error(INVALID_TRANSITION_MESSAGE);
return {};
}
// Unload request on an already retired pipeline: ignored, the state is kept.
StateKeeper RetiredState::handle(const UnloadEvent&) const {
    return StateKeeper{};
}

// Returns the state code associated with UnloadedState (its static `code` member).
PipelineDefinitionStateCode UnloadedState::getStateCode() const {
    return UnloadedState::code;
}
// Wake-up: a reload event takes the pipeline through the regular reload path.
StateChanger<ReloadState> UnloadedState::handle(const ReloadEvent&) const {
    return StateChanger<ReloadState>{};
}
// Config removal while unloaded: the return type selects RetiredState as the next state.
StateChanger<RetiredState> UnloadedState::handle(const RetireEvent&) const {
    return StateChanger<RetiredState>{};
}
// Defensive path: if validation passes directly while unloaded, go to AvailableState.
StateChanger<AvailableState> UnloadedState::handle(const ValidationPassedEvent&) const {
    return StateChanger<AvailableState>{};
}
// Validation failure while unloaded: ignored, the state is kept.
StateKeeper UnloadedState::handle(const ValidationFailedEvent&) const {
    return StateKeeper{};
}
// Used-model change while unloaded: ignored, the state is kept.
StateKeeper UnloadedState::handle(const UsedModelChangedEvent&) const {
    return StateKeeper{};
}
// Repeated unload request: already unloaded, so handling is idempotent and the state is kept.
StateKeeper UnloadedState::handle(const UnloadEvent&) const {
    return StateKeeper{};
}

// Passes the definition type and name straight through to the MachineState base.
PipelineDefinitionStatus::PipelineDefinitionStatus(const std::string& type, const std::string& name) :
    MachineState(type, name) {
}
Expand Down Expand Up @@ -233,6 +278,15 @@ std::tuple<ModelVersionState, ModelVersionStatusErrorCode> PipelineDefinitionSta
ModelVersionState::END,
ModelVersionStatusErrorCode::OK};

case PipelineDefinitionStateCode::UNLOADED:
// Report AVAILABLE: the graph auto-reloads on the next inference request,
// so health checks and routing should treat it as available. Reporting END
// or UNLOADING would cause clients and load-balancers to permanently
// exclude this servable from their pools.
return {
ModelVersionState::AVAILABLE,
ModelVersionStatusErrorCode::OK};

default:
return {};
}
Expand Down
37 changes: 35 additions & 2 deletions src/dags/pipelinedefinitionstatus.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ enum class PipelineDefinitionStateCode {
LOADING_PRECONDITION_FAILED_REQUIRED_REVALIDATION,
AVAILABLE_REQUIRED_REVALIDATION,
AVAILABLE,
RETIRED
RETIRED,
UNLOADED
};

const std::string& pipelineDefinitionStateCodeToString(PipelineDefinitionStateCode code);
Expand Down Expand Up @@ -112,6 +113,11 @@ struct LoadingFailedLastValidationRequiredRevalidation;
* State in which pipeline is retired - removed from config
*/
struct RetiredState;
/**
* State in which pipeline is idle-unloaded (resources freed) but not retired.
* Auto-reloads on the next inference request.
*/
struct UnloadedState;

#define EVENT_STRUCT_WITH_NAME(x) \
struct x { \
Expand All @@ -131,6 +137,7 @@ EVENT_STRUCT_WITH_NAME(ValidationFailedEvent);
EVENT_STRUCT_WITH_NAME(ValidationPassedEvent);
EVENT_STRUCT_WITH_NAME(UsedModelChangedEvent);
EVENT_STRUCT_WITH_NAME(RetireEvent);
EVENT_STRUCT_WITH_NAME(UnloadEvent);

template <typename State>
struct StateChanger {
Expand All @@ -155,6 +162,7 @@ struct BeginState {
StateChanger<LoadingPreconditionFailedState> handle(const ValidationFailedEvent& e) const;
StateKeeper handle(const UsedModelChangedEvent& e) const;
StateKeeper handle(const RetireEvent& e) const;
StateKeeper handle(const UnloadEvent& e) const;
};

struct ReloadState {
Expand All @@ -165,6 +173,7 @@ struct ReloadState {
StateChanger<LoadingPreconditionFailedState> handle(const ValidationFailedEvent& e) const;
StateKeeper handle(const UsedModelChangedEvent& e) const;
StateKeeper handle(const RetireEvent& e) const;
StateKeeper handle(const UnloadEvent& e) const;
};

struct AvailableState {
Expand All @@ -175,6 +184,7 @@ struct AvailableState {
StateKeeper handle(const ValidationFailedEvent& e) const;
StateChanger<AvailableRequiredRevalidation> handle(const UsedModelChangedEvent& e) const;
StateChanger<RetiredState> handle(const RetireEvent& e) const;
StateChanger<UnloadedState> handle(const UnloadEvent& e) const;
};

struct AvailableRequiredRevalidation {
Expand All @@ -185,6 +195,7 @@ struct AvailableRequiredRevalidation {
StateChanger<LoadingPreconditionFailedState> handle(const ValidationFailedEvent& e) const;
StateKeeper handle(const UsedModelChangedEvent& e) const;
StateChanger<RetiredState> handle(const RetireEvent& e) const;
StateKeeper handle(const UnloadEvent& e) const;
};

struct LoadingPreconditionFailedState {
Expand All @@ -195,6 +206,11 @@ struct LoadingPreconditionFailedState {
StateKeeper handle(const ValidationFailedEvent& e) const;
StateChanger<LoadingFailedLastValidationRequiredRevalidation> handle(const UsedModelChangedEvent& e) const;
StateChanger<RetiredState> handle(const RetireEvent& e) const;
// A failed wake-up reload of an idle graph reverts to UNLOADED so the next
// inference request can retry the wake (self-healing once the underlying issue
// is resolved). Only wakeUpIfUnloaded() sends UnloadEvent from this state;
// the watcher's unload() only does so from AVAILABLE.
StateChanger<UnloadedState> handle(const UnloadEvent& e) const;
};

struct LoadingFailedLastValidationRequiredRevalidation {
Expand All @@ -205,6 +221,7 @@ struct LoadingFailedLastValidationRequiredRevalidation {
StateChanger<LoadingPreconditionFailedState> handle(const ValidationFailedEvent& e) const;
StateKeeper handle(const UsedModelChangedEvent& e) const;
StateChanger<RetiredState> handle(const RetireEvent& e) const;
StateKeeper handle(const UnloadEvent& e) const;
};

struct RetiredState {
Expand All @@ -215,9 +232,25 @@ struct RetiredState {
StateChanger<LoadingPreconditionFailedState> handle(const ValidationFailedEvent& e) const;
StateKeeper handle(const UsedModelChangedEvent& e) const;
StateKeeper handle(const RetireEvent& e) const;
StateKeeper handle(const UnloadEvent& e) const;
};

/**
 * Idle-unloaded pipeline state.
 * The return type of each handle() overload names the state entered for that
 * event; overloads returning StateKeeper are no-ops in UNLOADED.
 */
struct UnloadedState {
    static const PipelineDefinitionStateCode code = PipelineDefinitionStateCode::UNLOADED;
    PipelineDefinitionStateCode getStateCode() const;

    // --- Events that leave UNLOADED ---
    // Wake-up goes through the regular reload path.
    StateChanger<ReloadState> handle(const ReloadEvent& e) const;
    // Defensive: should validation pass after an unload, return to AVAILABLE.
    StateChanger<AvailableState> handle(const ValidationPassedEvent& e) const;
    // The pipeline was removed from the config while unloaded.
    StateChanger<RetiredState> handle(const RetireEvent& e) const;

    // --- Events that are no-ops in UNLOADED ---
    StateKeeper handle(const UnloadEvent& e) const;
    StateKeeper handle(const UsedModelChangedEvent& e) const;
    StateKeeper handle(const ValidationFailedEvent& e) const;
};

class PipelineDefinitionStatus : public MachineState<BeginState, ReloadState, AvailableState, AvailableRequiredRevalidation, LoadingPreconditionFailedState, LoadingFailedLastValidationRequiredRevalidation, RetiredState> {
class PipelineDefinitionStatus : public MachineState<BeginState, ReloadState, AvailableState, AvailableRequiredRevalidation, LoadingPreconditionFailedState, LoadingFailedLastValidationRequiredRevalidation, RetiredState, UnloadedState> {
public:
PipelineDefinitionStatus(const std::string& type, const std::string& name);
bool isAvailable() const;
Expand Down
9 changes: 9 additions & 0 deletions src/mediapipe_internal/mediapipegraphconfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,15 @@ Status MediapipeGraphConfig::parseNode(const rapidjson::Value& v) {
this->setSubconfigPath(DEFAULT_SUBCONFIG_FILENAME);
this->setModelMeshSubconfigPath(DEFAULT_MODELMESH_SUBCONFIG_FILENAME);
}
// Optional per-graph idle unload timeout, in seconds. A negative or non-integer
// value is rejected with JSON_INVALID.
if (v.HasMember("idle_unload_timeout_seconds")) {
    const auto& timeoutValue = v["idle_unload_timeout_seconds"];
    // GetInt() asserts inside RapidJSON when the value is not a 32-bit integer
    // (string, float, bool, or an out-of-range number). Check the type first so
    // that malformed input is reported as a config error instead.
    // NOTE(review): upstream schema validation may already guarantee this - confirm.
    if (!timeoutValue.IsInt()) {
        SPDLOG_ERROR("idle_unload_timeout_seconds must be an integer for mediapipe graph: {}", this->getGraphName());
        return StatusCode::JSON_INVALID;
    }
    const int timeoutSeconds = timeoutValue.GetInt();
    if (timeoutSeconds < 0) {
        SPDLOG_ERROR("idle_unload_timeout_seconds must be >= 0 for mediapipe graph: {}", this->getGraphName());
        return StatusCode::JSON_INVALID;
    }
    this->setIdleUnloadTimeoutSeconds(timeoutSeconds);
    SPDLOG_DEBUG("Mediapipe graph {} idle_unload_timeout_seconds set to {}", this->getGraphName(), timeoutSeconds);
}
} catch (std::logic_error& e) {
SPDLOG_DEBUG("Relative path error: {}", e.what());
return StatusCode::INTERNAL_ERROR;
Expand Down
16 changes: 16 additions & 0 deletions src/mediapipe_internal/mediapipegraphconfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ class MediapipeGraphConfig {
*/
GraphQueueSizeValue graphQueueSize;

/**
* @brief Idle unload timeout in seconds.
* 0 (default) = feature disabled.
* When > 0, the graph's heavy resources are freed after this many seconds
* of zero in-flight requests, and lazily reloaded on the next inference.
*/
int idleUnloadTimeoutSeconds = 0;

public:
MediapipeGraphConfig(const std::string& graphName = "",
const std::string& basePath = "",
Expand Down Expand Up @@ -195,6 +203,14 @@ class MediapipeGraphConfig {
return std::get<int>(*this->graphQueueSize);
}

int getIdleUnloadTimeoutSeconds() const {
return this->idleUnloadTimeoutSeconds;
}

void setIdleUnloadTimeoutSeconds(int seconds) {
this->idleUnloadTimeoutSeconds = seconds;
}

bool isReloadRequired(const MediapipeGraphConfig& rhs) const;

/**
Expand Down
Loading