Commit b9751b1
Add a CUDA memory tracker and use it in voxtral runner (#15780)
This PR adds a CUDA memory tracker and integrates it into the LLM runner's `Stats`. It adds the `gpu_total_bytes`, `gpu_free_before_load_bytes`, `gpu_free_after_load_bytes`, `gpu_free_after_generate_bytes`, and `gpu_peak_usage_mb` fields. Sample log output:

```
PyTorchObserver {"prompt_tokens":387,"generated_tokens":68,"model_load_start_ms":1762976881583,"model_load_end_ms":1762976883487,"inference_start_ms":1762976887396,"inference_end_ms":1762976888589,"prompt_eval_end_ms":1762976887815,"first_token_ms":1762976887815,"aggregate_sampling_time_ms":17,"gpu_total_bytes":17094475776,"gpu_free_before_load_bytes":15589179392,"gpu_free_after_load_bytes":11455692800,"gpu_free_after_generate_bytes":10530848768,"gpu_peak_usage_mb":4824,"SCALING_FACTOR_UNITS_PER_SECOND":1000}
```
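As a quick sanity check on the log above (an illustrative sketch, not part of the commit): `gpu_peak_usage_mb` is the drop from the free-memory baseline taken before load to the minimum free memory observed, converted to MiB. In this run the post-generate sample is the minimum, so the numbers line up exactly.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Values copied from the sample log above.
  const uint64_t free_before_load = 15589179392ULL; // gpu_free_before_load_bytes
  const uint64_t min_free = 10530848768ULL;         // gpu_free_after_generate_bytes
  const double peak_mb =
      static_cast<double>(free_before_load - min_free) / (1024.0 * 1024.0);
  std::printf("peak: %.0f MB\n", peak_mb); // prints "peak: 4824 MB"
  return 0;
}
```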
1 parent 9981e41 commit b9751b1

File tree

5 files changed: +318 −4 lines
backends/cuda/runtime/memory_tracker.h

Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <cuda_runtime.h>
#include <algorithm>
#include <cstdint>
#include <limits>

#include <executorch/runtime/platform/log.h>

namespace executorch::backends::cuda {

/**
 * @class CudaMemoryTracker
 * @brief Tracks CUDA memory usage and logs memory state at key points.
 *
 * This class provides utilities to query and track CUDA memory usage,
 * including peak memory usage and detailed memory state logging.
 */
class CudaMemoryTracker {
 public:
  /**
   * @brief Constructor - initializes the tracker and logs startup memory
   * state.
   */
  CudaMemoryTracker() {
    if (!query(&last_free_bytes_, &total_bytes_)) {
      return;
    }
    available_ = true;
    // Record the initial free bytes observed at startup. We'll use this as a
    // baseline so reported "peak usage" reflects additional memory used
    // since the tracker was created (instead of the absolute device usage,
    // which may include other processes).
    initial_free_bytes_ = last_free_bytes_;
    min_free_bytes_ = last_free_bytes_;
    log_state("startup", last_free_bytes_, total_bytes_);
  }

  /**
   * @brief Logs the current memory state at a tagged checkpoint.
   * @param tag Descriptive tag for this memory sample (e.g., "after_load")
   */
  void log_sample(const char* tag) {
    if (!available_) {
      return;
    }
    size_t free_bytes = 0;
    size_t total_bytes = 0;
    if (!query(&free_bytes, &total_bytes)) {
      return;
    }
    min_free_bytes_ = std::min(min_free_bytes_, free_bytes);
    total_bytes_ = total_bytes;
    last_free_bytes_ = free_bytes;
    log_state(tag, free_bytes, total_bytes);
  }

  /**
   * @brief Destructor - logs the final memory state and peak usage summary.
   */
  ~CudaMemoryTracker() {
    if (!available_) {
      return;
    }
    size_t free_bytes = 0;
    size_t total_bytes = 0;
    if (!query(&free_bytes, &total_bytes)) {
      return;
    }
    min_free_bytes_ = std::min(min_free_bytes_, free_bytes);
    total_bytes_ = total_bytes;
    last_free_bytes_ = free_bytes;
    // Compute peak usage relative to the initial free baseline so that
    // allocations by other processes present at startup are not attributed
    // to this process. If for some reason initial_free_bytes_ was not set,
    // fall back to absolute device usage.
    double peak_mb = 0.0;
    if (initial_free_bytes_ != std::numeric_limits<size_t>::max()) {
      size_t used_delta = 0;
      if (initial_free_bytes_ > min_free_bytes_) {
        used_delta = initial_free_bytes_ - min_free_bytes_;
      }
      peak_mb = static_cast<double>(used_delta) / (1024.0 * 1024.0);
    } else {
      peak_mb = static_cast<double>(total_bytes_ - min_free_bytes_) /
          (1024.0 * 1024.0);
    }
    const double total_mb =
        static_cast<double>(total_bytes_) / (1024.0 * 1024.0);
    ET_LOG(
        Info,
        "CUDA memory peak usage (since startup): %.2f MB, device total: %.2f MB",
        peak_mb,
        total_mb);
  }

 private:
  /**
   * @brief Queries current CUDA memory info.
   * @param free_bytes Output parameter for free memory in bytes
   * @param total_bytes Output parameter for total memory in bytes
   * @return true if the query succeeded, false otherwise
   */
  bool query(size_t* free_bytes, size_t* total_bytes) {
    cudaError_t err = cudaMemGetInfo(free_bytes, total_bytes);
    if (err != cudaSuccess) {
      if (!error_logged_) {
        error_logged_ = true;
        ET_LOG(
            Error,
            "cudaMemGetInfo failed with error: %s",
            cudaGetErrorString(err));
      }
      available_ = false;
      return false;
    }
    return true;
  }

  /**
   * @brief Logs the current memory state.
   * @param tag Tag describing this log point
   * @param free_bytes Current free memory in bytes
   * @param total_bytes Current total memory in bytes
   */
  void log_state(const char* tag, size_t free_bytes, size_t total_bytes) const {
    const double used_mb =
        static_cast<double>(total_bytes - free_bytes) / (1024.0 * 1024.0);
    const double free_mb = static_cast<double>(free_bytes) / (1024.0 * 1024.0);
    const double total_mb =
        static_cast<double>(total_bytes) / (1024.0 * 1024.0);
    ET_LOG(
        Info,
        "CUDA memory (%s): used %.2f MB, free %.2f MB, total %.2f MB",
        tag,
        used_mb,
        free_mb,
        total_mb);
  }

  bool available_{false};
  bool error_logged_{false};
  size_t last_free_bytes_{0};
  size_t total_bytes_{0};
  size_t min_free_bytes_{std::numeric_limits<size_t>::max()};
  // Baseline free bytes observed at tracker construction. Used to compute
  // peak usage attributable to this process since the tracker started.
  size_t initial_free_bytes_{std::numeric_limits<size_t>::max()};

 public:
  // Simple accessors to allow other components to read last-sampled values.
  // These are safe to call after a successful log_sample() invocation.
  uint64_t last_free_bytes() const {
    return static_cast<uint64_t>(last_free_bytes_);
  }
  uint64_t total_bytes() const {
    return static_cast<uint64_t>(total_bytes_);
  }
  uint64_t min_free_bytes() const {
    return static_cast<uint64_t>(min_free_bytes_);
  }
  uint64_t initial_free_bytes() const {
    return static_cast<uint64_t>(initial_free_bytes_);
  }
  double peak_usage_mb() const {
    // Prefer peak relative to the initial free baseline; fall back to
    // absolute device peak if the baseline isn't available.
    if (min_free_bytes_ == std::numeric_limits<size_t>::max()) {
      return 0.0;
    }
    if (initial_free_bytes_ != std::numeric_limits<size_t>::max()) {
      size_t used_delta = 0;
      if (initial_free_bytes_ > min_free_bytes_) {
        used_delta = initial_free_bytes_ - min_free_bytes_;
      }
      return static_cast<double>(used_delta) / (1024.0 * 1024.0);
    }
    if (total_bytes_ == 0) {
      return 0.0;
    }
    return static_cast<double>(total_bytes_ - min_free_bytes_) /
        (1024.0 * 1024.0);
  }
};

} // namespace executorch::backends::cuda
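A minimal usage sketch of the lifecycle this header implies (illustrative only; `run_model` is a hypothetical function): construct the tracker before any GPU work so the baseline is captured, sample at checkpoints, read the accessors, and let the destructor log the final summary.

```cpp
#include <cstdint>

#include <executorch/backends/cuda/runtime/memory_tracker.h>

using executorch::backends::cuda::CudaMemoryTracker;

void run_model() {
  // Constructor calls cudaMemGetInfo and logs the "startup" state; the free
  // bytes seen here become the baseline for peak_usage_mb().
  CudaMemoryTracker tracker;

  // ... load the model ...
  tracker.log_sample("after_load");
  const uint64_t free_after_load = tracker.last_free_bytes();

  // ... run generation ...
  tracker.log_sample("after_generate");
  const double peak_mb = tracker.peak_usage_mb(); // delta vs. startup baseline

  (void)free_after_load;
  (void)peak_mb;
}  // Destructor takes a final sample and logs the peak-usage summary.
```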

extension/llm/runner/CMakeLists.txt

Lines changed: 19 additions & 0 deletions
@@ -55,6 +55,25 @@ target_include_directories(
   extension_llm_runner INTERFACE ${_common_include_directories}
 )

+# If the project is configured to build with CUDA support, try to find a CUDA
+# runtime via the CMake CUDAToolkit package. If found, expose a compile-time
+# macro so sources can conditionally compile CUDA-aware code.
+if(EXECUTORCH_BUILD_CUDA)
+  # If the CUDAToolkit package isn't available, log a status message and
+  # build without CUDA memory tracking.
+  find_package(CUDAToolkit QUIET)
+  if(CUDAToolkit_FOUND)
+    target_compile_definitions(extension_llm_runner PUBLIC CUDA_AVAILABLE)
+    target_link_libraries(extension_llm_runner PUBLIC CUDA::cudart)
+    message(STATUS "CUDAToolkit found; defining CUDA_AVAILABLE")
+  else()
+    message(
+      STATUS
+        "CUDA requested (EXECUTORCH_BUILD_CUDA=ON) but no CUDA runtime found"
+    )
+  endif()
+endif()
+
 install(
   TARGETS extension_llm_runner
   EXPORT ExecuTorchTargets
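Because both the macro and the cudart link are PUBLIC, targets that link `extension_llm_runner` inherit them, so downstream sources can guard all tracking code and have it compile out entirely on non-CUDA builds. A sketch of that pattern (`report_checkpoint` is a hypothetical name; the actual guards appear in the runner files below):

```cpp
#ifdef CUDA_AVAILABLE
#include <executorch/backends/cuda/runtime/memory_tracker.h>
#endif

void report_checkpoint() {
#ifdef CUDA_AVAILABLE
  // Compiled only when CMake found CUDAToolkit and defined CUDA_AVAILABLE.
  executorch::backends::cuda::CudaMemoryTracker tracker;
  tracker.log_sample("checkpoint");
#endif
  // On builds without a CUDA runtime this function is an empty no-op.
}
```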

extension/llm/runner/multimodal_runner.cpp

Lines changed: 33 additions & 3 deletions
@@ -15,6 +15,10 @@
 #include <pytorch/tokenizers/hf_tokenizer.h>
 #include <pytorch/tokenizers/sentencepiece.h>

+#ifdef CUDA_AVAILABLE
+#include <executorch/backends/cuda/runtime/memory_tracker.h>
+#endif
+
 namespace executorch::extension::llm {

 using ::executorch::extension::Module;
@@ -38,7 +42,16 @@ MultimodalRunner::MultimodalRunner(
       io_manager_(std::move(io_manager)),
       text_token_generator_(std::move(text_token_generator)),
       stats_(std::move(stats)),
-      pos_(0) {}
+      pos_(0) {
+#ifdef CUDA_AVAILABLE
+  cuda_memory_tracker_ =
+      std::make_unique<::executorch::backends::cuda::CudaMemoryTracker>();
+  // Probe immediately after creating the tracker to capture GPU state before
+  // any model loading happens.
+  stats_->gpu_total_bytes = cuda_memory_tracker_->total_bytes();
+  stats_->gpu_free_before_load_bytes = cuda_memory_tracker_->last_free_bytes();
+#endif
+}

 bool MultimodalRunner::is_loaded() {
   return multimodal_prefiller_->is_method_loaded() &&
@@ -49,8 +62,18 @@ Error MultimodalRunner::load() {
   if (is_loaded()) {
     return Error::Ok;
   }
+  stats_->model_load_start_ms = time_in_ms();
   ET_CHECK_OK_OR_RETURN_ERROR(multimodal_prefiller_->load());
   ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
+  stats_->model_load_end_ms = time_in_ms();
+
+#ifdef CUDA_AVAILABLE
+  cuda_memory_tracker_->log_sample("after_load");
+  stats_->gpu_total_bytes = cuda_memory_tracker_->total_bytes();
+  stats_->gpu_free_after_load_bytes = cuda_memory_tracker_->last_free_bytes();
+  stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
+#endif
+
   return Error::Ok;
 }

@@ -86,9 +109,7 @@ Error MultimodalRunner::generate(
   }

   if (!is_loaded()) {
-    stats_->model_load_start_ms = time_in_ms();
     ET_CHECK_OK_OR_RETURN_ERROR(load());
-    stats_->model_load_end_ms = time_in_ms();
   }

   if (config.warming) {
@@ -192,6 +213,15 @@ Error MultimodalRunner::generate(
   stats_->num_generated_tokens = num_generated_tokens;
   // Finalize stats and call callback
   stats_->inference_end_ms = time_in_ms();
+
+#ifdef CUDA_AVAILABLE
+  cuda_memory_tracker_->log_sample("after_generate");
+  stats_->gpu_free_after_generate_bytes =
+      cuda_memory_tracker_->last_free_bytes();
+  // update peak in case it changed after generation
+  stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
+#endif
+
   if (!config.warming) {
     printf("\n");
   }
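The `Stats` additions themselves are in one of the changed files not shown on this page. Judging from the assignments above and the logged JSON, a hypothetical sketch of the new members (field names are from the commit; layout and defaults are assumed):

```cpp
#include <cstdint>

// Hypothetical sketch only; the actual Stats diff is not shown here.
struct Stats {
  // ... existing timing fields (model_load_start_ms, inference_end_ms, ...) ...
  uint64_t gpu_total_bytes = 0;               // device total memory, bytes
  uint64_t gpu_free_before_load_bytes = 0;    // sampled at runner construction
  uint64_t gpu_free_after_load_bytes = 0;     // sampled after load()
  uint64_t gpu_free_after_generate_bytes = 0; // sampled after generate()
  double gpu_peak_usage_mb = 0.0;             // peak delta vs. startup baseline
};
```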

extension/llm/runner/multimodal_runner.h

Lines changed: 9 additions & 0 deletions
@@ -36,6 +36,10 @@
 // These are provided for backward compatibility
 #include <executorch/extension/llm/runner/llm_runner_helper.h>

+#ifdef CUDA_AVAILABLE
+#include <executorch/backends/cuda/runtime/memory_tracker.h>
+#endif
+
 namespace executorch {
 namespace extension {
 namespace llm {
@@ -150,6 +154,11 @@ class ET_EXPERIMENTAL MultimodalRunner {
   std::unique_ptr<TextTokenGenerator> text_token_generator_;
   std::unique_ptr<Stats> stats_;

+#ifdef CUDA_AVAILABLE
+  std::unique_ptr<::executorch::backends::cuda::CudaMemoryTracker>
+      cuda_memory_tracker_;
+#endif
+
   // Internal state
   int64_t pos_;
 };