Skip to content

Commit 04bed3f

Browse files
feat: add timeout env variable for process groups. (#331)
Co-authored-by: phantomlei <phantomlei3@gmail.com>
1 parent 395231d commit 04bed3f

File tree

5 files changed

+77
-6
lines changed

5 files changed

+77
-6
lines changed

xllm/core/runtime/dit_engine.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ limitations under the License.
2222
#include "core/common/metrics.h"
2323
#include "framework/parallel_state/parallel_args.h"
2424
#include "framework/parallel_state/parallel_state.h"
25+
#include "util/env_var.h"
2526
#include "util/timer.h"
2627
#include "worker.h"
2728

@@ -60,8 +61,15 @@ DiTEngine::DiTEngine(const runtime::Options& options) : options_(options) {
6061
for (auto& worker : workers_) {
6162
futures.emplace_back(worker->process_group_test_async());
6263
}
63-
// wait up to 4 seconds for all futures to complete
64-
folly::collectAll(futures).within(std::chrono::seconds(4)).get();
64+
// Wait for all futures to complete with a configurable timeout.
65+
// The timeout can be adjusted via the
66+
// XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4
67+
// seconds). This is particularly important in multi-node multi-device
68+
// scenarios where network latency may require a longer timeout period.
69+
const int timeout_seconds = util::get_process_group_test_timeout_seconds();
70+
folly::collectAll(futures)
71+
.within(std::chrono::seconds(timeout_seconds))
72+
.get();
6573
}
6674
}
6775

xllm/core/runtime/llm_engine.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ limitations under the License.
3636
#include "llm_worker_impl.h"
3737
#include "runtime/worker.h"
3838
#include "server/xllm_server_registry.h"
39+
#include "util/env_var.h"
3940
#include "util/pretty_print.h"
4041
#include "util/utils.h"
4142

@@ -106,8 +107,15 @@ void LLMEngine::process_group_test() {
106107
for (auto& worker : worker_clients_) {
107108
futures.emplace_back(worker->process_group_test_async());
108109
}
109-
// wait up to 4 seconds for all futures to complete
110-
folly::collectAll(futures).within(std::chrono::seconds(4)).get();
110+
// Wait for all futures to complete with a configurable timeout.
111+
// The timeout can be adjusted via the
112+
// XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4
113+
// seconds). This is particularly important in multi-node multi-device
114+
// scenarios where network latency may require a longer timeout period.
115+
const int timeout_seconds = util::get_process_group_test_timeout_seconds();
116+
folly::collectAll(futures)
117+
.within(std::chrono::seconds(timeout_seconds))
118+
.get();
111119
}
112120
#endif
113121
}

xllm/core/runtime/vlm_engine.cpp

100755100644
Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ limitations under the License.
2727
#include "framework/model/model_args.h"
2828
#include "framework/model_loader.h"
2929
#include "framework/parallel_state/parallel_state.h"
30+
#include "util/env_var.h"
3031
#include "util/pretty_print.h"
3132
#include "util/utils.h"
3233
#include "worker.h"
@@ -75,8 +76,16 @@ void VLMEngine::process_group_test() {
7576
for (auto& worker : workers_) {
7677
futures.emplace_back(worker->process_group_test_async());
7778
}
78-
// wait up to 4 seconds for all futures to complete
79-
folly::collectAll(futures).within(std::chrono::seconds(4)).get();
79+
// Wait for all futures to complete with a configurable timeout.
80+
// The timeout can be adjusted via the
81+
// XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4
82+
// seconds). This is particularly important in multi-node multi-device
83+
// communication scenarios where network latency may require a longer
84+
// timeout period.
85+
const int timeout_seconds = util::get_process_group_test_timeout_seconds();
86+
folly::collectAll(futures)
87+
.within(std::chrono::seconds(timeout_seconds))
88+
.get();
8089
}
8190
#endif
8291
}

xllm/core/util/env_var.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@ limitations under the License.
1515

1616
#include "env_var.h"
1717

18+
#include <climits>
1819
#include <cstdlib>
20+
#include <cstring>
1921

2022
namespace xllm {
2123
namespace util {
@@ -30,5 +32,35 @@ bool get_bool_env(const std::string& key, bool defaultValue) {
3032
strVal == "True");
3133
}
3234

35+
int get_int_env(const std::string& key, int defaultValue) {
36+
const char* val = std::getenv(key.c_str());
37+
if (val == nullptr) {
38+
return defaultValue;
39+
}
40+
// Use strtol for proper error handling
41+
char* endptr;
42+
long int result = std::strtol(val, &endptr, 10);
43+
// Check if conversion was successful (endptr points to end of string or valid
44+
// terminator)
45+
if (endptr == val || *endptr != '\0') {
46+
return defaultValue;
47+
}
48+
// Check for overflow/underflow
49+
if (result < INT_MIN || result > INT_MAX) {
50+
return defaultValue;
51+
}
52+
return static_cast<int>(result);
53+
}
54+
55+
int get_process_group_test_timeout_seconds() {
56+
// Default timeout is 4 seconds, but can be overridden via environment
57+
// variable to accommodate multi-node multi-device communication scenarios
58+
// where network latency may require a longer timeout period.
59+
constexpr int kDefaultTimeoutSeconds = 4;
60+
constexpr const char* kTimeoutEnvVar =
61+
"XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS";
62+
return get_int_env(kTimeoutEnvVar, kDefaultTimeoutSeconds);
63+
}
64+
3365
} // namespace util
3466
} // namespace xllm

xllm/core/util/env_var.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,19 @@ namespace util {
2222

2323
bool get_bool_env(const std::string& key, bool defaultValue);
2424

25+
// Get an integer value from an environment variable.
26+
// Returns the default value if the environment variable is not set or cannot be
27+
// parsed.
28+
int get_int_env(const std::string& key, int defaultValue);
29+
30+
// Get the timeout in seconds for process group test operations.
31+
// This timeout is used when waiting for process group initialization tests to
32+
// complete in multi-device/multi-node scenarios. The default value is 4
33+
// seconds, but can be overridden by setting the
34+
// XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable. This is
35+
// particularly useful in multi-node multi-device communication scenarios where
36+
// network latency may cause the default 4-second timeout to be insufficient.
37+
int get_process_group_test_timeout_seconds();
38+
2539
} // namespace util
2640
} // namespace xllm

0 commit comments

Comments
 (0)