File tree Expand file tree Collapse file tree 5 files changed +77
-6
lines changed Expand file tree Collapse file tree 5 files changed +77
-6
lines changed Original file line number Diff line number Diff line change @@ -22,6 +22,7 @@ limitations under the License.
2222#include " core/common/metrics.h"
2323#include " framework/parallel_state/parallel_args.h"
2424#include " framework/parallel_state/parallel_state.h"
25+ #include " util/env_var.h"
2526#include " util/timer.h"
2627#include " worker.h"
2728
@@ -60,8 +61,15 @@ DiTEngine::DiTEngine(const runtime::Options& options) : options_(options) {
6061 for (auto & worker : workers_) {
6162 futures.emplace_back (worker->process_group_test_async ());
6263 }
63- // wait up to 4 seconds for all futures to complete
64- folly::collectAll (futures).within (std::chrono::seconds (4 )).get ();
64+ // Wait for all futures to complete with a configurable timeout.
65+ // The timeout can be adjusted via the
66+ // XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4
67+ // seconds). This is particularly important in multi-node multi-device
68+ // scenarios where network latency may require a longer timeout period.
69+ const int timeout_seconds = util::get_process_group_test_timeout_seconds ();
70+ folly::collectAll (futures)
71+ .within (std::chrono::seconds (timeout_seconds))
72+ .get ();
6573 }
6674}
6775
Original file line number Diff line number Diff line change @@ -36,6 +36,7 @@ limitations under the License.
3636#include " llm_worker_impl.h"
3737#include " runtime/worker.h"
3838#include " server/xllm_server_registry.h"
39+ #include " util/env_var.h"
3940#include " util/pretty_print.h"
4041#include " util/utils.h"
4142
@@ -106,8 +107,15 @@ void LLMEngine::process_group_test() {
106107 for (auto & worker : worker_clients_) {
107108 futures.emplace_back (worker->process_group_test_async ());
108109 }
109- // wait up to 4 seconds for all futures to complete
110- folly::collectAll (futures).within (std::chrono::seconds (4 )).get ();
110+ // Wait for all futures to complete with a configurable timeout.
111+ // The timeout can be adjusted via the
112+ // XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4
113+ // seconds). This is particularly important in multi-node multi-device
114+ // scenarios where network latency may require a longer timeout period.
115+ const int timeout_seconds = util::get_process_group_test_timeout_seconds ();
116+ folly::collectAll (futures)
117+ .within (std::chrono::seconds (timeout_seconds))
118+ .get ();
111119 }
112120#endif
113121}
Original file line number Diff line number Diff line change @@ -27,6 +27,7 @@ limitations under the License.
2727#include " framework/model/model_args.h"
2828#include " framework/model_loader.h"
2929#include " framework/parallel_state/parallel_state.h"
30+ #include " util/env_var.h"
3031#include " util/pretty_print.h"
3132#include " util/utils.h"
3233#include " worker.h"
@@ -75,8 +76,16 @@ void VLMEngine::process_group_test() {
7576 for (auto & worker : workers_) {
7677 futures.emplace_back (worker->process_group_test_async ());
7778 }
78- // wait up to 4 seconds for all futures to complete
79- folly::collectAll (futures).within (std::chrono::seconds (4 )).get ();
79+ // Wait for all futures to complete with a configurable timeout.
80+ // The timeout can be adjusted via the
81+ // XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4
82+ // seconds). This is particularly important in multi-node multi-device
83+ // communication scenarios where network latency may require a longer
84+ // timeout period.
85+ const int timeout_seconds = util::get_process_group_test_timeout_seconds ();
86+ folly::collectAll (futures)
87+ .within (std::chrono::seconds (timeout_seconds))
88+ .get ();
8089 }
8190#endif
8291}
Original file line number Diff line number Diff line change @@ -15,7 +15,9 @@ limitations under the License.
1515
1616#include " env_var.h"
1717
18+ #include < climits>
1819#include < cstdlib>
20+ #include < cstring>
1921
2022namespace xllm {
2123namespace util {
@@ -30,5 +32,35 @@ bool get_bool_env(const std::string& key, bool defaultValue) {
3032 strVal == " True" );
3133}
3234
35+ int get_int_env (const std::string& key, int defaultValue) {
36+ const char * val = std::getenv (key.c_str ());
37+ if (val == nullptr ) {
38+ return defaultValue;
39+ }
40+ // Use strtol for proper error handling
41+ char * endptr;
42+ long int result = std::strtol (val, &endptr, 10 );
43+ // Check if conversion was successful (endptr points to end of string or valid
44+ // terminator)
45+ if (endptr == val || *endptr != ' \0 ' ) {
46+ return defaultValue;
47+ }
48+ // Check for overflow/underflow
49+ if (result < INT_MIN || result > INT_MAX) {
50+ return defaultValue;
51+ }
52+ return static_cast <int >(result);
53+ }
54+
55+ int get_process_group_test_timeout_seconds () {
56+ // Default timeout is 4 seconds, but can be overridden via environment
57+ // variable to accommodate multi-node multi-device communication scenarios
58+ // where network latency may require a longer timeout period.
59+ constexpr int kDefaultTimeoutSeconds = 4 ;
60+ constexpr const char * kTimeoutEnvVar =
61+ " XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS" ;
62+ return get_int_env (kTimeoutEnvVar , kDefaultTimeoutSeconds );
63+ }
64+
3365} // namespace util
3466} // namespace xllm
Original file line number Diff line number Diff line change @@ -22,5 +22,19 @@ namespace util {
2222
2323bool get_bool_env (const std::string& key, bool defaultValue);
2424
25+ // Get an integer value from an environment variable.
26+ // Returns the default value if the environment variable is not set or cannot be
27+ // parsed.
28+ int get_int_env (const std::string& key, int defaultValue);
29+
30+ // Get the timeout in seconds for process group test operations.
31+ // This timeout is used when waiting for process group initialization tests to
32+ // complete in multi-device/multi-node scenarios. The default value is 4
33+ // seconds, but can be overridden by setting the
34+ // XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable. This is
35+ // particularly useful in multi-node multi-device communication scenarios where
36+ // network latency may cause the default 4-second timeout to be insufficient.
37+ int get_process_group_test_timeout_seconds ();
38+
2539} // namespace util
2640} // namespace xllm
You can’t perform that action at this time.
0 commit comments