Skip to content

Commit 4533086

Browse files
committed
feat: support graceful termination of the xllm process via the SIGTERM/SIGINT signal.
Signed-off-by: Tao Peng <pengtao.156@jd.com>
1 parent e6010a7 commit 4533086

File tree

7 files changed

+66
-3
lines changed

7 files changed

+66
-3
lines changed

xllm/core/distributed_runtime/worker_server.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ limitations under the License.
2424
#include <torch/torch.h>
2525
#include <unistd.h>
2626

27+
#include <cstdlib>
2728
#include <memory>
2829
#include <optional>
2930
#include <utility>
@@ -50,6 +51,9 @@ limitations under the License.
5051
extern char** environ;
5152

5253
namespace xllm {
54+
namespace {
55+
// Signal handler: terminates the current process right away with exit
// status 0. It calls _exit() (POSIX lists it as async-signal-safe); the
// delivered signal number is not inspected.
void handle_signal([[maybe_unused]] int signum) {
  constexpr int kExitStatus = 0;
  _exit(kExitStatus);
}
56+
} // namespace
5357

5458
void WorkerServer::create_server(
5559
const runtime::Options& options,
@@ -217,6 +221,10 @@ WorkerServer::WorkerServer(int local_worker_idx,
217221
local_worker_idx, master_node_addr, done, parallel_args, d, options);
218222
return;
219223
} else {
224+
// The worker process should handle the SIGTERM and SIGINT signals.
225+
signal(SIGINT, handle_signal);
226+
signal(SIGTERM, handle_signal);
227+
220228
std::unique_ptr<ForwardSharedMemoryManager> input_shm_manager = nullptr;
221229
std::unique_ptr<ForwardSharedMemoryManager> output_shm_manager = nullptr;
222230
prepare_shm(

xllm/core/runtime/master.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,20 @@ limitations under the License.
4545
#include <pybind11/pybind11.h>
4646
#endif
4747

48+
namespace brpc {
49+
DECLARE_bool(graceful_quit_on_sigterm);
50+
DECLARE_bool(graceful_quit_on_sighup);
51+
} // namespace brpc
52+
4853
namespace xllm {
4954

5055
Master::Master(const Options& options, EngineType type) : options_(options) {
5156
LOG(INFO) << "Master init options: " << options.to_string();
5257

58+
// Allow brpc to quit gracefully on SIGTERM and SIGHUP (enabled by the two flags below).
59+
brpc::FLAGS_graceful_quit_on_sigterm = true;
60+
brpc::FLAGS_graceful_quit_on_sighup = true;
61+
5362
#if defined(USE_NPU)
5463
if (options.rank_tablefile().has_value()) {
5564
FLAGS_rank_tablefile = options.rank_tablefile().value();

xllm/core/runtime/vlm_engine.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ limitations under the License.
2121

2222
#include <algorithm>
2323
#include <boost/algorithm/string.hpp>
24+
#include <cstdlib>
2425
#include <memory>
2526

2627
#include "common/metrics.h"
@@ -34,7 +35,16 @@ limitations under the License.
3435

3536
namespace xllm {
3637

38+
namespace {
39+
// Signal handler: ends the process immediately with exit status 0 via
// _exit(), which POSIX lists as async-signal-safe. The signal number that
// triggered the handler is ignored.
void handle_signal([[maybe_unused]] int signum) {
  constexpr int kExitStatus = 0;
  _exit(kExitStatus);
}
40+
} // namespace
41+
3742
VLMEngine::VLMEngine(const runtime::Options& options) : options_(options) {
43+
// The worker process should handle the SIGTERM and SIGINT signals.
44+
// TODO: delete this code once the multi-process implementation is supported.
45+
signal(SIGINT, handle_signal);
46+
signal(SIGTERM, handle_signal);
47+
3848
const auto& devices = options_.devices();
3949
CHECK_GT(devices.size(), 0) << "At least one device is required";
4050

xllm/pybind/embedding.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ def __init__(
3434
is_local: bool = True,
3535
**kwargs,
3636
) -> None:
37+
signal.signal(signal.SIGTERM, lambda s, f: sys.exit(0))
38+
signal.signal(signal.SIGINT, lambda s, f: sys.exit(0))
39+
3740
if not os.path.exists(model):
3841
raise ValueError(f"model {model} not exists")
3942

@@ -79,7 +82,8 @@ def __init__(
7982
def finish(self):
8083
try:
8184
#os.kill(os.getpid(), signal.SIGTERM)
82-
os.kill(os.getpid(), signal.SIGKILL)
85+
#os.kill(os.getpid(), signal.SIGKILL)
86+
util.terminate_process(os.getpid())
8387
except Exception as e:
8488
pass
8589

xllm/pybind/llm.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ def __init__(
4747
is_local: bool = True,
4848
**kwargs,
4949
) -> None:
50+
signal.signal(signal.SIGTERM, lambda s, f: sys.exit(0))
51+
signal.signal(signal.SIGINT, lambda s, f: sys.exit(0))
5052

5153
if not os.path.exists(model):
5254
raise ValueError(f"model {model} not exists")
@@ -102,7 +104,8 @@ def __init__(
102104
def finish(self):
103105
try:
104106
#os.kill(os.getpid(), signal.SIGTERM)
105-
os.kill(os.getpid(), signal.SIGKILL)
107+
#os.kill(os.getpid(), signal.SIGKILL)
108+
util.terminate_process(os.getpid())
106109
except Exception as e:
107110
pass
108111

xllm/pybind/util.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,30 @@
1+
import os
2+
import psutil
3+
import signal
14
import socket
5+
import sys
6+
7+
def terminate_process(pid, timeout=30):
    """Terminate the process ``pid`` together with all of its descendants.

    Descendants are stopped first and the target process last. Each group is
    sent ``terminate()``, waited on, and any process still alive afterwards
    is sent ``kill()``.

    The target must be signalled last because callers may pass their own
    pid: once the calling process has terminated itself it can no longer
    wait for, or force-kill, children that ignore the terminate request.

    Args:
        pid: Id of the root process to stop.
        timeout: Total number of seconds to wait for processes to exit
            before killing them; ``None`` waits indefinitely.

    Returns:
        None. Returns silently if ``pid`` no longer exists.
    """
    import time  # local import: only needed for the shared deadline

    try:
        parent = psutil.Process(pid)
        children = parent.children(recursive=True)
    except psutil.NoSuchProcess:
        return

    # One deadline shared by both phases keeps the worst-case wait at
    # `timeout` seconds in total.
    deadline = None if timeout is None else time.monotonic() + timeout

    def _stop(procs):
        # Ask politely first; processes that already exited are ignored.
        for p in procs:
            try:
                p.terminate()
            except psutil.NoSuchProcess:
                pass

        remaining = None
        if deadline is not None:
            remaining = max(0.0, deadline - time.monotonic())

        # Force-kill whatever did not exit in time.
        _, alive = psutil.wait_procs(procs, timeout=remaining)
        for p in alive:
            try:
                p.kill()
            except psutil.NoSuchProcess:
                pass

    # Descendants first, so they are fully handled even when `pid` is the
    # calling process and the next step ends it.
    _stop(children)
    _stop([parent])
228

329
def get_free_port():
430
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:

xllm/pybind/vlm.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ def __init__(
4545
is_local: bool = True,
4646
**kwargs,
4747
) -> None:
48+
signal.signal(signal.SIGTERM, lambda s, f: sys.exit(0))
49+
signal.signal(signal.SIGINT, lambda s, f: sys.exit(0))
4850

4951
if not os.path.exists(model):
5052
raise ValueError(f"model {model} not exists")
@@ -97,7 +99,8 @@ def __init__(
9799
def finish(self):
98100
try:
99101
#os.kill(os.getpid(), signal.SIGTERM)
100-
os.kill(os.getpid(), signal.SIGKILL)
102+
#os.kill(os.getpid(), signal.SIGKILL)
103+
util.terminate_process(os.getpid())
101104
except Exception as e:
102105
pass
103106

0 commit comments

Comments
 (0)