|
2 | 2 | Launch the Parallax server.
3 | 3 |
|
4 | 4 | This script is used to launch the Parallax server.
5 | | -It will start the P2P server and the executor.
| 5 | +It will start the following services:
| 6 | + 1. Executor with tp_rank=0, in the main process.
| 7 | + 2. Executors with tp_rank>0, one subprocess per tp_rank.
| 8 | + 3. HTTP server, as a subprocess.
| 9 | + 4. P2P server, as a thread in the main process.
6 | 10 |
|
7 | 11 | Example command: |
8 | 12 | python src/parallax/launch.py \ |
|
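A minimal runnable sketch (not part of this diff) of the process/thread layout the new docstring lists; `serve` is a stand-in for the real executor, HTTP, and P2P entry points:

import multiprocessing
import threading

def serve(name: str) -> None:
    # Stand-in for the real executor / HTTP / P2P entry points.
    print(f"{name} running")

if __name__ == "__main__":
    tp_size = 2  # assumed value for illustration
    procs = [
        multiprocessing.Process(target=serve, args=(f"executor tp_rank={r}",))
        for r in range(1, tp_size)  # service 2: one subprocess per tp_rank > 0
    ]
    procs.append(multiprocessing.Process(target=serve, args=("http server",)))  # service 3
    for p in procs:
        p.start()
    threading.Thread(target=serve, args=("p2p server",), daemon=True).start()  # service 4
    serve("executor tp_rank=0")  # service 1: occupies the main process itself
    for p in procs:
        p.join()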
41 | 45 | gradient_server = None |
42 | 46 | http_server_process = None |
43 | 47 | executor = None |
44 | | - executor_procs = [] |
| 48 | + executor_subprocs = [] |
45 | 49 | try: |
46 | 50 | args = parse_args() |
47 | 51 | set_log_level(args.log_level) |
|
75 | 79 | pp_start_layer=args.start_layer, |
76 | 80 | pp_end_layer=args.end_layer, |
77 | 81 | hidden_layers=config.get("num_hidden_layers"), |
| 82 | + tp_size=args.tp_size, |
78 | 83 | tcp_port=args.tcp_port, |
79 | 84 | udp_port=args.udp_port, |
80 | 85 | dht_prefix=args.dht_prefix, |
|
91 | 96 | ) |
92 | 97 | if gradient_server is not None: |
93 | 98 | gradient_server.status = ServerState.READY |
94 | | - tp_rank_range = range(args.tp_size) |
95 | | - for tp_rank in tp_rank_range: |
| 99 | + |
| 100 | + # For each tp_rank > 0, create a subprocess and run an executor in it
| 101 | + for tp_rank in range(1, args.tp_size): |
96 | 102 | args_copy = argparse.Namespace(**vars(args)) |
97 | 103 | args_copy.tp_rank = tp_rank |
98 | 104 | proc = multiprocessing.Process( |
99 | 105 | target=run_executor_process, |
100 | 106 | args=(args_copy,), |
101 | 107 | ) |
102 | 108 | proc.start() |
103 | | - executor_procs.append(proc) |
104 | | - for executor_process in executor_procs: |
105 | | - executor_process.join() |
| 109 | + executor_subprocs.append(proc) |
| 110 | + # Launch executor with tp_rank=0 in the main process |
| 111 | + args.tp_rank = 0 |
| 112 | + executor = Executor.create_from_args(args) |
| 113 | + executor.run_loop() |
106 | 114 | else: |
107 | 115 | gradient_server = launch_p2p_server( |
108 | 116 | initial_peers=args.initial_peers, |
|
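The spawn loop above hands each args_copy to run_executor_process, whose body is outside this diff. Assuming it mirrors the tp_rank=0 path (Executor.create_from_args followed by run_loop, both visible above), a sketch could look like:

# Hedged sketch only: run_executor_process is defined elsewhere in this
# file's imports and its real body is not shown in this diff.
def run_executor_process(args):
    executor = Executor.create_from_args(args)  # args carries tp_rank > 0
    try:
        executor.run_loop()
    finally:
        executor.shutdown()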
111 | 119 | pp_start_layer=args.start_layer, |
112 | 120 | pp_end_layer=args.end_layer, |
113 | 121 | hidden_layers=None, |
| 122 | + tp_size=args.tp_size, |
114 | 123 | tcp_port=args.tcp_port, |
115 | 124 | udp_port=args.udp_port, |
116 | 125 | dht_prefix=args.dht_prefix, |
|
128 | 137 | args.start_layer = gradient_server.block_start_index |
129 | 138 | args.end_layer = gradient_server.block_end_index |
130 | 139 | args.model_path = gradient_server.model_name |
131 | | - # TODO: Implement inter-process communication to enable TP. |
132 | | - # For scheduler mode, currently only support tp_rank=0 |
133 | | - args.tp_rank = 0 |
| 140 | + args.tp_size = gradient_server.tp_size |
134 | 141 |
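Taken together, the scheduler-mode hunks above and below complete a round trip: the node announces its own tp_size when joining the swarm, then adopts whatever the scheduler assigned. Condensed (attribute names exactly as they appear in this diff; elided arguments unchanged):

gradient_server = launch_p2p_server(
    initial_peers=args.initial_peers,
    hidden_layers=None,       # scheduler mode: layers assigned remotely
    tp_size=args.tp_size,     # announced to the scheduler
    # ...
)
args.start_layer = gradient_server.block_start_index
args.end_layer = gradient_server.block_end_index
args.model_path = gradient_server.model_name
args.tp_size = gradient_server.tp_size  # adopt the scheduler's assignment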
|
135 | 142 | logger.debug( |
136 | 143 | f"Start Executor with start_layer: {args.start_layer}, end_layer: {args.end_layer}" |
|
148 | 155 | # Main execution loop with layer reallocation support |
149 | 156 | while True: |
150 | 157 | try: |
| 158 | + # For each tp_rank > 0, create a subprocess and run an executor in it
| 159 | + for tp_rank in range(1, args.tp_size): |
| 160 | + args_copy = argparse.Namespace(**vars(args)) |
| 161 | + args_copy.tp_rank = tp_rank |
| 162 | + proc = multiprocessing.Process( |
| 163 | + target=run_executor_process, |
| 164 | + args=(args_copy,), |
| 165 | + ) |
| 166 | + proc.start() |
| 167 | + executor_subprocs.append(proc) |
| 168 | + # Launch executor with tp_rank=0 in the main process |
| 169 | + args.tp_rank = 0 |
151 | 170 | executor = Executor.create_from_args(args, gradient_server=gradient_server) |
152 | 171 | if gradient_server is not None: |
153 | 172 | gradient_server.status = ServerState.READY |
|
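The block above repeats the spawn loop from the non-scheduler branch verbatim; a possible follow-up (sketch only, not in this PR) would factor it into a helper both branches share:

import argparse
import multiprocessing

def spawn_tp_subprocesses(args, target):
    """Start one executor subprocess per tp_rank in [1, args.tp_size)."""
    procs = []
    for tp_rank in range(1, args.tp_size):
        args_copy = argparse.Namespace(**vars(args))  # shallow per-rank copy
        args_copy.tp_rank = tp_rank
        proc = multiprocessing.Process(target=target, args=(args_copy,))
        proc.start()
        procs.append(proc)
    return procs

# At either call site:
#   executor_subprocs += spawn_tp_subprocesses(args, run_executor_process)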
159 | 178 | logger.warning( |
160 | 179 | "Layer allocation changed! Reloading executor with new layers..." |
161 | 180 | ) |
| 181 | + |
| 182 | + # Shut down all executor subprocesses concurrently
| 183 | + thread_pool = [] |
| 184 | + for executor_process in executor_subprocs: |
| 185 | + t = threading.Thread( |
| 186 | + target=stop_executor_process, args=(executor_process,) |
| 187 | + ) |
| 188 | + t.start() |
| 189 | + thread_pool.append(t) |
162 | 190 | executor.shutdown() |
| 191 | + for t in thread_pool: |
| 192 | + t.join() |
163 | 193 |
|
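stop_executor_process is also outside this diff; a common shape for such a helper (hypothetical sketch, not the confirmed implementation) is terminate, bounded join, then kill:

import multiprocessing

# Hypothetical sketch; the real stop_executor_process is not in this diff.
def stop_executor_process(proc: multiprocessing.Process, timeout: float = 10.0) -> None:
    if proc.is_alive():
        proc.terminate()   # ask politely (SIGTERM on POSIX)
        proc.join(timeout)
    if proc.is_alive():
        proc.kill()        # escalate (SIGKILL)
        proc.join()

Running each stop on its own thread, as the hunk above does, lets the per-process join timeouts overlap instead of serializing.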
164 | 194 | if args.start_layer == 0: |
165 | 195 | http_server_process = stop_http_server(http_server_process) |
|
210 | 240 | if gradient_server is not None: |
211 | 241 | gradient_server.shutdown() |
212 | 242 |
|
213 | | - # Shutdown executor subprocess for scheduler mode |
214 | | - for executor_process in executor_procs: |
| 243 | + # Shut down executor subprocesses
| 244 | + for executor_process in executor_subprocs: |
215 | 245 | t = threading.Thread(target=stop_executor_process, args=(executor_process,)) |
216 | 246 | t.start() |
217 | 247 | thread_pool.append(t) |
218 | 248 |
|
219 | | - # Shutdown executor main process for non-scheduler mode |
| 249 | + # Shut down the executor in the main process
220 | 250 | if executor is not None: |
221 | 251 | executor.shutdown() |
222 | 252 |
|
|