[Engine] Revert TTFT optimize (#6680) and add EP batched token scheduler #7791

Changes from all commits
```diff
@@ -392,15 +392,6 @@ def _init_worker_monitor_signals(self):  # exist_task_signal is used by each worker…
             create=True,
         )
 
-        engine_forward_signal_data = np.zeros([1], dtype=np.int32)
-        self.engine_forward_signal = IPCSignal(
-            name="engine_forward_signal",
-            array=engine_forward_signal_data,
-            dtype=np.int32,
-            suffix=current_suffix,
-            create=True,
-        )
-
         # worker_live_signal lets the engine detect whether each worker process is alive; records each step's time
         worker_healthy_live_recorded_time_array = np.zeros(
             shape=[min(self.cfg.worker_num_per_node, self.cfg.parallel_config.tensor_parallel_size)], dtype=np.int32
```
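The removed `engine_forward_signal` is a one-element int32 flag shared across processes. For context, here is a minimal stdlib sketch of that pattern; `create_flag` and `attach_flag` are illustrative names only, not FastDeploy's `IPCSignal` API:

```python
import numpy as np
from multiprocessing import shared_memory

def create_flag(name: str):
    """Create a one-element int32 flag in named shared memory (engine side)."""
    shm = shared_memory.SharedMemory(name=name, create=True, size=np.dtype(np.int32).itemsize)
    flag = np.ndarray((1,), dtype=np.int32, buffer=shm.buf)
    flag[0] = 0
    return shm, flag  # keep shm referenced for as long as flag is used

def attach_flag(name: str):
    """Attach to an existing flag from a worker process."""
    shm = shared_memory.SharedMemory(name=name, create=False)
    return shm, np.ndarray((1,), dtype=np.int32, buffer=shm.buf)
```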
```diff
@@ -1091,29 +1082,26 @@ def _fetch_request():
                     with self._pause_cond:
                         self._pause_cond.wait_for(lambda: not self.is_paused)
                 try:
-                    if not is_fetching:
-                        # Check if the thread pool is still available to avoid submitting tasks to a shutdown thread pool.
-                        try:
+                    if self.engine_worker_queue.exist_tasks():
+                        time.sleep(0.001)
+                        continue
+                    if self.cfg.scheduler_config.splitwise_role != "mixed":
+                        if not is_fetching:
                             is_fetching = True
                             get_request_pool.submit(_fetch_request)
```
Review comment (marked as outdated):

🔴 Bug: under non-"mixed" mode, `get_request_pool.submit(_fetch_request)` is called without the mixed-mode path's RuntimeError guard; if the thread pool has already been shut down, the exception propagates unhandled. Suggested fix:

```python
if self.cfg.scheduler_config.splitwise_role != "mixed":
    if not is_fetching:
        try:
            is_fetching = True
            get_request_pool.submit(_fetch_request)
        except RuntimeError as e:
            if "shutdown" in str(e):
                self.llm_logger.info("Thread pool shutdown detected, exiting scheduler loop")
                break
            else:
                raise
```
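For context, a self-contained sketch of the failure mode the guard targets: CPython's `ThreadPoolExecutor` raises `RuntimeError("cannot schedule new futures after shutdown")` when `submit()` is called after `shutdown()`, so the scheduler loop must catch it and exit cleanly:

```python
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=1)
pool.shutdown(wait=True)

try:
    pool.submit(print, "task")  # raises: pool is already shut down
except RuntimeError as e:
    if "shutdown" in str(e):
        print("Thread pool shutdown detected, exiting scheduler loop")
    else:
        raise
```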
```diff
-                        except RuntimeError as e:
-                            if "shutdown" in str(e):
-                                self.llm_logger.info("Thread pool shutdown detected, exiting scheduler loop")
-                                break
-                            else:
-                                raise
-                    if self.cfg.scheduler_config.splitwise_role != "mixed":
-                        # Continue preprocessing incoming requests and accumulating them in the queue when forward pass not finished.
-                        # Once the forward pass finishes, these accumulated requests can be scheduled in larger,
-                        # more efficient batches.
-                        if self.engine_worker_queue.exist_tasks() or self.engine_forward_signal.value[0] != 0:
-                            time.sleep(0.001)
-                            continue
 
                     else:
-                        # In mixed, todo: optimze cache swap, to decouple swap from scheduler
-                        if self.engine_worker_queue.exist_tasks():
-                            time.sleep(0.001)
-                            continue
+                        if len(self.resource_manager.waiting) == 0 and (not is_fetching):
+                            # Check if the thread pool is still available to avoid submitting tasks to a shutdown thread pool.
+                            try:
+                                is_fetching = True
+                                get_request_pool.submit(_fetch_request)
+                            except RuntimeError as e:
+                                if "shutdown" in str(e):
+                                    self.llm_logger.info("Thread pool shutdown detected, exiting scheduler loop")
+                                    break
+                                else:
+                                    raise
 
                 if hasattr(self.resource_manager, "scheduler_unhandled_request_num"):
                     self.resource_manager.scheduler_unhandled_request_num = self._get_scheduler_unhandled_request_num()
```
```diff
@@ -1178,13 +1166,6 @@ def _fetch_request():
                     elif not task.has_been_preempted_before:
                         task.metrics.inference_start_time = time.time()
                 self.engine_worker_queue.put_tasks((batch_request, self.resource_manager.real_bsz))
-            else:
-                # When there are no actual tasks to schedule, send an empty task batch to EP workers.
-                # This helps EP workers barrier for syncing tasks not hang.
-                if self.cfg.parallel_config.enable_expert_parallel:
-                    self.engine_worker_queue.put_tasks(
-                        (batch_request, self.resource_manager.real_bsz)
-                    )  # Empty (as idle tasks for ep)
 
             # 4. Response error tasks
             if error_tasks:
```
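The removed branch's comment explains why it existed: expert-parallel workers block on a barrier when syncing tasks, so an idle rank that never joins the sync point stalls its peers, and an empty batch was the scheduler's way of making idle workers participate. A toy, threading-based sketch of that property (illustrative only, not FastDeploy code):

```python
import threading

barrier = threading.Barrier(parties=2, timeout=1.0)

def worker(batch):
    # Even an empty batch reaches the barrier, so peers are released.
    barrier.wait()
    if batch:
        print(f"processing {batch}")

t1 = threading.Thread(target=worker, args=([1, 2, 3],))
t2 = threading.Thread(target=worker, args=([],))  # idle worker still syncs
t1.start(); t2.start()
t1.join(); t2.join()
```

If the idle thread skipped `barrier.wait()`, the busy one would block until the barrier timeout, which is exactly the hang the empty task batch prevented.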
```diff
@@ -24,7 +24,7 @@
 from fastdeploy.logger.request_logger import RequestLogLevel, log_request
 from fastdeploy.scheduler.data import ScheduledResponse
 from fastdeploy.scheduler.local_scheduler import LocalScheduler
-from fastdeploy.utils import get_logger
+from fastdeploy.utils import envs, get_logger
 
 
 class DPLocalScheduler(LocalScheduler):
```
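The newly imported `envs` module supplies `FD_EP_BATCHED_TOKEN_TIMEOUT`, the batching window used in the hunk below. As a hedged sketch of how such an env-backed knob is typically defined (the real definition lives in FastDeploy's `envs` module, and its default may differ):

```python
import os

# Hypothetical definition: how long (in seconds) get_requests keeps polling to
# grow a batch before returning whatever it has collected.
FD_EP_BATCHED_TOKEN_TIMEOUT = float(os.getenv("FD_EP_BATCHED_TOKEN_TIMEOUT", "0.1"))
```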
```diff
@@ -136,19 +136,52 @@ def get_requests(
         Returns:
             List of Request objects ready for processing
         """
-        # DP scheduler is used in V1, there is no need to manage request fetching in the scheduler, resource_manager_v1 will do that.
         if available_blocks <= reserved_output_blocks or batch < 1:
             self.scheduler_logger.debug(
                 f"Scheduler's resource are insufficient: available_blocks={available_blocks} "
                 f"reserved_output_blocks={reserved_output_blocks} batch={batch} "
                 f"max_num_batched_tokens={max_num_batched_tokens}"
             )
             return []
+        required_total_blocks = 0
+        current_prefill_tokens = 0
+        start_batch_time = time.time()
         requests: List[Request] = []
 
         with self.requests_not_empty:
-            batch_ids = self.requests_not_empty.wait_for(
-                lambda: self.ids[self.ids_read_cursor : self.ids_read_cursor + 1],
-                0.005,
-            )
-            if batch_ids:
-                for request_id in batch_ids:
-                    request = self.requests[request_id]
-                    requests.append(request.raw)
-                    self.ids_read_cursor += 1
+            while True:
+                batch_ids = self.requests_not_empty.wait_for(
+                    lambda: self.ids[self.ids_read_cursor : self.ids_read_cursor + batch],
+                    0.005,
+                )
+                if batch_ids:
+                    for request_id in batch_ids:
+                        request = self.requests[request_id]
+                        required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size)
+                        current_prefill_tokens += request.prompt_tokens_ids_len
```
Review comment:

🔴 Bug: by the next iteration of the outer `while` loop, `current_prefill_tokens` and `required_total_blocks` already include the request that was just rejected, because the totals are accumulated before the capacity check. Check first, and accumulate only after the check passes, instead of accumulating and then breaking:

```python
if required_total_blocks + required_input_blocks + reserved_output_blocks > available_blocks:
    break  # check first; accumulate only after the request fits
current_prefill_tokens += request.prompt_tokens_ids_len
required_total_blocks += required_input_blocks + reserved_output_blocks
```
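A runnable sketch of the reviewer's point, with toy types (`Req` and `try_admit` are hypothetical names, not part of FastDeploy): test the budget before mutating the running totals, so a rejected request leaves no residue for later iterations.

```python
from dataclasses import dataclass

@dataclass
class Req:
    prompt_tokens_ids_len: int

def try_admit(req: Req, state: dict, available_blocks: int,
              reserved_output_blocks: int, block_size: int) -> bool:
    required_input_blocks = -(-req.prompt_tokens_ids_len // block_size)  # ceil division
    needed = state["required_total_blocks"] + required_input_blocks + reserved_output_blocks
    if needed > available_blocks:
        return False  # reject; totals stay untouched for the next iteration
    state["required_total_blocks"] = needed
    state["current_prefill_tokens"] += req.prompt_tokens_ids_len
    return True

state = {"required_total_blocks": 0, "current_prefill_tokens": 0}
for req in [Req(100), Req(5000)]:
    if not try_admit(req, state, available_blocks=40,
                     reserved_output_blocks=2, block_size=64):
        break  # the oversized request was rejected without inflating the totals
```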
```diff
+                        required_total_blocks += required_input_blocks + reserved_output_blocks
+                        if required_total_blocks > available_blocks:
+                            break
+                        requests.append(request.raw)
+                        self.ids_read_cursor += 1
+                        start_batch_time = time.time()
+                        if current_prefill_tokens > max_num_batched_tokens:
+                            break
+                        if len(requests) >= batch:
+                            break
+                if (
+                    (current_prefill_tokens > max_num_batched_tokens)
+                    or (len(requests) >= batch)
+                    or (time.time() - start_batch_time > envs.FD_EP_BATCHED_TOKEN_TIMEOUT)
+                ):
+                    break
 
-        if batch_ids:
+        if len(batch_ids) > 0 and len(requests) == 0:
             self.scheduler_logger.debug(
                 f"Scheduler has put all just-pulled request into the queue: {len(batch_ids)}"
```
Review comment:

❓ Question: the log message says the opposite of what its condition means. This branch fires when IDs were pulled but no request could be scheduled (`len(requests) == 0`), yet the message reads as if all just-pulled requests were enqueued. Suggested change:

```python
self.scheduler_logger.debug(
    f"No requests could be scheduled due to insufficient resources, pending={len(batch_ids)}"
)
```
```diff
             )
 
         if len(requests) > 0:
             log_request(
```