@@ -728,14 +728,7 @@ def _allocate_decode_and_extend():
         if scheduled_reqs:
             llm_logger.debug(f"schedued_reqs: {scheduled_reqs}")
 
-        # Update metrics
-        num_tasks = sum([1 if task else 0 for task in self.tasks_list])
-        num_blocks_used_by_tasks = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
-        main_process_metrics.available_gpu_block_num.set(self.total_block_number() - num_blocks_used_by_tasks)
-        main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
-        main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
-        main_process_metrics.num_requests_running.set(len(self.running))
-        main_process_metrics.num_requests_waiting.set(num_tasks - len(self.running))
+        self.update_metrics()
 
         return scheduled_reqs
 
@@ -962,7 +955,10 @@ def finish_requests(self, request_ids: Union[str, Iterable[str]]):
                 if request in self.running:  # normally run and finished
                     self.running.remove(request)
                     request.status = RequestStatus.FINISHED
-                    self._free_blocks(request)
+                    try:
+                        self._free_blocks(request)
+                    except Exception as e:
+                        llm_logger.warning(f"release block failed {req_id}: {e}")
                 if (
                     request.request_id in self.to_be_rescheduled_request_id_set
                 ):  # finished after preempted, blocks have been recycled.
@@ -981,7 +977,19 @@ def finish_requests(self, request_ids: Union[str, Iterable[str]]):
                     del self.req_dict[req_id]
         except Exception as e:
             llm_logger.error(f"finish_request err: {e}, {str(traceback.format_exc())}")
+        finally:
+            self.update_metrics()
 
     def clear_data(self):
         self.waiting: deque[Request] = deque()
         self.to_be_rescheduled_request_id_set = set()
+
+    def update_metrics(self):
+        # Update metrics
+        num_tasks = sum([1 if task else 0 for task in self.tasks_list])
+        num_blocks_used_by_tasks = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
+        main_process_metrics.available_gpu_block_num.set(self.total_block_number() - num_blocks_used_by_tasks)
+        main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
+        main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
+        main_process_metrics.num_requests_running.set(len(self.running))
+        main_process_metrics.num_requests_waiting.set(num_tasks - len(self.running))
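For context, a minimal standalone sketch of the pattern this change adopts: all gauge updates are funneled through a single update_metrics() helper, and the finish path calls it from a finally block so the counters stay fresh even when block release raises. The Gauge and Scheduler classes below are illustrative placeholders, not the actual main_process_metrics objects or resource manager in this repository.

import logging

logger = logging.getLogger(__name__)


class Gauge:
    """Placeholder for a Prometheus-style gauge."""

    def __init__(self):
        self.value = 0

    def set(self, value):
        self.value = value


class Scheduler:
    """Toy scheduler illustrating the consolidated-metrics pattern."""

    def __init__(self, max_num_seqs):
        self.max_num_seqs = max_num_seqs
        self.running = []
        self.tasks_list = []
        self.num_requests_running = Gauge()
        self.num_requests_waiting = Gauge()

    def _free_blocks(self, request):
        # Stand-in for real block recycling, which may raise.
        pass

    def update_metrics(self):
        # Single place that recomputes every occupancy gauge.
        num_tasks = sum(1 for task in self.tasks_list if task)
        self.num_requests_running.set(len(self.running))
        self.num_requests_waiting.set(num_tasks - len(self.running))

    def finish_request(self, request):
        try:
            if request in self.running:
                self.running.remove(request)
                try:
                    self._free_blocks(request)
                except Exception as e:
                    logger.warning(f"release block failed {request}: {e}")
        finally:
            # Gauges are refreshed even if cleanup above raised.
            self.update_metrics()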