diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index 74e3db9120..9a1bb0151d 100644 --- a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -1,8 +1,68 @@ +diff --git a/berkeley-function-call-leaderboard/bfcl_eval/__main__.py b/berkeley-function-call-leaderboard/bfcl_eval/__main__.py +index 3832e7e..61901b2 100644 +--- a/berkeley-function-call-leaderboard/bfcl_eval/__main__.py ++++ b/berkeley-function-call-leaderboard/bfcl_eval/__main__.py +@@ -118,6 +118,11 @@ def generate( + "--exclude-state-log", + help="Exclude info about the state of each API system after each turn in the inference log; only relevant for multi-turn categories.", + ), ++ include_verbose_log: bool = typer.Option( ++ False, ++ "--include-verbose-log", ++ help="Include the __verbose field from model server responses (e.g. OVMS) in the result output; useful for debugging generation settings, prompts, and timings.", ++ ), + num_gpus: int = typer.Option(1, help="The number of GPUs to use."), + num_threads: Optional[int] = typer.Option(None, help="The number of threads to use."), + gpu_memory_utilization: float = typer.Option(0.9, help="The GPU memory utilization."), +@@ -159,6 +164,7 @@ def generate( + temperature=temperature, + include_input_log=include_input_log, + exclude_state_log=exclude_state_log, ++ include_verbose_log=include_verbose_log, + num_gpus=num_gpus, + num_threads=num_threads, + gpu_memory_utilization=gpu_memory_utilization, +diff --git a/berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py b/berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py +index c9cbe09..6504eb1 100644 +--- a/berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py ++++ b/berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py +@@ -165,13 +165,13 @@ def collect_test_cases(args, model_name, all_test_categories, all_test_entries_i + return sorted(test_cases_to_generate, key=sort_key) + + +-def multi_threaded_inference(handler, test_case, include_input_log, exclude_state_log): ++def multi_threaded_inference(handler, test_case, include_input_log, exclude_state_log, include_verbose_log=False): + + assert type(test_case["function"]) is list + + try: + result, metadata = handler.inference( +- test_case, include_input_log, exclude_state_log ++ test_case, include_input_log, exclude_state_log, include_verbose_log + ) + except Exception as e: + # This is usually the case when the model getting stuck on one particular test case. +@@ -284,6 +284,7 @@ def generate_results(args, model_name, test_cases_total): + test_case, + args.include_input_log, + args.exclude_state_log, ++ getattr(args, "include_verbose_log", False), + ) + in_flight[future] = test_case_id + +@@ -320,6 +321,7 @@ def generate_results(args, model_name, test_cases_total): + test_case, + args.include_input_log, + args.exclude_state_log, ++ getattr(args, "include_verbose_log", False), + ) + in_flight[future] = test_case_id + diff --git a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -index bb625d2..3ab2856 100644 +index bb625d2..7204adb 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -@@ -2153,6 +2153,42 @@ third_party_inference_model_map = { +@@ -2153,6 +2153,30 @@ third_party_inference_model_map = { is_fc_model=True, underscore_to_dot=True, ), @@ -29,18 +89,6 @@ index bb625d2..3ab2856 100644 + output_price=None, + is_fc_model=True, + underscore_to_dot=True, -+ ), -+ "ovms-model-responses": ModelConfig( -+ model_name="ovms-model-responses", -+ display_name="ovms-model-responses", -+ url="http://localhost:8000/v3", -+ org="ovms", -+ license="apache-2.0", -+ model_handler=OpenAIResponsesHandler, -+ input_price=None, -+ output_price=None, -+ is_fc_model=True, -+ underscore_to_dot=True, + ), } @@ -72,50 +120,18 @@ index 357584f..e45e12c 100644 "store": False, } -diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py -index 0953fdd..fffcc6c 100644 ---- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py -+++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py -@@ -38,10 +38,10 @@ class OpenAIResponsesHandler(BaseHandler): - - kwargs = {} - -- if api_key := os.getenv("OPENAI_API_KEY"): -+ if api_key := os.getenv("OPENAI_API_KEY","unused"): - kwargs["api_key"] = api_key - -- if base_url := os.getenv("OPENAI_BASE_URL"): -+ if base_url := os.getenv("OPENAI_BASE_URL","http://localhost:8000/v3"): - kwargs["base_url"] = base_url - - if headers_env := os.getenv("OPENAI_DEFAULT_HEADERS"): -@@ -103,6 +103,9 @@ class OpenAIResponsesHandler(BaseHandler): - "include": ["reasoning.encrypted_content"], - "reasoning": {"summary": "auto"}, - "temperature": self.temperature, -+ "max_output_tokens": 2048, -+ "tool_choice": os.getenv("TOOL_CHOICE", "auto"), -+ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, - } - - # OpenAI reasoning models don't support temperature parameter -@@ -222,6 +225,7 @@ class OpenAIResponsesHandler(BaseHandler): - "include": ["reasoning.encrypted_content"], - "reasoning": {"summary": "auto"}, - "temperature": self.temperature, -+ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, - } - - # OpenAI reasoning models don't support temperature parameter diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py -index 10f1a08..b67d39c 100644 +index 10f1a08..50890c7 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py -@@ -1,3 +1,4 @@ +@@ -7,6 +7,7 @@ from openai import OpenAI + from overrides import override + from qwen_agent.llm import get_chat_model + import time +import json - import os - from typing import Any + class QwenAPIHandler(OpenAICompletionsHandler): + """ @@ -28,8 +29,8 @@ class QwenAPIHandler(OpenAICompletionsHandler): super().__init__(model_name, temperature, registry_name, is_fc_model, **kwargs) self.model_style = ModelStyle.OPENAI_COMPLETIONS @@ -127,7 +143,7 @@ index 10f1a08..b67d39c 100644 ) #### FC methods #### -@@ -45,9 +46,10 @@ class QwenAPIHandler(OpenAICompletionsHandler): +@@ -45,9 +46,9 @@ class QwenAPIHandler(OpenAICompletionsHandler): model=self.model_name.replace("-FC", ""), tools=tools, parallel_tool_calls=True, @@ -137,14 +153,214 @@ index 10f1a08..b67d39c 100644 + max_completion_tokens=2048, + tool_choice=os.getenv("TOOL_CHOICE", "auto"), + extra_body={"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, -+ temperature=self.temperature, stream=True, stream_options={ "include_usage": True -@@ -352,4 +354,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler): +@@ -352,4 +353,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler): 'timeout': 1000, 'max_tokens': 16384 } - }) \ No newline at end of file + }) +diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/base_handler.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/base_handler.py +index a1025e9..fed8c99 100644 +--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/base_handler.py ++++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/base_handler.py +@@ -70,6 +70,7 @@ class BaseHandler: + test_entry: dict, + include_input_log: bool, + exclude_state_log: bool, ++ include_verbose_log: bool = False, + ): + # This method is used to retrive model response for each model. + +@@ -78,18 +79,18 @@ class BaseHandler: + if "FC" in self.registry_name or self.is_fc_model: + if contain_multi_turn_interaction(test_entry["id"]): + return self.inference_multi_turn_FC( +- test_entry, include_input_log, exclude_state_log ++ test_entry, include_input_log, exclude_state_log, include_verbose_log + ) + else: +- return self.inference_single_turn_FC(test_entry, include_input_log) ++ return self.inference_single_turn_FC(test_entry, include_input_log, include_verbose_log) + # Prompting model + else: + if contain_multi_turn_interaction(test_entry["id"]): + return self.inference_multi_turn_prompting( +- test_entry, include_input_log, exclude_state_log ++ test_entry, include_input_log, exclude_state_log, include_verbose_log + ) + else: +- return self.inference_single_turn_prompting(test_entry, include_input_log) ++ return self.inference_single_turn_prompting(test_entry, include_input_log, include_verbose_log) + + @final + def inference_multi_turn_FC( +@@ -97,6 +98,7 @@ class BaseHandler: + test_entry: dict, + include_input_log: bool, + exclude_state_log: bool, ++ include_verbose_log: bool = False, + ) -> tuple[list[list], dict]: + initial_config: dict = test_entry.get("initial_config", {}) + involved_classes: list = test_entry["involved_classes"] +@@ -119,6 +121,7 @@ class BaseHandler: + force_quit = False # Whether the model has been forced to quit. If True, this whole entry will be failed. + + all_reasoning_content: list[list] = [] ++ all_verbose_log: list[list[dict]] = [] + + # Execute no function call, but just to get a reference to all the instances to get the initial state for logging purpose + _, involved_instances = execute_multi_turn_func_call( +@@ -206,6 +209,7 @@ class BaseHandler: + current_turn_output_token_count: list[float] = [] + current_turn_latency: list[float] = [] + current_turn_reasoning_content = [] ++ current_turn_verbose_log: list[dict] = [] + + count = 0 + while True: +@@ -219,6 +223,11 @@ class BaseHandler: + + api_response, query_latency = self._query_FC(inference_data) + ++ if include_verbose_log and hasattr(api_response, "model_extra") and api_response.model_extra: ++ verbose_data = api_response.model_extra.get("__verbose") ++ if verbose_data: ++ current_turn_verbose_log.append({k: verbose_data[k] for k in ("prompt", "content") if k in verbose_data}) ++ + # This part of logging is disabled by default because it is too verbose and will make the result file extremely large + # It is only useful to see if the inference pipeline is working as expected (eg, does it convert all the inputs correctly) + if include_input_log: +@@ -335,6 +344,7 @@ class BaseHandler: + all_model_response.append(current_turn_response) + all_inference_log.append(current_turn_inference_log) + all_reasoning_content.append(current_turn_reasoning_content) ++ all_verbose_log.append(current_turn_verbose_log) + total_input_token_count.append(current_turn_input_token_count) + total_output_token_count.append(current_turn_output_token_count) + total_latency.append(current_turn_latency) +@@ -388,6 +398,9 @@ class BaseHandler: + ): + metadata["reasoning_content"] = all_reasoning_content + ++ if include_verbose_log and any(turn_log for turn_log in all_verbose_log): ++ metadata["__verbose"] = all_verbose_log ++ + return all_model_response, metadata + + @final +@@ -396,6 +409,7 @@ class BaseHandler: + test_entry: dict, + include_input_log: bool, + exclude_state_log: bool, ++ include_verbose_log: bool = False, + ) -> tuple[list[list], dict]: + initial_config: dict = test_entry.get("initial_config", {}) + involved_classes: list = test_entry["involved_classes"] +@@ -415,6 +429,7 @@ class BaseHandler: + all_reasoning_content: list[list] = [] + # The debugging log for human to understand + all_inference_log: list[list[dict]] = [] ++ all_verbose_log: list[list[dict]] = [] + force_quit = False # Whether the model has been forced to quit. If True, this whole entry will be failed. + + # Execute no function call, but just to get a reference to all the instances to get the initial state for logging purpose +@@ -498,6 +513,7 @@ class BaseHandler: + current_turn_input_token_count: list[float] = [] + current_turn_output_token_count: list[float] = [] + current_turn_latency: list[float] = [] ++ current_turn_verbose_log: list[dict] = [] + + count = 0 + while True: +@@ -511,6 +527,11 @@ class BaseHandler: + + api_response, query_latency = self._query_prompting(inference_data) + ++ if include_verbose_log and hasattr(api_response, "model_extra") and api_response.model_extra: ++ verbose_data = api_response.model_extra.get("__verbose") ++ if verbose_data: ++ current_turn_verbose_log.append({k: verbose_data[k] for k in ("prompt", "content") if k in verbose_data}) ++ + # This part of logging is disabled by default because it is too verbose and will make the result file extremely large + # It is only useful to see if the inference pipeline is working as expected (eg, does it convert all the inputs correctly) + if include_input_log: +@@ -626,6 +647,7 @@ class BaseHandler: + all_model_response.append(current_turn_response) + all_reasoning_content.append(current_turn_reasoning_content) + all_inference_log.append(current_turn_inference_log) ++ all_verbose_log.append(current_turn_verbose_log) + total_input_token_count.append(current_turn_input_token_count) + total_output_token_count.append(current_turn_output_token_count) + total_latency.append(current_turn_latency) +@@ -679,11 +701,14 @@ class BaseHandler: + ): + metadata["reasoning_content"] = all_reasoning_content + ++ if include_verbose_log and any(turn_log for turn_log in all_verbose_log): ++ metadata["__verbose"] = all_verbose_log ++ + return all_model_response, metadata + + @final + def inference_single_turn_FC( +- self, test_entry: dict, include_input_log: bool ++ self, test_entry: dict, include_input_log: bool, include_verbose_log: bool = False + ) -> tuple[any, dict]: + inference_data: dict = {} + inference_data = self._pre_query_processing_FC(inference_data, test_entry) +@@ -716,11 +741,16 @@ class BaseHandler: + ): + metadata["reasoning_content"] = model_response_data["reasoning_content"] + ++ if include_verbose_log and hasattr(api_response, "model_extra") and api_response.model_extra: ++ verbose_data = api_response.model_extra.get("__verbose") ++ if verbose_data: ++ metadata["__verbose"] = {k: verbose_data[k] for k in ("prompt", "content") if k in verbose_data} ++ + return model_response_data["model_responses"], metadata + + @final + def inference_single_turn_prompting( +- self, test_entry: dict, include_input_log: bool ++ self, test_entry: dict, include_input_log: bool, include_verbose_log: bool = False + ) -> tuple[any, dict]: + inference_data: dict = self._pre_query_processing_prompting(test_entry) + inference_data = self.add_first_turn_message_prompting( +@@ -751,6 +781,11 @@ class BaseHandler: + ): + metadata["reasoning_content"] = model_response_data["reasoning_content"] + ++ if include_verbose_log and hasattr(api_response, "model_extra") and api_response.model_extra: ++ verbose_data = api_response.model_extra.get("__verbose") ++ if verbose_data: ++ metadata["__verbose"] = {k: verbose_data[k] for k in ("prompt", "content") if k in verbose_data} ++ + return model_response_data["model_responses"], metadata + + def decode_ast(self, result, language: ReturnFormat, has_tool_call_tag: bool): +diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py +index 961d9bf..6b6504c 100644 +--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py ++++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py +@@ -51,14 +51,15 @@ class OSSHandler(BaseHandler, EnforceOverrides): + test_entry: dict, + include_input_log: bool, + exclude_state_log: bool, ++ include_verbose_log: bool = False, + ): + # TODO: Let oss model support FC methods as well, depends on their model type + if contain_multi_turn_interaction(test_entry["id"]): + return self.inference_multi_turn_prompting( +- test_entry, include_input_log, exclude_state_log ++ test_entry, include_input_log, exclude_state_log, include_verbose_log + ) + else: +- return self.inference_single_turn_prompting(test_entry, include_input_log) ++ return self.inference_single_turn_prompting(test_entry, include_input_log, include_verbose_log) + + @override + def decode_ast(self, result, language, has_tool_call_tag):