 from dataflow.serving import LocalModelLLMServing_vllm
 from dataflow.utils.storage import FileStorage
 from dataflow.operators.core_text import BenchDatasetEvaluatorQuestion
+from dataflow.cli_funcs.cli_eval import run_evaluation

 # =============================================================================
 # Fair Evaluation Prompt Template
@@ -15,7 +16,6 @@ class FairAnswerJudgePrompt:

     def build_prompt(self, question, answer, reference_answer):
         prompt = f"""You are an expert evaluator assessing answer quality for academic questions.
-
 **Question:**
 {question}

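
(For orientation: a minimal sketch of how this prompt class is presumably used. The return value of build_prompt is assumed to be the filled-in template, and the question/answer strings are purely illustrative.)

    judge_prompt = FairAnswerJudgePrompt()
    prompt_text = judge_prompt.build_prompt(
        question="What is 2 + 2?",
        answer="4",
        reference_answer="4",
    )
    # prompt_text is what the judge model configured below would score
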
@@ -54,78 +54,77 @@ def build_prompt(self, question, answer, reference_answer):
 # Judge Model Configuration (local strong model as judge)
 JUDGE_MODEL_CONFIG = {
     "model_path": "./Qwen2.5-7B-Instruct",  # use a stronger model as the judge
-    "hf_cache_dir": "",
-    "hf_local_dir": "",
-    "vllm_tensor_parallel_size": 2,
-    "vllm_temperature": 0.9,
-    "vllm_top_p": 0.9,
+    "hf_cache_dir": "",
+    "hf_local_dir": "",
+    "vllm_tensor_parallel_size": 1,
+    "vllm_temperature": 0.9,
+    "vllm_top_p": 0.9,
     "vllm_max_tokens": 512,
-    "vllm_repetition_penalty": 1.0,
-    "vllm_seed": None,
-    "vllm_max_model_len": None,
-    "vllm_gpu_memory_utilization": 0.9
+    "vllm_repetition_penalty": 0.8,
+    "vllm_seed": None,
+    "vllm_max_model_len": None,
+    "vllm_gpu_memory_utilization": 0.9
 }

 # Target Models Configuration (dict format - required)
 TARGET_MODELS = [
-    # {
-    #     "name": "qwen_3b",               # model name (optional; defaults to the last path component)
-    #     "path": "./Qwen2.5-3B-Instruct", # model path (required)
+    {
+        "name": "qwen_3b",                # model name (optional; defaults to the last path component)
+        "path": "./Qwen2.5-3B-Instruct",  # model path (required)

-    #     # ===== model-loading parameters for answer generation (optional) =====
-    #     "tensor_parallel_size": 1,       # number of parallel GPUs
-    #     "max_tokens": 1024,              # maximum number of generated tokens
-    #     "gpu_memory_utilization": 0.8,   # GPU memory utilization
-    # },
+        # each model can use its own parameters
+        "vllm_tensor_parallel_size": 1,
+        "vllm_temperature": 0.1,
+        "vllm_top_p": 0.9,
+        "vllm_max_tokens": 2048,
+        "vllm_repetition_penalty": 1.0,
+        "vllm_seed": None,
+        "vllm_gpu_memory_utilization": 0.9,
+    },
     {
         "name": "qwen_7b",
         "path": "./Qwen2.5-7B-Instruct",
         # larger models can use different parameters
-        "vllm_tensor_parallel_size": 2,
-        "vllm_temperature": 0.1,
-        "vllm_top_p": 0.9,
+        "vllm_tensor_parallel_size": 1,
+        "vllm_temperature": 0.1,
+        "vllm_top_p": 0.9,
         "vllm_max_tokens": 2048,
-        "vllm_repetition_penalty": 1.0,
-        "vllm_seed": None,
+        "vllm_repetition_penalty": 1.0,
+        "vllm_seed": None,
         "vllm_gpu_memory_utilization": 0.9,

         # a custom answer prompt can be set per model
-        "answer_prompt": """please answer the following question:"""
+        "answer_prompt": """Please answer the following question based on the provided information. Put your answer in \\boxed{{}}.
+
+For example, if the answer is B, output: \\boxed{{B}}"""

     },

-    # add more models...
-    # {
-    #     "name": "llama_8b",
-    #     "path": "meta-llama/Llama-3-8B-Instruct",
-    #     "tensor_parallel_size": 2
-    # }
 ]

-# Data Configuration
+# Bench Configuration
 BENCH_CONFIG = [
     {
-        "name": "math_bench",                        # bench name
+        "name": "bench_name",                        # bench name
         "input_file": "./.cache/data/qa.json",       # data file
         "question_key": "input",                     # question field name
         "reference_answer_key": "output",            # answer field name
-        "output_dir": "./eval_results/math_bench",   # output directory
+        "output_dir": "./eval_results/bench_name",   # output directory
+    },
+    {
+        "name": "bench_name_1",
+        "input_file": "./.cache/data/data_qa.json",
+        "question_key": "input",
+        "reference_answer_key": "output",
+        "output_dir": "./eval_results/bench_name_1",
     },
-    # {
-    #     "name": "reasoning_bench",
-    #     "input_file": "./.cache/data/reasoning_qa.jsonl",
-    #     "question_key": "input",
-    #     "reference_answer_key": "output",
-    #     "output_dir": "./eval_results/reasoning_bench",
-    # },
     # {
     #     "name": "code_bench",
     #     "input_file": "./.cache/data/code_qa.json",
     #     "question_key": "problem",
     #     "reference_answer_key": "solution",
     #     "output_dir": "./eval_results/code_bench",
     # },
-
 ]

 # Evaluator Run Configuration (parameters passed to BenchDatasetEvaluator.run)
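
(The bench entries above only declare which fields to read. A sketch of writing a ./.cache/data/qa.json file in the assumed layout, with field names taken from question_key / reference_answer_key and records that are purely illustrative:)

    import json

    sample_records = [
        {"input": "What is the capital of France?", "output": "Paris"},
        {"input": "Compute 3 * 7.", "output": "21"},
    ]
    with open("./.cache/data/qa.json", "w", encoding="utf-8") as f:
        json.dump(sample_records, f, ensure_ascii=False, indent=2)
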
@@ -164,15 +163,15 @@ def create_judge_serving():
     # Enhanced VLLM configuration
     vllm_config = {
         "hf_model_name_or_path": model_path,
-        "hf_cache_dir": JUDGE_MODEL_CONFIG.get("hf_cache_dir", None),
-        "hf_local_dir": JUDGE_MODEL_CONFIG.get("hf_local_dir", None),
+        "hf_cache_dir": JUDGE_MODEL_CONFIG.get("hf_cache_dir"),
+        "hf_local_dir": JUDGE_MODEL_CONFIG.get("hf_local_dir"),
         "vllm_tensor_parallel_size": JUDGE_MODEL_CONFIG.get("vllm_tensor_parallel_size", 1),
-        "vllm_temperature": JUDGE_MODEL_CONFIG.get("vllm_temperature",0.9),
-        "vllm_top_p": JUDGE_MODEL_CONFIG.get("vllm_top_p",0.9),
+        "vllm_temperature": JUDGE_MODEL_CONFIG.get("vllm_temperature", 0.9),
+        "vllm_top_p": JUDGE_MODEL_CONFIG.get("vllm_top_p", 0.9),
         "vllm_max_tokens": JUDGE_MODEL_CONFIG.get("max_tokens", 512),
-        "vllm_repetition_penalty": JUDGE_MODEL_CONFIG.get("vllm_repetition_penalty", 1.0),
-        "vllm_seed": JUDGE_MODEL_CONFIG.get("vllm_seed",None),
-        "vllm_max_model_len": JUDGE_MODEL_CONFIG.get("vllm_max_model_len",None),
+        "vllm_repetition_penalty": JUDGE_MODEL_CONFIG.get("vllm_repetition_penalty", 1.0),
+        "vllm_seed": JUDGE_MODEL_CONFIG.get("vllm_seed", None),
+        "vllm_max_model_len": JUDGE_MODEL_CONFIG.get("vllm_max_model_len", None),
         "vllm_gpu_memory_utilization": JUDGE_MODEL_CONFIG.get("gpu_memory_utilization", 0.9)
     }

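
(Since the vllm_* fields are read with .get() and a default, a partial JUDGE_MODEL_CONFIG is enough. A sketch of overriding a couple of judge settings before building the serving object; the values are illustrative:)

    JUDGE_MODEL_CONFIG.update({
        "vllm_tensor_parallel_size": 1,  # single-GPU judge
        "vllm_max_tokens": 512,
    })
    judge_serving = create_judge_serving()  # picks the overrides up via .get()
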
@@ -195,12 +194,12 @@ def create_evaluator(judge_serving, eval_result_path):
     )


-def create_storage(data_file, cache_path):
+def create_storage(data_file, cache_path, bench_name="eval_result"):
     """Create the storage operator."""
     return FileStorage(
         first_entry_file_name=data_file,
         cache_path=cache_path,
-        file_name_prefix="eval",
+        file_name_prefix=bench_name,
         cache_type="json"
     )

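
(A sketch of wiring per-bench storage from BENCH_CONFIG; reusing output_dir as the cache path is an assumption, not something this diff pins down:)

    for bench in BENCH_CONFIG:
        storage = create_storage(
            data_file=bench["input_file"],
            cache_path=bench["output_dir"],   # assumed: cache alongside the results
            bench_name=bench["name"],         # each bench gets its own file prefix
        )
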
@@ -214,7 +213,7 @@ def get_evaluator_config():
     return {
         "JUDGE_MODEL_CONFIG": JUDGE_MODEL_CONFIG,
         "TARGET_MODELS": TARGET_MODELS,
-        "DATA_CONFIG": DATA_CONFIG,
+        "BENCH_CONFIG": BENCH_CONFIG,
         "EVALUATOR_RUN_CONFIG": EVALUATOR_RUN_CONFIG,
         "EVAL_CONFIG": EVAL_CONFIG,
         "create_judge_serving": create_judge_serving,
@@ -230,7 +229,6 @@ def get_evaluator_config():
 if __name__ == "__main__":
     # simple evaluation when run directly
     print("Starting local evaluation...")
-    from dataflow.cli_funcs.cli_eval import run_evaluation

     try:
         config = get_evaluator_config()
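
(The __main__ body is truncated here before run_evaluation is called, and its signature is not shown in this diff. For programmatic use, a hedged sketch of pulling the pieces back out of get_evaluator_config(); passing output_dir as eval_result_path is an assumption:)

    config = get_evaluator_config()
    judge_serving = create_judge_serving()
    for bench in config["BENCH_CONFIG"]:
        storage = create_storage(bench["input_file"], bench["output_dir"], bench["name"])
        evaluator = create_evaluator(judge_serving, bench["output_dir"])
        # evaluator.run(...) would then be driven by EVALUATOR_RUN_CONFIG;
        # the exact arguments are not visible in this diff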