
Commit aba86e3
evalpipeline
1 parent 104fd76

2 files changed: +59 −61 lines


dataflow/cli_funcs/eval_pipeline/eval_api.py

Lines changed: 9 additions & 9 deletions
@@ -57,7 +57,7 @@ def build_prompt(self, question, answer, reference_answer):
 # Judge model settings
 JUDGE_MODEL_CONFIG = {
     "model_name": "gpt-4o-mini",
-    "api_url": "http://123.129.219.111:3000/v1/chat/completions",  # request URL (required)
+    "api_url": "API_URL",  # request URL (required)
     "api_key_env": "DF_API_KEY",  # api_key (required)
     "max_workers": 3,
     "max_retries": 5,
@@ -77,9 +77,9 @@ def build_prompt(self, question, answer, reference_answer):
     # },
     {
         "name": "qwen_7b",
-        "path": "/mnt/DataFlow/fyl/workspace/Qwen2.5-7B-Instruct",
+        "path": "./Qwen2.5-7B-Instruct",
         # each model can use different parameters
-        "vllm_tensor_parallel_size": 2,
+        "vllm_tensor_parallel_size": 1,
         "vllm_temperature" : 0.1,
         "vllm_top_p" :0.9,
         "vllm_max_tokens": 2048,
@@ -97,9 +97,9 @@ def build_prompt(self, question, answer, reference_answer):
     },
     {
         "name": "qwen_3b",
-        "path": "/mnt/DataFlow/fyl/workspace/Qwen2.5-3B-Instruct",
+        "path": "./Qwen2.5-3B-Instruct",
         # each model can use different parameters
-        "vllm_tensor_parallel_size": 2,
+        "vllm_tensor_parallel_size": 1,
         "vllm_temperature" : 0.1,
         "vllm_top_p" :0.9,
         "vllm_max_tokens": 2048,
@@ -127,15 +127,15 @@ def build_prompt(self, question, answer, reference_answer):
 # Data Configuration
 BENCH_CONFIG = [
     {
-        "name":"fuck_data",
-        "input_file": "/mnt/DataFlow/fyl/workspace/.cache/data/qa.json",  # input data file
+        "name":"bench_name",
+        "input_file": "./.cache/data/qa.json",  # input data file
         "output_dir": "./api_eval",  # output directory
         "question_key": "input",  # question field in the raw data
         "reference_answer_key": "output"  # reference-answer field in the raw data
     },
     {
-        "name":"fuck_fuck_data",
-        "input_file": "/mnt/DataFlow/fyl/workspace/nt.json",  # input data file
+        "name":"bench_name_1",
+        "input_file": "./.cache/data/data_qa.json",  # input data file
         "output_dir": "./api_eval",  # output directory
         "question_key": "input",  # question field in the raw data
         "reference_answer_key": "output"  # reference-answer field in the raw data

dataflow/cli_funcs/eval_pipeline/eval_local.py

Lines changed: 50 additions & 52 deletions
@@ -5,6 +5,7 @@
 from dataflow.serving import LocalModelLLMServing_vllm
 from dataflow.utils.storage import FileStorage
 from dataflow.operators.core_text import BenchDatasetEvaluatorQuestion
+from dataflow.cli_funcs.cli_eval import run_evaluation
 
 # =============================================================================
 # Fair Evaluation Prompt Template
@@ -15,7 +16,6 @@ class FairAnswerJudgePrompt:
 
     def build_prompt(self, question, answer, reference_answer):
         prompt = f"""You are an expert evaluator assessing answer quality for academic questions.
-
 **Question:**
 {question}
 
@@ -54,78 +54,77 @@ def build_prompt(self, question, answer, reference_answer):
 # Judge Model Configuration (local strong model as judge)
 JUDGE_MODEL_CONFIG = {
     "model_path": "./Qwen2.5-7B-Instruct",  # use a stronger model as the judge
-    "hf_cache_dir" : "",
-    "hf_local_dir" : "",
-    "vllm_tensor_parallel_size": 2,
-    "vllm_temperature" : 0.9,
-    "vllm_top_p" : 0.9,
+    "hf_cache_dir": "",
+    "hf_local_dir": "",
+    "vllm_tensor_parallel_size": 1,
+    "vllm_temperature": 0.9,
+    "vllm_top_p": 0.9,
     "vllm_max_tokens": 512,
-    "vllm_repetition_penalty" : 1.0,
-    "vllm_seed" : None,
-    "vllm_max_model_len" : None,
-    "vllm_gpu_memory_utilization" : 0.9
+    "vllm_repetition_penalty": 0.8,
+    "vllm_seed": None,
+    "vllm_max_model_len": None,
+    "vllm_gpu_memory_utilization": 0.9
 }
 
 # Target Models Configuration (dict format - required)
 TARGET_MODELS = [
-    # {
-    #     "name": "qwen_3b",  # model name (optional; defaults to the last path segment)
-    #     "path": "./Qwen2.5-3B-Instruct",  # model path (required)
+    {
+        "name": "qwen_3b",  # model name (optional; defaults to the last path segment)
+        "path": "./Qwen2.5-3B-Instruct",  # model path (required)
 
-    #     # ===== model-loading parameters for answer generation (optional) =====
-    #     "tensor_parallel_size": 1,  # number of parallel GPUs
-    #     "max_tokens": 1024,  # max generated tokens
-    #     "gpu_memory_utilization": 0.8,  # GPU memory utilization
-    # },
+        # each model can use different parameters
+        "vllm_tensor_parallel_size": 1,
+        "vllm_temperature": 0.1,
+        "vllm_top_p": 0.9,
+        "vllm_max_tokens": 2048,
+        "vllm_repetition_penalty": 1.0,
+        "vllm_seed": None,
+        "vllm_gpu_memory_utilization": 0.9,
+    },
     {
         "name": "qwen_7b",
         "path": "./Qwen2.5-7B-Instruct",
         # each model can use different parameters
-        "vllm_tensor_parallel_size": 2,
-        "vllm_temperature" : 0.1,
-        "vllm_top_p" :0.9,
+        "vllm_tensor_parallel_size": 1,
+        "vllm_temperature": 0.1,
+        "vllm_top_p": 0.9,
         "vllm_max_tokens": 2048,
-        "vllm_repetition_penalty":1.0,
-        "vllm_seed":None,
+        "vllm_repetition_penalty": 1.0,
+        "vllm_seed": None,
         "vllm_gpu_memory_utilization": 0.9,
 
         # a custom prompt can be set per model
-        "answer_prompt": """please answer the following question:"""
+        "answer_prompt": """Please answer the following question based on the provided information. Put your answer in \\boxed{{}}.
+
+For example, if the answer is B, output: \\boxed{{B}}"""
 
     },
 
-    # add more models...
-    # {
-    #     "name": "llama_8b",
-    #     "path": "meta-llama/Llama-3-8B-Instruct",
-    #     "tensor_parallel_size": 2
-    # }
 ]
 
-# Data Configuration
+# Bench Configuration
 BENCH_CONFIG = [
     {
-        "name": "math_bench",  # bench name
+        "name": "bench_name",  # bench name
         "input_file": "./.cache/data/qa.json",  # data file
         "question_key": "input",  # question field name
         "reference_answer_key": "output",  # answer field name
-        "output_dir": "./eval_results/math_bench",  # output directory
+        "output_dir": "./eval_results/bench_name",  # output directory
+    },
+    {
+        "name": "bench_name_1",
+        "input_file": "./.cache/data/data_qa.json",
+        "question_key": "input",
+        "reference_answer_key": "output",
+        "output_dir": "./eval_results/bench_name_1",
     },
-    # {
-    #     "name": "reasoning_bench",
-    #     "input_file": "./.cache/data/reasoning_qa.jsonl",
-    #     "question_key": "input",
-    #     "reference_answer_key": "output",
-    #     "output_dir": "./eval_results/reasoning_bench",
-    # },
     # {
     #     "name": "code_bench",
     #     "input_file": "./.cache/data/code_qa.json",
     #     "question_key": "problem",
     #     "reference_answer_key": "solution",
     #     "output_dir": "./eval_results/code_bench",
     # },
-
 ]
 
 # Evaluator Run Configuration (parameters passed to BenchDatasetEvaluator.run)
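
The two lists above drive the whole run: every bench is evaluated against every target model. A small runnable sanity check one could apply to these shapes (check_configs is illustrative, not part of the commit; the name-defaulting rule follows the comment in TARGET_MODELS):

```python
# Hypothetical helper: validate the config lists and enumerate bench/model pairs.
import os

def check_configs(target_models, bench_config):
    for m in target_models:
        assert "path" in m, f"model entry missing 'path': {m}"
        # "name" is optional and defaults to the last path segment
        m.setdefault("name", os.path.basename(m["path"].rstrip("/")))
    for b in bench_config:
        for key in ("name", "input_file", "question_key",
                    "reference_answer_key", "output_dir"):
            assert key in b, f"bench {b.get('name', '?')!r} missing {key!r}"
    return [(b["name"], m["name"]) for b in bench_config for m in target_models]

# e.g. [('bench_name', 'qwen_3b'), ('bench_name', 'qwen_7b'),
#       ('bench_name_1', 'qwen_3b'), ('bench_name_1', 'qwen_7b')]
```
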
@@ -164,15 +163,15 @@ def create_judge_serving():
     # Enhanced VLLM configuration
     vllm_config = {
         "hf_model_name_or_path": model_path,
-        "hf_cache_dir" : JUDGE_MODEL_CONFIG.get("hf_cache_dir", None),
-        "hf_local_dir" : JUDGE_MODEL_CONFIG.get("hf_local_dir", None),
+        "hf_cache_dir": JUDGE_MODEL_CONFIG.get("hf_cache_dir"),
+        "hf_local_dir": JUDGE_MODEL_CONFIG.get("hf_local_dir"),
         "vllm_tensor_parallel_size": JUDGE_MODEL_CONFIG.get("vllm_tensor_parallel_size", 1),
-        "vllm_temperature" : JUDGE_MODEL_CONFIG.get("vllm_temperature",0.9),
-        "vllm_top_p" : JUDGE_MODEL_CONFIG.get("vllm_top_p",0.9),
+        "vllm_temperature": JUDGE_MODEL_CONFIG.get("vllm_temperature", 0.9),
+        "vllm_top_p": JUDGE_MODEL_CONFIG.get("vllm_top_p", 0.9),
         "vllm_max_tokens": JUDGE_MODEL_CONFIG.get("max_tokens", 512),
-        "vllm_repetition_penalty" : JUDGE_MODEL_CONFIG.get("vllm_repetition_penalty", 1.0),
-        "vllm_seed" : JUDGE_MODEL_CONFIG.get("vllm_seed",None),
-        "vllm_max_model_len" : JUDGE_MODEL_CONFIG.get("vllm_max_model_len",None),
+        "vllm_repetition_penalty": JUDGE_MODEL_CONFIG.get("vllm_repetition_penalty", 1.0),
+        "vllm_seed": JUDGE_MODEL_CONFIG.get("vllm_seed", None),
+        "vllm_max_model_len": JUDGE_MODEL_CONFIG.get("vllm_max_model_len", None),
         "vllm_gpu_memory_utilization": JUDGE_MODEL_CONFIG.get("gpu_memory_utilization", 0.9)
     }
 
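
The normalized dict is what gets handed to the local serving backend. A minimal sketch of that hand-off, assuming LocalModelLLMServing_vllm accepts these keys as keyword arguments (the actual instantiation sits outside the changed lines):

```python
# Assumed call shape: the vllm_config keys mirror the backend's keyword
# arguments, so the dict can be splatted directly.
from dataflow.serving import LocalModelLLMServing_vllm

judge_serving = LocalModelLLMServing_vllm(**vllm_config)
```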

@@ -195,12 +194,12 @@ def create_evaluator(judge_serving, eval_result_path):
     )
 
 
-def create_storage(data_file, cache_path):
+def create_storage(data_file, cache_path, bench_name="eval_result"):
     """Create the storage operator"""
     return FileStorage(
         first_entry_file_name=data_file,
         cache_path=cache_path,
-        file_name_prefix="eval",
+        file_name_prefix=bench_name,
         cache_type="json"
     )
 
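
With the new bench_name parameter, each bench writes cache files under its own prefix instead of every run sharing "eval". For example (the cache_path value "./.cache" is an assumption; the input files follow BENCH_CONFIG above):

```python
# One storage per bench, so results from bench_name and bench_name_1
# no longer collide under a shared file-name prefix.
for bench in BENCH_CONFIG:
    storage = create_storage(
        data_file=bench["input_file"],
        cache_path="./.cache",
        bench_name=bench["name"],
    )
```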

@@ -214,7 +213,7 @@ def get_evaluator_config():
     return {
         "JUDGE_MODEL_CONFIG": JUDGE_MODEL_CONFIG,
         "TARGET_MODELS": TARGET_MODELS,
-        "DATA_CONFIG": DATA_CONFIG,
+        "BENCH_CONFIG": BENCH_CONFIG,
         "EVALUATOR_RUN_CONFIG": EVALUATOR_RUN_CONFIG,
         "EVAL_CONFIG": EVAL_CONFIG,
         "create_judge_serving": create_judge_serving,
@@ -230,7 +229,6 @@ def get_evaluator_config():
 if __name__ == "__main__":
     # simple evaluation when run directly
     print("Starting local evaluation...")
-    from dataflow.cli_funcs.cli_eval import run_evaluation
 
     try:
         config = get_evaluator_config()
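
For reference, the entry point this hunk implies now reads roughly as follows, assuming run_evaluation consumes the dict returned by get_evaluator_config (the call itself is outside the changed lines):

```python
# Sketch of the resulting __main__ flow; run_evaluation's exact signature
# is not shown in this diff and is assumed here.
if __name__ == "__main__":
    print("Starting local evaluation...")
    try:
        config = get_evaluator_config()
        run_evaluation(config)  # assumed call shape
    except Exception as e:
        print(f"Evaluation failed: {e}")
```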
