From 104fd760615901eb8e7affde7ce6bd4a8acd3de5 Mon Sep 17 00:00:00 2001
From: Yalin Feng
Date: Mon, 24 Nov 2025 21:37:15 +0800
Subject: [PATCH 1/2] add batch bench

---
 dataflow/cli_funcs/cli_eval.py               | 314 ++++++++++++------
 dataflow/cli_funcs/eval_pipeline/eval_api.py |  65 +++-
 .../cli_funcs/eval_pipeline/eval_local.py    |  67 +++-
 3 files changed, 313 insertions(+), 133 deletions(-)

diff --git a/dataflow/cli_funcs/cli_eval.py b/dataflow/cli_funcs/cli_eval.py
index f279b10e..a6e2c523 100644
--- a/dataflow/cli_funcs/cli_eval.py
+++ b/dataflow/cli_funcs/cli_eval.py
@@ -8,7 +8,6 @@
 from pathlib import Path
 from typing import List, Dict, Any
 from datetime import datetime
-
 from dataflow import get_logger
 from dataflow.serving import LocalModelLLMServing_vllm
 from dataflow.operators.reasoning import ReasoningAnswerGenerator
@@ -35,7 +34,6 @@ class EvaluationPipeline:
 
     def __init__(self, config: Dict[str, Any]):
         self.config = config
-        # self.cli_args = cli_args
         self.prepared_models = []
         self.generated_files = []
 
@@ -103,10 +101,10 @@ def _prepare_models(self) -> List[Dict]:
                     continue
 
                 model_info = {
-                    **default_config,  # 1. 先设置默认值
-                    **item,  # 2. 用户配置覆盖默认值
-                    "name": item.get("name", Path(item["path"]).name),  # 3. 确保name字段正确
-                    "type": "local"  # 4. 强制设置type
+                    **default_config,
+                    **item,
+                    "name": item.get("name", Path(item["path"]).name),
+                    "type": "local"
                 }
             else:
                 logger.error(f"Invalid model format at index {idx}")
@@ -131,119 +129,180 @@ def _clear_vllm_cache(self):
             logger.warning(f"Failed to clear cache: {e}")
 
     def _generate_answers(self) -> List[Dict]:
-        """生成模型答案"""
+        """生成模型答案 - 每个模型只加载一次"""
         generated_files = []
-        data_config = self.config.get("DATA_CONFIG", {})
-        input_file = data_config.get("input_file", "./.cache/data/qa.json")
-
-        if not Path(input_file).exists():
-            logger.error(f"Input file not found: {input_file}")
+        bench_config_list = self.config.get("BENCH_CONFIG", [])
+
+        if not bench_config_list:
+            logger.error("No BENCH_CONFIG found")
             return []
-
-        self._clear_vllm_cache()
-
+
+        # 外层循环:遍历模型
         for idx, model_info in enumerate(self.prepared_models, 1):
             llm_serving = None
-            answer_generator = None
-            storage = None
-
+
             try:
-                logger.info(f"[{idx}/{len(self.prepared_models)}] Processing: {model_info['name']}")
-
-                cache_dir = model_info.get('cache_dir', './.cache/eval')
-                Path(cache_dir).mkdir(parents=True, exist_ok=True)
-                output_file = f"{cache_dir}/answers_{model_info['name']}.json"
-
+                logger.info(f"[{idx}/{len(self.prepared_models)}] Loading model: {model_info['name']}")
+
+                # 清理缓存(每个模型加载前清理一次)
+                self._clear_vllm_cache()
+
+                # 加载模型
                 llm_serving = LocalModelLLMServing_vllm(
                     hf_model_name_or_path=model_info['path'],
-                    vllm_tensor_parallel_size=model_info.get('tensor_parallel_size', 2),
-                    vllm_max_tokens=model_info.get('max_tokens', 1024),
-                    vllm_gpu_memory_utilization=model_info.get('gpu_memory_utilization', 0.8)
-                )
-
-                # 答案生成器
-                custom_prompt = model_info.get('answer_prompt', DEFAULT_ANSWER_PROMPT)
-                answer_generator = ReasoningAnswerGenerator(
-                    llm_serving=llm_serving,
-                    prompt_template=DiyAnswerGeneratorPrompt(custom_prompt)
+                    vllm_tensor_parallel_size=model_info.get('vllm_tensor_parallel_size', 2),
+                    vllm_temperature=model_info.get('vllm_temperature', 0.7),
+                    vllm_top_p=model_info.get('vllm_top_p', 0.9),
+                    vllm_max_tokens=model_info.get('vllm_max_tokens', 1024),
+                    vllm_repetition_penalty=model_info.get('vllm_repetition_penalty', 1.0),
+                    vllm_seed=model_info.get('vllm_seed', None),
+                    vllm_gpu_memory_utilization=model_info.get('vllm_gpu_memory_utilization', 0.8)
                 )
-
-                # 存储
-                cache_path = f"{cache_dir}/{model_info['name']}_generation"
-                storage = FileStorage(
-                    first_entry_file_name=input_file,
-                    cache_path=cache_path,
-                    file_name_prefix=model_info.get('file_prefix', 'answer_gen'),
-                    cache_type=model_info.get('cache_type', 'json')
-                )
-
-                # 运行生成
-                answer_generator.run(
-                    storage=storage.step(),
-                    input_key=data_config.get("question_key", "input"),
-                    output_key=model_info.get('output_key', 'model_generated_answer')
-                )
-
-                # 保存结果
-                file_prefix = model_info.get('file_prefix', 'answer_gen')
-                cache_type = model_info.get('cache_type', 'json')
-
-                # 查找所有匹配的文件
-                pattern = f"{file_prefix}_step*.{cache_type}"
-                matching_files = sorted(Path(cache_path).glob(pattern))
-
-                if matching_files:
-                    # 使用最新的文件(最后一个step)
-                    gen_file = matching_files[-1]
-                    shutil.copy2(gen_file, output_file)
-                    generated_files.append({
-                        "model_name": model_info['name'],
-                        "model_path": model_info['path'],
-                        "file_path": output_file
-                    })
-                else:
-                    logger.error(f"No generated file found for {model_info['name']} in {cache_path}")
-                    continue
-
+
+                # 内层循环:遍历bench(复用模型)
+                for bench_idx, bench_config in enumerate(bench_config_list, 1):
+                    answer_generator = None
+                    storage = None
+
+                    try:
+                        bench_name = bench_config.get("name", "default")
+                        logger.info(f"  [{bench_idx}/{len(bench_config_list)}] Processing bench: {bench_name}")
+
+                        input_file = bench_config["input_file"]
+                        if not Path(input_file).exists():
+                            logger.error(f"Input file not found: {input_file}")
+                            continue
+
+                        question_key = bench_config.get("question_key", "input")
+                        bench_output_dir = bench_config.get("output_dir", "./eval_results")
+
+                        # 设置缓存和输出目录
+                        cache_dir = model_info.get('cache_dir', './.cache/eval')
+                        Path(cache_dir).mkdir(parents=True, exist_ok=True)
+                        Path(bench_output_dir).mkdir(parents=True, exist_ok=True)
+
+                        output_file = f"{bench_output_dir}/{bench_name}_answers_{model_info['name']}.json"
+
+                        # 答案生成器(复用llm_serving)
+                        custom_prompt = model_info.get('answer_prompt', DEFAULT_ANSWER_PROMPT)
+                        answer_generator = ReasoningAnswerGenerator(
+                            llm_serving=llm_serving,
+                            prompt_template=DiyAnswerGeneratorPrompt(custom_prompt)
+                        )
+
+                        # 存储
+                        cache_path = f"{cache_dir}/{bench_name}_{model_info['name']}_generation"
+                        storage = FileStorage(
+                            first_entry_file_name=input_file,
+                            cache_path=cache_path,
+                            file_name_prefix=model_info.get('file_prefix', 'answer_gen'),
+                            cache_type=model_info.get('cache_type', 'json')
+                        )
+
+                        # 运行生成
+                        answer_generator.run(
+                            storage=storage.step(),
+                            input_key=question_key,
+                            output_key=model_info.get('output_key', 'model_generated_answer')
+                        )
+
+                        # 保存结果
+                        file_prefix = model_info.get('file_prefix', 'answer_gen')
+                        cache_type = model_info.get('cache_type', 'json')
+                        pattern = f"{file_prefix}_step*.{cache_type}"
+                        matching_files = sorted(Path(cache_path).glob(pattern))
+
+                        if matching_files:
+                            gen_file = matching_files[-1]
+                            shutil.copy2(gen_file, output_file)
+                            generated_files.append({
+                                "model_name": model_info['name'],
+                                "model_path": model_info['path'],
+                                "file_path": output_file,
+                                "bench_name": bench_name
+                            })
+                            logger.success(f"  ✓ Generated answers for {bench_name}")
+                        else:
+                            logger.error(f"No generated file found in {cache_path}")
+                            continue
+
+                    except Exception as e:
+                        logger.error(f"Failed to process bench {bench_name}: {e}")
+                        import traceback
+                        traceback.print_exc()
+                        continue
+
+                    finally:
+                        # 清理bench级别的资源
+                        if answer_generator is not None:
+                            del answer_generator
+                        if storage is not None:
+                            del storage
+                        gc.collect()
+
             except Exception as e:
-                logger.error(f"Failed to process {model_info['name']}: {e}")
+                logger.error(f"Failed to load model {model_info['name']}: {e}")
logger.error(f"Failed to load model {model_info['name']}: {e}") + import traceback + traceback.print_exc() continue - + finally: - if answer_generator is not None: - del answer_generator - if storage is not None: - del storage + # 清理模型级别的资源 if llm_serving is not None: + logger.info(f"Unloading model: {model_info['name']}") del llm_serving gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.synchronize() - + return generated_files def _run_evaluation(self) -> List[Dict]: """运行评估""" try: + logger.info("Loading judge model...") judge_serving = self.config["create_judge_serving"]() + logger.info("✓ Judge model loaded") except Exception as e: logger.error(f"Failed to create judge: {e}") return [] results = [] eval_config = self.config.get("EVALUATOR_RUN_CONFIG", {}) - - for file_info in self.generated_files: + + total_evals = len(self.generated_files) + + for eval_idx, file_info in enumerate(self.generated_files, 1): try: + bench_name = file_info.get('bench_name', 'unknown') + model_name = file_info['model_name'] + + logger.info(f"\n[Eval {eval_idx}/{total_evals}] {model_name} × {bench_name}") + + # 找到对应的bench配置 + bench_config = None + for bc in self.config.get("BENCH_CONFIG", []): + if bc.get("name") == bench_name: + bench_config = bc + break + + if not bench_config: + logger.warning(f" ⚠️ No bench config found") + continue + + eval_output_dir = bench_config.get("eval_output_dir", "./eval_results") + + # 执行评估 + logger.info(f" 📊 Evaluating...") timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - result_file = f"./eval_results/{timestamp}_{file_info['model_name']}/result.json" + result_file = f"{eval_output_dir}/{timestamp}_{model_name}/result.json" Path(result_file).parent.mkdir(parents=True, exist_ok=True) storage = self.config["create_storage"]( file_info["file_path"], - f"./.cache/eval/{file_info['model_name']}" + f"./.cache/eval/{model_name}", + bench_name ) evaluator = self.config["create_evaluator"](judge_serving, result_file) @@ -258,40 +317,95 @@ def _run_evaluation(self) -> List[Dict]: with open(result_file, 'r') as f: data = json.load(f) if data: - data[0]["model_name"] = file_info['model_name'] - results.append(data[0]) + result_data = data[0].copy() + result_data["model_name"] = model_name + result_data["bench_name"] = bench_name + results.append(result_data) + logger.info(f" ✓ Accuracy: {result_data.get('accuracy', 0):.3f}") except Exception as e: - logger.error(f"Eval failed for {file_info['model_name']}: {e}") + logger.error(f" ✗ Evaluation failed: {e}") + import traceback + traceback.print_exc() continue + logger.info(f"\n✓ Evaluation complete: {len(results)} results") return results def _generate_report(self, results: List[Dict]): - """生成报告""" + """生成报告 - 支持批量bench独立输出""" if not results: logger.warning("No results") return - sorted_results = sorted(results, key=lambda x: x.get("accuracy", 0), reverse=True) + # 打印报告 + print("\n" + "="*80) + print("EVALUATION RESULTS - ALL BENCHES & MODELS") + print("="*80) - print("\n" + "=" * 60) - print("Model Evaluation Results") - print("=" * 60) + # 按准确率排序 + sorted_results = sorted(results, key=lambda x: x.get("accuracy", 0), reverse=True) + for i, r in enumerate(sorted_results, 1): - print(f"{i}. {r['model_name']}") + print(f"{i}. 
[{r.get('bench_name', 'unknown')}] {r['model_name']}") print(f" Accuracy: {r.get('accuracy', 0):.3f}") print(f" Total: {r.get('total_samples', 0)}") print(f" Matched: {r.get('matched_samples', 0)}") print() - print("=" * 60) - - # 保存详细报告 - report_file = "./eval_results/report.json" - Path(report_file).parent.mkdir(parents=True, exist_ok=True) - with open(report_file, 'w') as f: - json.dump({"results": sorted_results}, f, indent=2) - print(f"Detailed report: {report_file}") + + # 按bench分组保存结果 + bench_config_list = self.config.get("BENCH_CONFIG", []) + + # 为每个bench单独保存结果 + bench_groups = {} + for result in sorted_results: + bench_name = result.get('bench_name', 'unknown') + if bench_name not in bench_groups: + bench_groups[bench_name] = [] + bench_groups[bench_name].append(result) + + # 保存每个bench的结果到各自的output_dir + for bench_name, bench_results in bench_groups.items(): + # 找到对应bench的配置 + bench_output_dir = "./eval_results" # 默认值 + for bench_config in bench_config_list: + if bench_config.get("name") == bench_name: + bench_output_dir = bench_config.get("output_dir", "./eval_results") + break + + # 保存该bench的结果 + report_file = f"{bench_output_dir}/results.json" + Path(report_file).parent.mkdir(parents=True, exist_ok=True) + + report_data = { + "bench_name": bench_name, + "timestamp": datetime.now().isoformat(), + "total_evaluations": len(bench_results), + "results": bench_results + } + + with open(report_file, 'w', encoding='utf-8') as f: + json.dump(report_data, f, ensure_ascii=False, indent=2) + + print(f"Bench '{bench_name}' results saved to: {report_file}") + + # 另外保存一个汇总文件(包含所有bench) + all_results_file = "./eval_results/all_results.json" + Path(all_results_file).parent.mkdir(parents=True, exist_ok=True) + + all_report_data = { + "timestamp": datetime.now().isoformat(), + "total_evaluations": len(sorted_results), + "total_benches": len(bench_groups), + "results": sorted_results + } + + with open(all_results_file, 'w', encoding='utf-8') as f: + json.dump(all_report_data, f, ensure_ascii=False, indent=2) + + print("="*80) + print(f"All results summary saved to: {all_results_file}") + print("="*80) class DataFlowEvalCLI: diff --git a/dataflow/cli_funcs/eval_pipeline/eval_api.py b/dataflow/cli_funcs/eval_pipeline/eval_api.py index a3c0acce..95312581 100644 --- a/dataflow/cli_funcs/eval_pipeline/eval_api.py +++ b/dataflow/cli_funcs/eval_pipeline/eval_api.py @@ -5,7 +5,7 @@ from dataflow.operators.core_text import BenchDatasetEvaluatorQuestion from dataflow.serving import APILLMServing_request from dataflow.utils.storage import FileStorage - +from dataflow.cli_funcs.cli_eval import run_evaluation # ============================================================================= # Fair Evaluation Prompt Template @@ -57,7 +57,7 @@ def build_prompt(self, question, answer, reference_answer): # 评估模型设置 JUDGE_MODEL_CONFIG = { "model_name": "gpt-4o-mini", - "api_url": "", # 请求URL 必填 / request (required) + "api_url": "http://123.129.219.111:3000/v1/chat/completions", # 请求URL 必填 / request (required) "api_key_env": "DF_API_KEY", # api_key 必填 / api_key (required) "max_workers": 3, "max_retries": 5, @@ -77,12 +77,35 @@ def build_prompt(self, question, answer, reference_answer): # }, { "name": "qwen_7b", - "path": "./Qwen2.5-7B-Instruct", + "path": "/mnt/DataFlow/fyl/workspace/Qwen2.5-7B-Instruct", + # 大模型可以用不同的参数 + "vllm_tensor_parallel_size": 2, + "vllm_temperature" : 0.1, + "vllm_top_p" :0.9, + "vllm_max_tokens": 2048, + "vllm_repetition_penalty":1.0, + "vllm_seed":None, + "vllm_gpu_memory_utilization": 0.9, + 
+        # 默认被评估模型提示词
+        # 再次提示:该prompt为被评估模型的提示词,请勿与评估模型提示词混淆!!!
+        # You can customize prompts for each model. If not specified, defaults to the template in build_prompt function.
+        # Default prompt for evaluated models
+        # IMPORTANT: This is the prompt for models being evaluated, NOT for the judge model!!!
+        "answer_prompt": """please answer the following question:"""  # 这里不要使用{question} / do not code {question} here
+    },
+    {
+        "name": "qwen_3b",
+        "path": "/mnt/DataFlow/fyl/workspace/Qwen2.5-3B-Instruct",
         # 大模型可以用不同的参数
-        "tensor_parallel_size": 2,
-        "max_tokens": 2048,
-        "gpu_memory_utilization": 0.9,
+        "vllm_tensor_parallel_size": 2,
+        "vllm_temperature" : 0.1,
+        "vllm_top_p" :0.9,
+        "vllm_max_tokens": 2048,
+        "vllm_repetition_penalty":1.0,
+        "vllm_seed":None,
+        "vllm_gpu_memory_utilization": 0.9,
 
         # 可以为每个模型自定义提示词 不写就为默认模板 即build_prompt函数中的prompt
         # 默认被评估模型提示词
@@ -102,13 +125,22 @@ def build_prompt(self, question, answer, reference_answer):
 ]
 
 # Data Configuration
-DATA_CONFIG = {
-    "input_file": "./.cache/data/qa.json",  # 输入数据文件
-    "output_dir": "./eval_results",  # 输出目录
-    "question_key": "input",  # 原始数据中的问题字段
-    "reference_answer_key": "output"  # 原始数据中的参考答案字段
-}
-
+BENCH_CONFIG = [
+    {
+        "name":"fuck_data",
+        "input_file": "/mnt/DataFlow/fyl/workspace/.cache/data/qa.json",  # 输入数据文件
+        "output_dir": "./api_eval",  # 输出目录
+        "question_key": "input",  # 原始数据中的问题字段
+        "reference_answer_key": "output"  # 原始数据中的参考答案字段
+    },
+    {
+        "name":"fuck_fuck_data",
+        "input_file": "/mnt/DataFlow/fyl/workspace/nt.json",  # 输入数据文件
+        "output_dir": "./api_eval",  # 输出目录
+        "question_key": "input",  # 原始数据中的问题字段
+        "reference_answer_key": "output"  # 原始数据中的参考答案字段
+    }
+]
 # Evaluator Run Configuration (parameters passed to BenchDatasetEvaluator.run)
 EVALUATOR_RUN_CONFIG = {
     "input_test_answer_key": "model_generated_answer",  # 模型生成的答案字段名
@@ -157,12 +189,12 @@ def create_evaluator(judge_serving, eval_result_path):
     )
 
 
-def create_storage(data_file, cache_path):
+def create_storage(data_file, cache_path,bench_name="eval_result"):
     """创建存储算子"""
     return FileStorage(
         first_entry_file_name=data_file,
         cache_path=cache_path,
-        file_name_prefix="eval_result",
+        file_name_prefix=bench_name,
         cache_type="json"
     )
 
@@ -177,7 +209,7 @@ def get_evaluator_config():
     return {
         "JUDGE_MODEL_CONFIG": JUDGE_MODEL_CONFIG,  # 评估模型设置映射
         "TARGET_MODELS": TARGET_MODELS,  # 被评估模型设置映射
-        "DATA_CONFIG": DATA_CONFIG,  # 数据设置映射
+        "BENCH_CONFIG": BENCH_CONFIG,  # 数据设置映射
         "EVAL_CONFIG": EVAL_CONFIG,  # 评估模式设置映射
         "EVALUATOR_RUN_CONFIG": EVALUATOR_RUN_CONFIG,  # 评估数据集字段映射
         "create_judge_serving": create_judge_serving,
@@ -195,7 +227,6 @@ def get_evaluator_config():
     # 直接运行时的简单评估
     # Simple evaluation when run directly
     print("Starting API evaluation...")
-    from dataflow.cli_funcs.cli_eval import run_evaluation
 
     try:
         config = get_evaluator_config()
diff --git a/dataflow/cli_funcs/eval_pipeline/eval_local.py b/dataflow/cli_funcs/eval_pipeline/eval_local.py
index 4053685f..29de3393 100644
--- a/dataflow/cli_funcs/eval_pipeline/eval_local.py
+++ b/dataflow/cli_funcs/eval_pipeline/eval_local.py
@@ -54,9 +54,16 @@ def build_prompt(self, question, answer, reference_answer):
 # Judge Model Configuration (local strong model as judge)
 JUDGE_MODEL_CONFIG = {
     "model_path": "./Qwen2.5-7B-Instruct",  # 用更强的模型做裁判
-    "tensor_parallel_size": 1,
-    "max_tokens": 512,
-    "gpu_memory_utilization": 0.8,
+    "hf_cache_dir" : "",
+    "hf_local_dir" : "",
+    "vllm_tensor_parallel_size": 2,
+    "vllm_temperature" : 0.9,
+    "vllm_top_p" : 0.9,
+    "vllm_max_tokens": 512,
+    "vllm_repetition_penalty" : 1.0,
+    "vllm_seed" : None,
+    "vllm_max_model_len" : None,
+    "vllm_gpu_memory_utilization" : 0.9
 }
 
 # Target Models Configuration (字典格式 - 必需)
@@ -73,11 +80,14 @@ def build_prompt(self, question, answer, reference_answer):
     {
         "name": "qwen_7b",
         "path": "./Qwen2.5-7B-Instruct",
 
         # 大模型可以用不同的参数
-        "tensor_parallel_size": 2,
-        "max_tokens": 2048,
-        "gpu_memory_utilization": 0.9,
+        "vllm_tensor_parallel_size": 2,
+        "vllm_temperature" : 0.1,
+        "vllm_top_p" :0.9,
+        "vllm_max_tokens": 2048,
+        "vllm_repetition_penalty":1.0,
+        "vllm_seed":None,
+        "vllm_gpu_memory_utilization": 0.9,
 
         # 可以为每个模型自定义提示词
         "answer_prompt": """please answer the following question:"""
@@ -93,12 +103,30 @@ def build_prompt(self, question, answer, reference_answer):
 ]
 
 # Data Configuration
-DATA_CONFIG = {
-    "input_file": "/data1/fyl/workspace/.cache/data/qa.json",  # 输入数据文件
-    "output_dir": "./eval_results",  # 输出目录
-    "question_key": "input",  # 原始数据中的问题字段
-    "reference_answer_key": "output"  # 原始数据中的参考答案字段
-}
+BENCH_CONFIG = [
+    {
+        "name": "math_bench",                    # bench名称
+        "input_file": "./.cache/data/qa.json",   # 数据文件
+        "question_key": "input",                 # 问题字段名
+        "reference_answer_key": "output",        # 答案字段名
+        "output_dir": "./eval_results/math_bench",  # 输出目录
+    },
+    # {
+    #     "name": "reasoning_bench",
+    #     "input_file": "./.cache/data/reasoning_qa.jsonl",
+    #     "question_key": "input",
+    #     "reference_answer_key": "output",
+    #     "output_dir": "./eval_results/reasoning_bench",
+    # },
+    # {
+    #     "name": "code_bench",
+    #     "input_file": "./.cache/data/code_qa.json",
+    #     "question_key": "problem",
+    #     "reference_answer_key": "solution",
+    #     "output_dir": "./eval_results/code_bench",
+    # },
+
+]
 
 # Evaluator Run Configuration (parameters passed to BenchDatasetEvaluator.run)
 EVALUATOR_RUN_CONFIG = {
@@ -136,9 +164,16 @@ def create_judge_serving():
     # Enhanced VLLM configuration
     vllm_config = {
         "hf_model_name_or_path": model_path,
-        "vllm_tensor_parallel_size": JUDGE_MODEL_CONFIG.get("tensor_parallel_size", 1),
+        "hf_cache_dir" : JUDGE_MODEL_CONFIG.get("hf_cache_dir", None),
+        "hf_local_dir" : JUDGE_MODEL_CONFIG.get("hf_local_dir", None),
+        "vllm_tensor_parallel_size": JUDGE_MODEL_CONFIG.get("vllm_tensor_parallel_size", 1),
+        "vllm_temperature" : JUDGE_MODEL_CONFIG.get("vllm_temperature",0.9),
+        "vllm_top_p" : JUDGE_MODEL_CONFIG.get("vllm_top_p",0.9),
         "vllm_max_tokens": JUDGE_MODEL_CONFIG.get("max_tokens", 512),
-        "vllm_gpu_memory_utilization": JUDGE_MODEL_CONFIG.get("gpu_memory_utilization", 0.8)
+        "vllm_repetition_penalty" : JUDGE_MODEL_CONFIG.get("vllm_repetition_penalty", 1.0),
+        "vllm_seed" : JUDGE_MODEL_CONFIG.get("vllm_seed",None),
+        "vllm_max_model_len" : JUDGE_MODEL_CONFIG.get("vllm_max_model_len",None),
+        "vllm_gpu_memory_utilization": JUDGE_MODEL_CONFIG.get("gpu_memory_utilization", 0.9)
     }
 
     # Add optional VLLM parameters if they exist
@@ -165,7 +200,7 @@ def create_storage(data_file, cache_path):
     return FileStorage(
         first_entry_file_name=data_file,
         cache_path=cache_path,
-        file_name_prefix="eval_result",
+        file_name_prefix="eval",
         cache_type="json"
     )
 

From aba86e31169e53b5261061b5b61019a642cbb8a6 Mon Sep 17 00:00:00 2001
From: Yalin Feng
Date: Sat, 29 Nov 2025 03:36:52 +0800
Subject: [PATCH 2/2] evalpipeline

---
 dataflow/cli_funcs/eval_pipeline/eval_api.py  |  18 ++--
 .../cli_funcs/eval_pipeline/eval_local.py     | 102 +++++++++---------
 2 files changed, 59 insertions(+), 61 deletions(-)

diff --git a/dataflow/cli_funcs/eval_pipeline/eval_api.py b/dataflow/cli_funcs/eval_pipeline/eval_api.py
index 95312581..c3cb1bb0 100644
--- a/dataflow/cli_funcs/eval_pipeline/eval_api.py
+++ b/dataflow/cli_funcs/eval_pipeline/eval_api.py
@@ -57,7 +57,7 @@ def build_prompt(self, question, answer, reference_answer):
 # 评估模型设置
 JUDGE_MODEL_CONFIG = {
     "model_name": "gpt-4o-mini",
-    "api_url": "http://123.129.219.111:3000/v1/chat/completions",  # 请求URL 必填 / request (required)
+    "api_url": "API_URL",  # 请求URL 必填 / request (required)
     "api_key_env": "DF_API_KEY",  # api_key 必填 / api_key (required)
     "max_workers": 3,
     "max_retries": 5,
@@ -77,9 +77,9 @@ def build_prompt(self, question, answer, reference_answer):
 #     },
     {
         "name": "qwen_7b",
-        "path": "/mnt/DataFlow/fyl/workspace/Qwen2.5-7B-Instruct",
+        "path": "./Qwen2.5-7B-Instruct",
         # 大模型可以用不同的参数
-        "vllm_tensor_parallel_size": 2,
+        "vllm_tensor_parallel_size": 1,
         "vllm_temperature" : 0.1,
         "vllm_top_p" :0.9,
         "vllm_max_tokens": 2048,
@@ -97,9 +97,9 @@ def build_prompt(self, question, answer, reference_answer):
     },
     {
         "name": "qwen_3b",
-        "path": "/mnt/DataFlow/fyl/workspace/Qwen2.5-3B-Instruct",
+        "path": "./Qwen2.5-3B-Instruct",
         # 大模型可以用不同的参数
-        "vllm_tensor_parallel_size": 2,
+        "vllm_tensor_parallel_size": 1,
         "vllm_temperature" : 0.1,
         "vllm_top_p" :0.9,
         "vllm_max_tokens": 2048,
@@ -127,15 +127,15 @@ def build_prompt(self, question, answer, reference_answer):
 # Data Configuration
 BENCH_CONFIG = [
     {
-        "name":"fuck_data",
-        "input_file": "/mnt/DataFlow/fyl/workspace/.cache/data/qa.json",  # 输入数据文件
+        "name":"bench_name",
+        "input_file": "./.cache/data/qa.json",  # 输入数据文件
         "output_dir": "./api_eval",  # 输出目录
         "question_key": "input",  # 原始数据中的问题字段
         "reference_answer_key": "output"  # 原始数据中的参考答案字段
     },
     {
-        "name":"fuck_fuck_data",
-        "input_file": "/mnt/DataFlow/fyl/workspace/nt.json",  # 输入数据文件
+        "name":"bench_name_1",
+        "input_file": "./.cache/data/data_qa.json",  # 输入数据文件
         "output_dir": "./api_eval",  # 输出目录
         "question_key": "input",  # 原始数据中的问题字段
         "reference_answer_key": "output"  # 原始数据中的参考答案字段
diff --git a/dataflow/cli_funcs/eval_pipeline/eval_local.py b/dataflow/cli_funcs/eval_pipeline/eval_local.py
index 29de3393..0cacf850 100644
--- a/dataflow/cli_funcs/eval_pipeline/eval_local.py
+++ b/dataflow/cli_funcs/eval_pipeline/eval_local.py
@@ -5,6 +5,7 @@
 from dataflow.serving import LocalModelLLMServing_vllm
 from dataflow.utils.storage import FileStorage
 from dataflow.operators.core_text import BenchDatasetEvaluatorQuestion
+from dataflow.cli_funcs.cli_eval import run_evaluation
 
 # =============================================================================
 # Fair Evaluation Prompt Template
@@ -15,7 +16,6 @@ class FairAnswerJudgePrompt:
     def build_prompt(self, question, answer, reference_answer):
         prompt = f"""You are an expert evaluator assessing answer quality for academic questions.
-
 **Question:** {question}
 
@@ -54,70 +54,70 @@ def build_prompt(self, question, answer, reference_answer):
 # Judge Model Configuration (local strong model as judge)
 JUDGE_MODEL_CONFIG = {
     "model_path": "./Qwen2.5-7B-Instruct",  # 用更强的模型做裁判
-    "hf_cache_dir" : "",
-    "hf_local_dir" : "",
-    "vllm_tensor_parallel_size": 2,
-    "vllm_temperature" : 0.9,
-    "vllm_top_p" : 0.9,
+    "hf_cache_dir": "",
+    "hf_local_dir": "",
+    "vllm_tensor_parallel_size": 1,
+    "vllm_temperature": 0.9,
+    "vllm_top_p": 0.9,
     "vllm_max_tokens": 512,
-    "vllm_repetition_penalty" : 1.0,
-    "vllm_seed" : None,
-    "vllm_max_model_len" : None,
-    "vllm_gpu_memory_utilization" : 0.9
+    "vllm_repetition_penalty": 0.8,
+    "vllm_seed": None,
+    "vllm_max_model_len": None,
+    "vllm_gpu_memory_utilization": 0.9
 }
 
 # Target Models Configuration (字典格式 - 必需)
 TARGET_MODELS = [
-    # {
-    #     "name": "qwen_3b",  # 模型名称(可选,默认使用路径最后一部分)
-    #     "path": "./Qwen2.5-3B-Instruct",  # 模型路径(必需)
+    {
+        "name": "qwen_3b",  # 模型名称(可选,默认使用路径最后一部分)
+        "path": "./Qwen2.5-3B-Instruct",  # 模型路径(必需)
 
-    #     # ===== 答案生成的模型加载参数(可选)=====
-    #     "tensor_parallel_size": 1,  # GPU并行数量
-    #     "max_tokens": 1024,  # 最大生成token数
-    #     "gpu_memory_utilization": 0.8,  # GPU显存利用率
-    # },
+        # 大模型可以用不同的参数
+        "vllm_tensor_parallel_size": 1,
+        "vllm_temperature": 0.1,
+        "vllm_top_p": 0.9,
+        "vllm_max_tokens": 2048,
+        "vllm_repetition_penalty": 1.0,
+        "vllm_seed": None,
+        "vllm_gpu_memory_utilization": 0.9,
+    },
     {
         "name": "qwen_7b",
         "path": "./Qwen2.5-7B-Instruct",
 
         # 大模型可以用不同的参数
-        "vllm_tensor_parallel_size": 2,
-        "vllm_temperature" : 0.1,
-        "vllm_top_p" :0.9,
+        "vllm_tensor_parallel_size": 1,
+        "vllm_temperature": 0.1,
+        "vllm_top_p": 0.9,
         "vllm_max_tokens": 2048,
-        "vllm_repetition_penalty":1.0,
-        "vllm_seed":None,
+        "vllm_repetition_penalty": 1.0,
+        "vllm_seed": None,
         "vllm_gpu_memory_utilization": 0.9,
 
         # 可以为每个模型自定义提示词
-        "answer_prompt": """please answer the following question:"""
+        "answer_prompt": """Please answer the following question based on the provided information. Put your answer in \\boxed{{}}. 
+        
+        For example, if the answer is B, output: \\boxed{{B}}"""
     },
-    # 添加更多模型...
-    # {
-    #     "name": "llama_8b",
-    #     "path": "meta-llama/Llama-3-8B-Instruct",
-    #     "tensor_parallel_size": 2
-    # }
 ]
 
-# Data Configuration
+# Bench Configuration
 BENCH_CONFIG = [
     {
-        "name": "math_bench",                    # bench名称
+        "name": "bench_name",                    # bench名称
         "input_file": "./.cache/data/qa.json",   # 数据文件
         "question_key": "input",                 # 问题字段名
         "reference_answer_key": "output",        # 答案字段名
-        "output_dir": "./eval_results/math_bench",  # 输出目录
+        "output_dir": "./eval_results/bench_name",  # 输出目录
+    },
+    {
+        "name": "bench_name_1",
+        "input_file": "./.cache/data/data_qa.json",
+        "question_key": "input",
+        "reference_answer_key": "output",
+        "output_dir": "./eval_results/bench_name_1",
     },
-    # {
-    #     "name": "reasoning_bench",
-    #     "input_file": "./.cache/data/reasoning_qa.jsonl",
-    #     "question_key": "input",
-    #     "reference_answer_key": "output",
-    #     "output_dir": "./eval_results/reasoning_bench",
-    # },
     # {
     #     "name": "code_bench",
     #     "input_file": "./.cache/data/code_qa.json",
     #     "question_key": "problem",
@@ -125,7 +125,6 @@ def build_prompt(self, question, answer, reference_answer):
     #     "reference_answer_key": "solution",
     #     "output_dir": "./eval_results/code_bench",
     # },
-
 ]
 
 # Evaluator Run Configuration (parameters passed to BenchDatasetEvaluator.run)
@@ -164,15 +163,15 @@ def create_judge_serving():
     # Enhanced VLLM configuration
     vllm_config = {
         "hf_model_name_or_path": model_path,
-        "hf_cache_dir" : JUDGE_MODEL_CONFIG.get("hf_cache_dir", None),
-        "hf_local_dir" : JUDGE_MODEL_CONFIG.get("hf_local_dir", None),
+        "hf_cache_dir": JUDGE_MODEL_CONFIG.get("hf_cache_dir"),
+        "hf_local_dir": JUDGE_MODEL_CONFIG.get("hf_local_dir"),
         "vllm_tensor_parallel_size": JUDGE_MODEL_CONFIG.get("vllm_tensor_parallel_size", 1),
-        "vllm_temperature" : JUDGE_MODEL_CONFIG.get("vllm_temperature",0.9),
-        "vllm_top_p" : JUDGE_MODEL_CONFIG.get("vllm_top_p",0.9),
+        "vllm_temperature": JUDGE_MODEL_CONFIG.get("vllm_temperature", 0.9),
+        "vllm_top_p": JUDGE_MODEL_CONFIG.get("vllm_top_p", 0.9),
         "vllm_max_tokens": JUDGE_MODEL_CONFIG.get("max_tokens", 512),
-        "vllm_repetition_penalty" : JUDGE_MODEL_CONFIG.get("vllm_repetition_penalty", 1.0),
-        "vllm_seed" : JUDGE_MODEL_CONFIG.get("vllm_seed",None),
-        "vllm_max_model_len" : JUDGE_MODEL_CONFIG.get("vllm_max_model_len",None),
+        "vllm_repetition_penalty": JUDGE_MODEL_CONFIG.get("vllm_repetition_penalty", 1.0),
+        "vllm_seed": JUDGE_MODEL_CONFIG.get("vllm_seed", None),
+        "vllm_max_model_len": JUDGE_MODEL_CONFIG.get("vllm_max_model_len", None),
         "vllm_gpu_memory_utilization": JUDGE_MODEL_CONFIG.get("gpu_memory_utilization", 0.9)
     }
 
@@ -195,12 +194,12 @@ def create_evaluator(judge_serving, eval_result_path):
     )
 
 
-def create_storage(data_file, cache_path):
+def create_storage(data_file, cache_path, bench_name="eval_result"):
     """创建存储算子"""
     return FileStorage(
         first_entry_file_name=data_file,
         cache_path=cache_path,
-        file_name_prefix="eval",
+        file_name_prefix=bench_name,
         cache_type="json"
     )
 
@@ -214,7 +213,7 @@ def get_evaluator_config():
     return {
         "JUDGE_MODEL_CONFIG": JUDGE_MODEL_CONFIG,
         "TARGET_MODELS": TARGET_MODELS,
-        "DATA_CONFIG": DATA_CONFIG,
+        "BENCH_CONFIG": BENCH_CONFIG,
         "EVALUATOR_RUN_CONFIG": EVALUATOR_RUN_CONFIG,
         "EVAL_CONFIG": EVAL_CONFIG,
         "create_judge_serving": create_judge_serving,
@@ -230,7 +229,6 @@ def get_evaluator_config():
 if __name__ == "__main__":
     # 直接运行时的简单评估
     print("Starting local evaluation...")
-    from dataflow.cli_funcs.cli_eval import run_evaluation
 
     try:
         config = get_evaluator_config()