From 104fd760615901eb8e7affde7ce6bd4a8acd3de5 Mon Sep 17 00:00:00 2001
From: Yalin Feng
Date: Mon, 24 Nov 2025 21:37:15 +0800
Subject: [PATCH 1/2] add batch bench

---
 dataflow/cli_funcs/cli_eval.py               | 314 ++++++++++++------
 dataflow/cli_funcs/eval_pipeline/eval_api.py |  65 +++-
 .../cli_funcs/eval_pipeline/eval_local.py    |  67 +++-
 3 files changed, 313 insertions(+), 133 deletions(-)

diff --git a/dataflow/cli_funcs/cli_eval.py b/dataflow/cli_funcs/cli_eval.py
index f279b10e..a6e2c523 100644
--- a/dataflow/cli_funcs/cli_eval.py
+++ b/dataflow/cli_funcs/cli_eval.py
@@ -8,7 +8,6 @@
 from pathlib import Path
 from typing import List, Dict, Any
 from datetime import datetime
-
 from dataflow import get_logger
 from dataflow.serving import LocalModelLLMServing_vllm
 from dataflow.operators.reasoning import ReasoningAnswerGenerator
@@ -35,7 +34,6 @@ class EvaluationPipeline:
 
     def __init__(self, config: Dict[str, Any]):
         self.config = config
-        # self.cli_args = cli_args
         self.prepared_models = []
         self.generated_files = []
 
@@ -103,10 +101,10 @@ def _prepare_models(self) -> List[Dict]:
                     continue
 
                 model_info = {
-                    **default_config,  # 1. 先设置默认值
-                    **item,  # 2. 用户配置覆盖默认值
-                    "name": item.get("name", Path(item["path"]).name),  # 3. 确保name字段正确
-                    "type": "local"  # 4. 强制设置type
+                    **default_config,
+                    **item,
+                    "name": item.get("name", Path(item["path"]).name),
+                    "type": "local"
                 }
             else:
                 logger.error(f"Invalid model format at index {idx}")
@@ -131,119 +129,180 @@ def _clear_vllm_cache(self):
             logger.warning(f"Failed to clear cache: {e}")
 
     def _generate_answers(self) -> List[Dict]:
-        """生成模型答案"""
+        """生成模型答案 - 每个模型只加载一次"""
         generated_files = []
-        data_config = self.config.get("DATA_CONFIG", {})
-        input_file = data_config.get("input_file", "./.cache/data/qa.json")
-
-        if not Path(input_file).exists():
-            logger.error(f"Input file not found: {input_file}")
+        bench_config_list = self.config.get("BENCH_CONFIG", [])
+
+        if not bench_config_list:
+            logger.error("No BENCH_CONFIG found")
             return []
-
-        self._clear_vllm_cache()
-
+
+        # 外层循环:遍历模型
         for idx, model_info in enumerate(self.prepared_models, 1):
             llm_serving = None
-            answer_generator = None
-            storage = None
-
+
             try:
-                logger.info(f"[{idx}/{len(self.prepared_models)}] Processing: {model_info['name']}")
-
-                cache_dir = model_info.get('cache_dir', './.cache/eval')
-                Path(cache_dir).mkdir(parents=True, exist_ok=True)
-                output_file = f"{cache_dir}/answers_{model_info['name']}.json"
-
+                logger.info(f"[{idx}/{len(self.prepared_models)}] Loading model: {model_info['name']}")
+
+                # 清理缓存(每个模型加载前清理一次)
+                self._clear_vllm_cache()
+
+                # 加载模型
                 llm_serving = LocalModelLLMServing_vllm(
                     hf_model_name_or_path=model_info['path'],
-                    vllm_tensor_parallel_size=model_info.get('tensor_parallel_size', 2),
-                    vllm_max_tokens=model_info.get('max_tokens', 1024),
-                    vllm_gpu_memory_utilization=model_info.get('gpu_memory_utilization', 0.8)
-                )
-
-                # 答案生成器
-                custom_prompt = model_info.get('answer_prompt', DEFAULT_ANSWER_PROMPT)
-                answer_generator = ReasoningAnswerGenerator(
-                    llm_serving=llm_serving,
-                    prompt_template=DiyAnswerGeneratorPrompt(custom_prompt)
+                    vllm_tensor_parallel_size=model_info.get('vllm_tensor_parallel_size', 2),
+                    vllm_temperature=model_info.get('vllm_temperature', 0.7),
+                    vllm_top_p=model_info.get('vllm_top_p', 0.9),
+                    vllm_max_tokens=model_info.get('vllm_max_tokens', 1024),
+                    vllm_repetition_penalty=model_info.get('vllm_repetition_penalty', 1.0),
+                    vllm_seed=model_info.get('vllm_seed', None),
+                    vllm_gpu_memory_utilization=model_info.get('vllm_gpu_memory_utilization', 0.8)
                 )
-
-                # 存储
-                cache_path = f"{cache_dir}/{model_info['name']}_generation"
-                storage = FileStorage(
-                    first_entry_file_name=input_file,
-                    cache_path=cache_path,
-                    file_name_prefix=model_info.get('file_prefix', 'answer_gen'),
-                    cache_type=model_info.get('cache_type', 'json')
-                )
-
-                # 运行生成
-                answer_generator.run(
-                    storage=storage.step(),
-                    input_key=data_config.get("question_key", "input"),
-                    output_key=model_info.get('output_key', 'model_generated_answer')
-                )
-
-                # 保存结果
-                file_prefix = model_info.get('file_prefix', 'answer_gen')
-                cache_type = model_info.get('cache_type', 'json')
-
-                # 查找所有匹配的文件
-                pattern = f"{file_prefix}_step*.{cache_type}"
-                matching_files = sorted(Path(cache_path).glob(pattern))
-
-                if matching_files:
-                    # 使用最新的文件(最后一个step)
-                    gen_file = matching_files[-1]
-                    shutil.copy2(gen_file, output_file)
-                    generated_files.append({
-                        "model_name": model_info['name'],
-                        "model_path": model_info['path'],
-                        "file_path": output_file
-                    })
-                else:
-                    logger.error(f"No generated file found for {model_info['name']} in {cache_path}")
-                    continue
-
+
+                # 内层循环:遍历bench(复用模型)
+                for bench_idx, bench_config in enumerate(bench_config_list, 1):
+                    answer_generator = None
+                    storage = None
+
+                    try:
+                        bench_name = bench_config.get("name", "default")
+                        logger.info(f"  [{bench_idx}/{len(bench_config_list)}] Processing bench: {bench_name}")
+
+                        input_file = bench_config["input_file"]
+                        if not Path(input_file).exists():
+                            logger.error(f"Input file not found: {input_file}")
+                            continue
+
+                        question_key = bench_config.get("question_key", "input")
+                        bench_output_dir = bench_config.get("output_dir", "./eval_results")
+
+                        # 设置缓存和输出目录
+                        cache_dir = model_info.get('cache_dir', './.cache/eval')
+                        Path(cache_dir).mkdir(parents=True, exist_ok=True)
+                        Path(bench_output_dir).mkdir(parents=True, exist_ok=True)
+
+                        output_file = f"{bench_output_dir}/{bench_name}_answers_{model_info['name']}.json"
+
+                        # 答案生成器(复用llm_serving)
+                        custom_prompt = model_info.get('answer_prompt', DEFAULT_ANSWER_PROMPT)
+                        answer_generator = ReasoningAnswerGenerator(
+                            llm_serving=llm_serving,
+                            prompt_template=DiyAnswerGeneratorPrompt(custom_prompt)
+                        )
+
+                        # 存储
+                        cache_path = f"{cache_dir}/{bench_name}_{model_info['name']}_generation"
+                        storage = FileStorage(
+                            first_entry_file_name=input_file,
+                            cache_path=cache_path,
+                            file_name_prefix=model_info.get('file_prefix', 'answer_gen'),
+                            cache_type=model_info.get('cache_type', 'json')
+                        )
+
+                        # 运行生成
+                        answer_generator.run(
+                            storage=storage.step(),
+                            input_key=question_key,
+                            output_key=model_info.get('output_key', 'model_generated_answer')
+                        )
+
+                        # 保存结果
+                        file_prefix = model_info.get('file_prefix', 'answer_gen')
+                        cache_type = model_info.get('cache_type', 'json')
+                        pattern = f"{file_prefix}_step*.{cache_type}"
+                        matching_files = sorted(Path(cache_path).glob(pattern))
+
+                        if matching_files:
+                            gen_file = matching_files[-1]
+                            shutil.copy2(gen_file, output_file)
+                            generated_files.append({
+                                "model_name": model_info['name'],
+                                "model_path": model_info['path'],
+                                "file_path": output_file,
+                                "bench_name": bench_name
+                            })
+                            logger.success(f"  ✓ Generated answers for {bench_name}")
+                        else:
+                            logger.error(f"No generated file found in {cache_path}")
+                            continue
+
+                    except Exception as e:
+                        logger.error(f"Failed to process bench {bench_name}: {e}")
+                        import traceback
+                        traceback.print_exc()
+                        continue
+
+                    finally:
+                        # 清理bench级别的资源
+                        if answer_generator is not None:
+                            del answer_generator
+                        if storage is not None:
+                            del storage
+                        gc.collect()
+
             except Exception as e:
-                logger.error(f"Failed to process {model_info['name']}: {e}")
+                logger.error(f"Failed to load model {model_info['name']}: {e}")
logger.error(f"Failed to load model {model_info['name']}: {e}") + import traceback + traceback.print_exc() continue - + finally: - if answer_generator is not None: - del answer_generator - if storage is not None: - del storage + # 清理模型级别的资源 if llm_serving is not None: + logger.info(f"Unloading model: {model_info['name']}") del llm_serving gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.synchronize() - + return generated_files def _run_evaluation(self) -> List[Dict]: """运行评估""" try: + logger.info("Loading judge model...") judge_serving = self.config["create_judge_serving"]() + logger.info("✓ Judge model loaded") except Exception as e: logger.error(f"Failed to create judge: {e}") return [] results = [] eval_config = self.config.get("EVALUATOR_RUN_CONFIG", {}) - - for file_info in self.generated_files: + + total_evals = len(self.generated_files) + + for eval_idx, file_info in enumerate(self.generated_files, 1): try: + bench_name = file_info.get('bench_name', 'unknown') + model_name = file_info['model_name'] + + logger.info(f"\n[Eval {eval_idx}/{total_evals}] {model_name} × {bench_name}") + + # 找到对应的bench配置 + bench_config = None + for bc in self.config.get("BENCH_CONFIG", []): + if bc.get("name") == bench_name: + bench_config = bc + break + + if not bench_config: + logger.warning(f" ⚠️ No bench config found") + continue + + eval_output_dir = bench_config.get("eval_output_dir", "./eval_results") + + # 执行评估 + logger.info(f" 📊 Evaluating...") timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - result_file = f"./eval_results/{timestamp}_{file_info['model_name']}/result.json" + result_file = f"{eval_output_dir}/{timestamp}_{model_name}/result.json" Path(result_file).parent.mkdir(parents=True, exist_ok=True) storage = self.config["create_storage"]( file_info["file_path"], - f"./.cache/eval/{file_info['model_name']}" + f"./.cache/eval/{model_name}", + bench_name ) evaluator = self.config["create_evaluator"](judge_serving, result_file) @@ -258,40 +317,95 @@ def _run_evaluation(self) -> List[Dict]: with open(result_file, 'r') as f: data = json.load(f) if data: - data[0]["model_name"] = file_info['model_name'] - results.append(data[0]) + result_data = data[0].copy() + result_data["model_name"] = model_name + result_data["bench_name"] = bench_name + results.append(result_data) + logger.info(f" ✓ Accuracy: {result_data.get('accuracy', 0):.3f}") except Exception as e: - logger.error(f"Eval failed for {file_info['model_name']}: {e}") + logger.error(f" ✗ Evaluation failed: {e}") + import traceback + traceback.print_exc() continue + logger.info(f"\n✓ Evaluation complete: {len(results)} results") return results def _generate_report(self, results: List[Dict]): - """生成报告""" + """生成报告 - 支持批量bench独立输出""" if not results: logger.warning("No results") return - sorted_results = sorted(results, key=lambda x: x.get("accuracy", 0), reverse=True) + # 打印报告 + print("\n" + "="*80) + print("EVALUATION RESULTS - ALL BENCHES & MODELS") + print("="*80) - print("\n" + "=" * 60) - print("Model Evaluation Results") - print("=" * 60) + # 按准确率排序 + sorted_results = sorted(results, key=lambda x: x.get("accuracy", 0), reverse=True) + for i, r in enumerate(sorted_results, 1): - print(f"{i}. {r['model_name']}") + print(f"{i}. 
[{r.get('bench_name', 'unknown')}] {r['model_name']}") print(f" Accuracy: {r.get('accuracy', 0):.3f}") print(f" Total: {r.get('total_samples', 0)}") print(f" Matched: {r.get('matched_samples', 0)}") print() - print("=" * 60) - - # 保存详细报告 - report_file = "./eval_results/report.json" - Path(report_file).parent.mkdir(parents=True, exist_ok=True) - with open(report_file, 'w') as f: - json.dump({"results": sorted_results}, f, indent=2) - print(f"Detailed report: {report_file}") + + # 按bench分组保存结果 + bench_config_list = self.config.get("BENCH_CONFIG", []) + + # 为每个bench单独保存结果 + bench_groups = {} + for result in sorted_results: + bench_name = result.get('bench_name', 'unknown') + if bench_name not in bench_groups: + bench_groups[bench_name] = [] + bench_groups[bench_name].append(result) + + # 保存每个bench的结果到各自的output_dir + for bench_name, bench_results in bench_groups.items(): + # 找到对应bench的配置 + bench_output_dir = "./eval_results" # 默认值 + for bench_config in bench_config_list: + if bench_config.get("name") == bench_name: + bench_output_dir = bench_config.get("output_dir", "./eval_results") + break + + # 保存该bench的结果 + report_file = f"{bench_output_dir}/results.json" + Path(report_file).parent.mkdir(parents=True, exist_ok=True) + + report_data = { + "bench_name": bench_name, + "timestamp": datetime.now().isoformat(), + "total_evaluations": len(bench_results), + "results": bench_results + } + + with open(report_file, 'w', encoding='utf-8') as f: + json.dump(report_data, f, ensure_ascii=False, indent=2) + + print(f"Bench '{bench_name}' results saved to: {report_file}") + + # 另外保存一个汇总文件(包含所有bench) + all_results_file = "./eval_results/all_results.json" + Path(all_results_file).parent.mkdir(parents=True, exist_ok=True) + + all_report_data = { + "timestamp": datetime.now().isoformat(), + "total_evaluations": len(sorted_results), + "total_benches": len(bench_groups), + "results": sorted_results + } + + with open(all_results_file, 'w', encoding='utf-8') as f: + json.dump(all_report_data, f, ensure_ascii=False, indent=2) + + print("="*80) + print(f"All results summary saved to: {all_results_file}") + print("="*80) class DataFlowEvalCLI: diff --git a/dataflow/cli_funcs/eval_pipeline/eval_api.py b/dataflow/cli_funcs/eval_pipeline/eval_api.py index a3c0acce..95312581 100644 --- a/dataflow/cli_funcs/eval_pipeline/eval_api.py +++ b/dataflow/cli_funcs/eval_pipeline/eval_api.py @@ -5,7 +5,7 @@ from dataflow.operators.core_text import BenchDatasetEvaluatorQuestion from dataflow.serving import APILLMServing_request from dataflow.utils.storage import FileStorage - +from dataflow.cli_funcs.cli_eval import run_evaluation # ============================================================================= # Fair Evaluation Prompt Template @@ -57,7 +57,7 @@ def build_prompt(self, question, answer, reference_answer): # 评估模型设置 JUDGE_MODEL_CONFIG = { "model_name": "gpt-4o-mini", - "api_url": "", # 请求URL 必填 / request (required) + "api_url": "http://123.129.219.111:3000/v1/chat/completions", # 请求URL 必填 / request (required) "api_key_env": "DF_API_KEY", # api_key 必填 / api_key (required) "max_workers": 3, "max_retries": 5, @@ -77,12 +77,35 @@ def build_prompt(self, question, answer, reference_answer): # }, { "name": "qwen_7b", - "path": "./Qwen2.5-7B-Instruct", + "path": "/mnt/DataFlow/fyl/workspace/Qwen2.5-7B-Instruct", + # 大模型可以用不同的参数 + "vllm_tensor_parallel_size": 2, + "vllm_temperature" : 0.1, + "vllm_top_p" :0.9, + "vllm_max_tokens": 2048, + "vllm_repetition_penalty":1.0, + "vllm_seed":None, + "vllm_gpu_memory_utilization": 0.9, + 
+        # 默认被评估模型提示词
+        # 再次提示:该prompt为被评估模型的提示词,请勿与评估模型提示词混淆!!!
+        # You can customize prompts for each model. If not specified, defaults to the template in build_prompt function.
+        # Default prompt for evaluated models
+        # IMPORTANT: This is the prompt for models being evaluated, NOT for the judge model!!!
+        "answer_prompt": """please answer the following question:"""  # 这里不要使用{question} / do not code {question} here
+    },
+    {
+        "name": "qwen_3b",
+        "path": "/mnt/DataFlow/fyl/workspace/Qwen2.5-3B-Instruct",
         # 大模型可以用不同的参数
-        "tensor_parallel_size": 2,
-        "max_tokens": 2048,
-        "gpu_memory_utilization": 0.9,
+        "vllm_tensor_parallel_size": 2,
+        "vllm_temperature" : 0.1,
+        "vllm_top_p" :0.9,
+        "vllm_max_tokens": 2048,
+        "vllm_repetition_penalty":1.0,
+        "vllm_seed":None,
+        "vllm_gpu_memory_utilization": 0.9,
 
         # 可以为每个模型自定义提示词 不写就为默认模板 即build_prompt函数中的prompt
         # 默认被评估模型提示词
@@ -102,13 +125,22 @@ def build_prompt(self, question, answer, reference_answer):
 ]
 
 # Data Configuration
-DATA_CONFIG = {
-    "input_file": "./.cache/data/qa.json",  # 输入数据文件
-    "output_dir": "./eval_results",  # 输出目录
-    "question_key": "input",  # 原始数据中的问题字段
-    "reference_answer_key": "output"  # 原始数据中的参考答案字段
-}
-
+BENCH_CONFIG = [
+    {
+        "name":"fuck_data",
+        "input_file": "/mnt/DataFlow/fyl/workspace/.cache/data/qa.json",  # 输入数据文件
+        "output_dir": "./api_eval",  # 输出目录
+        "question_key": "input",  # 原始数据中的问题字段
+        "reference_answer_key": "output"  # 原始数据中的参考答案字段
+    },
+    {
+        "name":"fuck_fuck_data",
+        "input_file": "/mnt/DataFlow/fyl/workspace/nt.json",  # 输入数据文件
+        "output_dir": "./api_eval",  # 输出目录
+        "question_key": "input",  # 原始数据中的问题字段
+        "reference_answer_key": "output"  # 原始数据中的参考答案字段
+    }
+]
 # Evaluator Run Configuration (parameters passed to BenchDatasetEvaluator.run)
 EVALUATOR_RUN_CONFIG = {
     "input_test_answer_key": "model_generated_answer",  # 模型生成的答案字段名
@@ -157,12 +189,12 @@ def create_evaluator(judge_serving, eval_result_path):
     )
 
 
-def create_storage(data_file, cache_path):
+def create_storage(data_file, cache_path,bench_name="eval_result"):
     """创建存储算子"""
     return FileStorage(
         first_entry_file_name=data_file,
         cache_path=cache_path,
-        file_name_prefix="eval_result",
+        file_name_prefix=bench_name,
         cache_type="json"
     )
 
@@ -177,7 +209,7 @@ def get_evaluator_config():
     return {
         "JUDGE_MODEL_CONFIG": JUDGE_MODEL_CONFIG,  # 评估模型设置映射
         "TARGET_MODELS": TARGET_MODELS,  # 被评估模型设置映射
-        "DATA_CONFIG": DATA_CONFIG,  # 数据设置映射
+        "BENCH_CONFIG": BENCH_CONFIG,  # 数据设置映射
         "EVAL_CONFIG": EVAL_CONFIG,  # 评估模式设置映射
         "EVALUATOR_RUN_CONFIG": EVALUATOR_RUN_CONFIG,  # 评估数据集字段映射
         "create_judge_serving": create_judge_serving,
@@ -195,7 +227,6 @@ def get_evaluator_config():
     # 直接运行时的简单评估
     # Simple evaluation when run directly
     print("Starting API evaluation...")
-    from dataflow.cli_funcs.cli_eval import run_evaluation
 
     try:
         config = get_evaluator_config()
diff --git a/dataflow/cli_funcs/eval_pipeline/eval_local.py b/dataflow/cli_funcs/eval_pipeline/eval_local.py
index 4053685f..29de3393 100644
--- a/dataflow/cli_funcs/eval_pipeline/eval_local.py
+++ b/dataflow/cli_funcs/eval_pipeline/eval_local.py
@@ -54,9 +54,16 @@ def build_prompt(self, question, answer, reference_answer):
 # Judge Model Configuration (local strong model as judge)
 JUDGE_MODEL_CONFIG = {
     "model_path": "./Qwen2.5-7B-Instruct",  # 用更强的模型做裁判
-    "tensor_parallel_size": 1,
-    "max_tokens": 512,
-    "gpu_memory_utilization": 0.8,
+    "hf_cache_dir" : "",
+    "hf_local_dir" : "",
+    "vllm_tensor_parallel_size": 2,
+    "vllm_temperature" : 0.9,
+    "vllm_top_p" : 0.9,
+    "vllm_max_tokens": 512,
+    "vllm_repetition_penalty" : 1.0,
+    "vllm_seed" : None,
+    "vllm_max_model_len" : None,
+    "vllm_gpu_memory_utilization" : 0.9
 }
 
 # Target Models Configuration (字典格式 - 必需)
@@ -73,11 +80,14 @@ def build_prompt(self, question, answer, reference_answer):
     {
         "name": "qwen_7b",
         "path": "./Qwen2.5-7B-Instruct",
 
         # 大模型可以用不同的参数
-        "tensor_parallel_size": 2,
-        "max_tokens": 2048,
-        "gpu_memory_utilization": 0.9,
+        "vllm_tensor_parallel_size": 2,
+        "vllm_temperature" : 0.1,
+        "vllm_top_p" :0.9,
+        "vllm_max_tokens": 2048,
+        "vllm_repetition_penalty":1.0,
+        "vllm_seed":None,
+        "vllm_gpu_memory_utilization": 0.9,
 
         # 可以为每个模型自定义提示词
         "answer_prompt": """please answer the following question:"""
@@ -93,12 +103,30 @@ def build_prompt(self, question, answer, reference_answer):
 ]
 
 # Data Configuration
-DATA_CONFIG = {
-    "input_file": "/data1/fyl/workspace/.cache/data/qa.json",  # 输入数据文件
-    "output_dir": "./eval_results",  # 输出目录
-    "question_key": "input",  # 原始数据中的问题字段
-    "reference_answer_key": "output"  # 原始数据中的参考答案字段
-}
+BENCH_CONFIG = [
+    {
+        "name": "math_bench",                    # bench名称
+        "input_file": "./.cache/data/qa.json",   # 数据文件
+        "question_key": "input",                 # 问题字段名
+        "reference_answer_key": "output",        # 答案字段名
+        "output_dir": "./eval_results/math_bench",  # 输出目录
+    },
+    # {
+    #     "name": "reasoning_bench",
+    #     "input_file": "./.cache/data/reasoning_qa.jsonl",
+    #     "question_key": "input",
+    #     "reference_answer_key": "output",
+    #     "output_dir": "./eval_results/reasoning_bench",
+    # },
+    # {
+    #     "name": "code_bench",
+    #     "input_file": "./.cache/data/code_qa.json",
+    #     "question_key": "problem",
+    #     "reference_answer_key": "solution",
+    #     "output_dir": "./eval_results/code_bench",
+    # },
+
+]
 
 # Evaluator Run Configuration (parameters passed to BenchDatasetEvaluator.run)
 EVALUATOR_RUN_CONFIG = {
@@ -136,9 +164,16 @@ def create_judge_serving():
     # Enhanced VLLM configuration
     vllm_config = {
         "hf_model_name_or_path": model_path,
-        "vllm_tensor_parallel_size": JUDGE_MODEL_CONFIG.get("tensor_parallel_size", 1),
+        "hf_cache_dir" : JUDGE_MODEL_CONFIG.get("hf_cache_dir", None),
+        "hf_local_dir" : JUDGE_MODEL_CONFIG.get("hf_local_dir", None),
+        "vllm_tensor_parallel_size": JUDGE_MODEL_CONFIG.get("vllm_tensor_parallel_size", 1),
+        "vllm_temperature" : JUDGE_MODEL_CONFIG.get("vllm_temperature",0.9),
+        "vllm_top_p" : JUDGE_MODEL_CONFIG.get("vllm_top_p",0.9),
         "vllm_max_tokens": JUDGE_MODEL_CONFIG.get("max_tokens", 512),
-        "vllm_gpu_memory_utilization": JUDGE_MODEL_CONFIG.get("gpu_memory_utilization", 0.8)
+        "vllm_repetition_penalty" : JUDGE_MODEL_CONFIG.get("vllm_repetition_penalty", 1.0),
+        "vllm_seed" : JUDGE_MODEL_CONFIG.get("vllm_seed",None),
+        "vllm_max_model_len" : JUDGE_MODEL_CONFIG.get("vllm_max_model_len",None),
+        "vllm_gpu_memory_utilization": JUDGE_MODEL_CONFIG.get("gpu_memory_utilization", 0.9)
     }
 
     # Add optional VLLM parameters if they exist
@@ -165,7 +200,7 @@ def create_storage(data_file, cache_path):
     return FileStorage(
         first_entry_file_name=data_file,
         cache_path=cache_path,
-        file_name_prefix="eval_result",
+        file_name_prefix="eval",
         cache_type="json"
     )
 

From aba86e31169e53b5261061b5b61019a642cbb8a6 Mon Sep 17 00:00:00 2001
From: Yalin Feng
Date: Sat, 29 Nov 2025 03:36:52 +0800
Subject: [PATCH 2/2] evalpipeline

---
 dataflow/cli_funcs/eval_pipeline/eval_api.py  |  18 ++--
 .../cli_funcs/eval_pipeline/eval_local.py     | 102 +++++++++---------
 2 files changed, 59 insertions(+), 61 deletions(-)

diff --git a/dataflow/cli_funcs/eval_pipeline/eval_api.py b/dataflow/cli_funcs/eval_pipeline/eval_api.py
index 95312581..c3cb1bb0 100644
--- a/dataflow/cli_funcs/eval_pipeline/eval_api.py
+++ b/dataflow/cli_funcs/eval_pipeline/eval_api.py
@@ -57,7 +57,7 @@ def build_prompt(self, question, answer, reference_answer):
 # 评估模型设置
 JUDGE_MODEL_CONFIG = {
     "model_name": "gpt-4o-mini",
-    "api_url": "http://123.129.219.111:3000/v1/chat/completions",  # 请求URL 必填 / request (required)
+    "api_url": "API_URL",  # 请求URL 必填 / request (required)
     "api_key_env": "DF_API_KEY",  # api_key 必填 / api_key (required)
     "max_workers": 3,
     "max_retries": 5,
@@ -77,9 +77,9 @@ def build_prompt(self, question, answer, reference_answer):
 #     },
     {
         "name": "qwen_7b",
-        "path": "/mnt/DataFlow/fyl/workspace/Qwen2.5-7B-Instruct",
+        "path": "./Qwen2.5-7B-Instruct",
         # 大模型可以用不同的参数
-        "vllm_tensor_parallel_size": 2,
+        "vllm_tensor_parallel_size": 1,
         "vllm_temperature" : 0.1,
         "vllm_top_p" :0.9,
         "vllm_max_tokens": 2048,
@@ -97,9 +97,9 @@ def build_prompt(self, question, answer, reference_answer):
     },
     {
         "name": "qwen_3b",
-        "path": "/mnt/DataFlow/fyl/workspace/Qwen2.5-3B-Instruct",
+        "path": "./Qwen2.5-3B-Instruct",
         # 大模型可以用不同的参数
-        "vllm_tensor_parallel_size": 2,
+        "vllm_tensor_parallel_size": 1,
         "vllm_temperature" : 0.1,
         "vllm_top_p" :0.9,
         "vllm_max_tokens": 2048,
@@ -127,15 +127,15 @@ def build_prompt(self, question, answer, reference_answer):
 # Data Configuration
 BENCH_CONFIG = [
     {
-        "name":"fuck_data",
-        "input_file": "/mnt/DataFlow/fyl/workspace/.cache/data/qa.json",  # 输入数据文件
+        "name":"bench_name",
+        "input_file": "./.cache/data/qa.json",  # 输入数据文件
         "output_dir": "./api_eval",  # 输出目录
         "question_key": "input",  # 原始数据中的问题字段
         "reference_answer_key": "output"  # 原始数据中的参考答案字段
     },
     {
-        "name":"fuck_fuck_data",
-        "input_file": "/mnt/DataFlow/fyl/workspace/nt.json",  # 输入数据文件
+        "name":"bench_name_1",
+        "input_file": "./.cache/data/data_qa.json",  # 输入数据文件
         "output_dir": "./api_eval",  # 输出目录
         "question_key": "input",  # 原始数据中的问题字段
         "reference_answer_key": "output"  # 原始数据中的参考答案字段
diff --git a/dataflow/cli_funcs/eval_pipeline/eval_local.py b/dataflow/cli_funcs/eval_pipeline/eval_local.py
index 29de3393..0cacf850 100644
--- a/dataflow/cli_funcs/eval_pipeline/eval_local.py
+++ b/dataflow/cli_funcs/eval_pipeline/eval_local.py
@@ -5,6 +5,7 @@
 from dataflow.serving import LocalModelLLMServing_vllm
 from dataflow.utils.storage import FileStorage
 from dataflow.operators.core_text import BenchDatasetEvaluatorQuestion
+from dataflow.cli_funcs.cli_eval import run_evaluation
 
 # =============================================================================
 # Fair Evaluation Prompt Template
@@ -15,7 +16,6 @@ class FairAnswerJudgePrompt:
     def build_prompt(self, question, answer, reference_answer):
         prompt = f"""You are an expert evaluator assessing answer quality for academic questions.
-
 **Question:** {question}
 
@@ -54,70 +54,70 @@ def build_prompt(self, question, answer, reference_answer):
 # Judge Model Configuration (local strong model as judge)
 JUDGE_MODEL_CONFIG = {
     "model_path": "./Qwen2.5-7B-Instruct",  # 用更强的模型做裁判
-    "hf_cache_dir" : "",
-    "hf_local_dir" : "",
-    "vllm_tensor_parallel_size": 2,
-    "vllm_temperature" : 0.9,
-    "vllm_top_p" : 0.9,
+    "hf_cache_dir": "",
+    "hf_local_dir": "",
+    "vllm_tensor_parallel_size": 1,
+    "vllm_temperature": 0.9,
+    "vllm_top_p": 0.9,
     "vllm_max_tokens": 512,
-    "vllm_repetition_penalty" : 1.0,
-    "vllm_seed" : None,
-    "vllm_max_model_len" : None,
-    "vllm_gpu_memory_utilization" : 0.9
+    "vllm_repetition_penalty": 0.8,
+    "vllm_seed": None,
+    "vllm_max_model_len": None,
+    "vllm_gpu_memory_utilization": 0.9
 }
 
 # Target Models Configuration (字典格式 - 必需)
 TARGET_MODELS = [
-    # {
-    #     "name": "qwen_3b",  # 模型名称(可选,默认使用路径最后一部分)
-    #     "path": "./Qwen2.5-3B-Instruct",  # 模型路径(必需)
+    {
+        "name": "qwen_3b",  # 模型名称(可选,默认使用路径最后一部分)
+        "path": "./Qwen2.5-3B-Instruct",  # 模型路径(必需)
 
-    #     # ===== 答案生成的模型加载参数(可选)=====
-    #     "tensor_parallel_size": 1,  # GPU并行数量
-    #     "max_tokens": 1024,  # 最大生成token数
-    #     "gpu_memory_utilization": 0.8,  # GPU显存利用率
-    # },
+        # 大模型可以用不同的参数
+        "vllm_tensor_parallel_size": 1,
+        "vllm_temperature": 0.1,
+        "vllm_top_p": 0.9,
+        "vllm_max_tokens": 2048,
+        "vllm_repetition_penalty": 1.0,
+        "vllm_seed": None,
+        "vllm_gpu_memory_utilization": 0.9,
+    },
     {
         "name": "qwen_7b",
         "path": "./Qwen2.5-7B-Instruct",
 
         # 大模型可以用不同的参数
-        "vllm_tensor_parallel_size": 2,
-        "vllm_temperature" : 0.1,
-        "vllm_top_p" :0.9,
+        "vllm_tensor_parallel_size": 1,
+        "vllm_temperature": 0.1,
+        "vllm_top_p": 0.9,
         "vllm_max_tokens": 2048,
-        "vllm_repetition_penalty":1.0,
-        "vllm_seed":None,
+        "vllm_repetition_penalty": 1.0,
+        "vllm_seed": None,
         "vllm_gpu_memory_utilization": 0.9,
 
         # 可以为每个模型自定义提示词
-        "answer_prompt": """please answer the following question:"""
+        "answer_prompt": """Please answer the following question based on the provided information. Put your answer in \\boxed{{}}. 
+        
+        For example, if the answer is B, output: \\boxed{{B}}"""
     },
-    # 添加更多模型...
-    # {
-    #     "name": "llama_8b",
-    #     "path": "meta-llama/Llama-3-8B-Instruct",
-    #     "tensor_parallel_size": 2
-    # }
 ]
 
-# Data Configuration
+# Bench Configuration
 BENCH_CONFIG = [
     {
-        "name": "math_bench",                    # bench名称
+        "name": "bench_name",                    # bench名称
         "input_file": "./.cache/data/qa.json",   # 数据文件
         "question_key": "input",                 # 问题字段名
         "reference_answer_key": "output",        # 答案字段名
-        "output_dir": "./eval_results/math_bench",  # 输出目录
+        "output_dir": "./eval_results/bench_name",  # 输出目录
+    },
+    {
+        "name": "bench_name_1",
+        "input_file": "./.cache/data/data_qa.json",
+        "question_key": "input",
+        "reference_answer_key": "output",
+        "output_dir": "./eval_results/bench_name_1",
     },
-    # {
-    #     "name": "reasoning_bench",
-    #     "input_file": "./.cache/data/reasoning_qa.jsonl",
-    #     "question_key": "input",
-    #     "reference_answer_key": "output",
-    #     "output_dir": "./eval_results/reasoning_bench",
-    # },
     # {
     #     "name": "code_bench",
     #     "input_file": "./.cache/data/code_qa.json",
     #     "question_key": "problem",
@@ -125,7 +125,6 @@ def build_prompt(self, question, answer, reference_answer):
     #     "reference_answer_key": "solution",
     #     "output_dir": "./eval_results/code_bench",
     # },
-
 ]
 
 # Evaluator Run Configuration (parameters passed to BenchDatasetEvaluator.run)
@@ -164,15 +163,15 @@ def create_judge_serving():
     # Enhanced VLLM configuration
     vllm_config = {
         "hf_model_name_or_path": model_path,
-        "hf_cache_dir" : JUDGE_MODEL_CONFIG.get("hf_cache_dir", None),
-        "hf_local_dir" : JUDGE_MODEL_CONFIG.get("hf_local_dir", None),
+        "hf_cache_dir": JUDGE_MODEL_CONFIG.get("hf_cache_dir"),
+        "hf_local_dir": JUDGE_MODEL_CONFIG.get("hf_local_dir"),
         "vllm_tensor_parallel_size": JUDGE_MODEL_CONFIG.get("vllm_tensor_parallel_size", 1),
-        "vllm_temperature" : JUDGE_MODEL_CONFIG.get("vllm_temperature",0.9),
-        "vllm_top_p" : JUDGE_MODEL_CONFIG.get("vllm_top_p",0.9),
+        "vllm_temperature": JUDGE_MODEL_CONFIG.get("vllm_temperature", 0.9),
+        "vllm_top_p": JUDGE_MODEL_CONFIG.get("vllm_top_p", 0.9),
         "vllm_max_tokens": JUDGE_MODEL_CONFIG.get("max_tokens", 512),
-        "vllm_repetition_penalty" : JUDGE_MODEL_CONFIG.get("vllm_repetition_penalty", 1.0),
-        "vllm_seed" : JUDGE_MODEL_CONFIG.get("vllm_seed",None),
-        "vllm_max_model_len" : JUDGE_MODEL_CONFIG.get("vllm_max_model_len",None),
+        "vllm_repetition_penalty": JUDGE_MODEL_CONFIG.get("vllm_repetition_penalty", 1.0),
+        "vllm_seed": JUDGE_MODEL_CONFIG.get("vllm_seed", None),
+        "vllm_max_model_len": JUDGE_MODEL_CONFIG.get("vllm_max_model_len", None),
         "vllm_gpu_memory_utilization": JUDGE_MODEL_CONFIG.get("gpu_memory_utilization", 0.9)
     }
 
@@ -195,12 +194,12 @@ def create_evaluator(judge_serving, eval_result_path):
     )
 
 
-def create_storage(data_file, cache_path):
+def create_storage(data_file, cache_path, bench_name="eval_result"):
     """创建存储算子"""
     return FileStorage(
         first_entry_file_name=data_file,
         cache_path=cache_path,
-        file_name_prefix="eval",
+        file_name_prefix=bench_name,
         cache_type="json"
     )
 
@@ -214,7 +213,7 @@ def get_evaluator_config():
     return {
         "JUDGE_MODEL_CONFIG": JUDGE_MODEL_CONFIG,
         "TARGET_MODELS": TARGET_MODELS,
-        "DATA_CONFIG": DATA_CONFIG,
+        "BENCH_CONFIG": BENCH_CONFIG,
         "EVALUATOR_RUN_CONFIG": EVALUATOR_RUN_CONFIG,
         "EVAL_CONFIG": EVAL_CONFIG,
         "create_judge_serving": create_judge_serving,
@@ -230,7 +229,6 @@ def get_evaluator_config():
 if __name__ == "__main__":
     # 直接运行时的简单评估
     print("Starting local evaluation...")
-    from dataflow.cli_funcs.cli_eval import run_evaluation
 
     try:
         config = get_evaluator_config()