From b40f2a3df9279d54cf2528bd17a0181390f8b145 Mon Sep 17 00:00:00 2001 From: sunjqa1 Date: Thu, 11 Dec 2025 14:49:01 +0800 Subject: [PATCH 1/6] feat(inference): Add InfiniLM inference benchmark framework --- infinimetrics/inference/.gitkeep | 0 infinimetrics/inference/__init__.py | 37 ++ infinimetrics/inference/adapter_base.py | 272 ++++++++ .../inference/adapters/infinilm_adapter.py | 613 ++++++++++++++++++ infinimetrics/inference/create_test_trace.py | 30 + .../inference/direct_infer_runner.py | 326 ++++++++++ infinimetrics/inference/infer_config.py | 517 +++++++++++++++ infinimetrics/inference/infer_main.py | 262 ++++++++ infinimetrics/inference/infer_runner_base.py | 582 +++++++++++++++++ .../inference/infer_runner_factory.py | 146 +++++ .../inference/service_infer_runner.py | 438 +++++++++++++ infinimetrics/inference/utils/gpu_monitor.py | 158 +++++ .../inference/utils/prompt_generator.py | 543 ++++++++++++++++ .../inference/utils/token_generator.py | 235 +++++++ infinimetrics/inference/utils/trace_client.py | 527 +++++++++++++++ 15 files changed, 4686 insertions(+) delete mode 100644 infinimetrics/inference/.gitkeep create mode 100644 infinimetrics/inference/__init__.py create mode 100644 infinimetrics/inference/adapter_base.py create mode 100644 infinimetrics/inference/adapters/infinilm_adapter.py create mode 100644 infinimetrics/inference/create_test_trace.py create mode 100644 infinimetrics/inference/direct_infer_runner.py create mode 100644 infinimetrics/inference/infer_config.py create mode 100644 infinimetrics/inference/infer_main.py create mode 100644 infinimetrics/inference/infer_runner_base.py create mode 100644 infinimetrics/inference/infer_runner_factory.py create mode 100644 infinimetrics/inference/service_infer_runner.py create mode 100644 infinimetrics/inference/utils/gpu_monitor.py create mode 100644 infinimetrics/inference/utils/prompt_generator.py create mode 100644 infinimetrics/inference/utils/token_generator.py create mode 100644 infinimetrics/inference/utils/trace_client.py diff --git a/infinimetrics/inference/.gitkeep b/infinimetrics/inference/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/infinimetrics/inference/__init__.py b/infinimetrics/inference/__init__.py new file mode 100644 index 0000000..6d1812f --- /dev/null +++ b/infinimetrics/inference/__init__.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +""" +infinimetrics inference package +Unified Reasoning Evaluation Framework +""" + +__version__ = "1.0.0" +__author__ = "Infinimetrics Team" + +from .infer_config import ( + InferConfig, InferConfigManager, + InferMode, FrameworkType, + DirectInferArgs, ServiceInferArgs +) +from .infer_runner_base import ( + InferRunnerBase, BenchmarkResult, + Metric, ScalarMetric, TimeseriesMetric +) +from .adapter_base import InferAdapter +from .infer_runner_factory import InferRunnerFactory + +__all__ = [ + # config + "InferConfig", "InferConfigManager", + "InferMode", "FrameworkType", + "DirectInferArgs", "ServiceInferArgs", + + # Runner + "InferRunnerBase", "BenchmarkResult", + "Metric", "ScalarMetric", "TimeseriesMetric", + + # adapter + "InferAdapter", + + # factory + "InferRunnerFactory", +] diff --git a/infinimetrics/inference/adapter_base.py b/infinimetrics/inference/adapter_base.py new file mode 100644 index 0000000..ee96549 --- /dev/null +++ b/infinimetrics/inference/adapter_base.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +""" +Inference Adapter Base Class +Defines a unified interface for different frameworks (InfiniLM/vLLM) +""" + +import abc 
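+# Contract note: generate() is expected to return three parallel lists of equal
+# length -- the generated texts, per-prompt end-to-end latencies in milliseconds,
+# and per-prompt TTFT values in milliseconds -- and concrete load_model()
+# implementations are expected to set self.tokenizer and self.model_loaded before
+# generate() is used.  Illustrative call site (SomeAdapter is a hypothetical
+# concrete subclass, not part of this patch):
+#
+#     adapter = SomeAdapter(config)
+#     adapter.load_model()
+#     texts, latencies_ms, ttfts_ms = adapter.generate(["hello"], max_tokens=16)
+#     adapter.unload_model()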
+import logging +import random +from typing import List, Tuple, Optional, Dict, Any, Set +from typing import Set + +from infer_config import InferConfig + +logger = logging.getLogger(__name__) + +class InferAdapter(abc.ABC): + """Base class for inference adapters""" + + def __init__(self, config: InferConfig): + self.config = config + self.model_loaded = False + self.service_started = False + self.tokenizer = None + + @abc.abstractmethod + def load_model(self) -> None: + """ + Load model + Subclasses must implement the actual model loading logic + """ + pass + + @abc.abstractmethod + def unload_model(self) -> None: + """ + Unload model + Subclasses must implement model unloading and resource cleanup + """ + pass + + @abc.abstractmethod + def generate( + self, + prompts: List[str], + max_tokens: int, + temperature: float = 0.7, + top_p: float = 0.9, + top_k: int = 50 + ) -> Tuple[List[str], List[float], List[float]]: + """ + Generate text + Returns: (list of generated texts, list of latencies (ms), list of TTFT values (ms)) + """ + pass + + @abc.abstractmethod + def batch_generate( + self, + batch_prompts: List[List[str]], + max_tokens: int, + temperature: float = 0.7, + top_p: float = 0.9, + top_k: int = 50 + ) -> Tuple[List[List[str]], List[List[float]], List[List[float]]]: + """ + Batch text generation + Returns: (batch of generated texts, batch of latencies, batch of TTFT values) + """ + pass + + @abc.abstractmethod + def calculate_perplexity(self, test_data: List[str]) -> float: + """ + Calculate perplexity + """ + pass + + @abc.abstractmethod + def launch_service(self, port: int = 8000) -> None: + """ + Launch inference service + """ + pass + + @abc.abstractmethod + def stop_service(self) -> None: + """ + Stop inference service + """ + pass + + @abc.abstractmethod + def is_service_ready(self, port: int = 8000) -> bool: + """ + Check whether the service is ready + """ + pass + + @abc.abstractmethod + def get_service_url(self) -> str: + """ + Get service URL + """ + pass + + def get_vocab_size(self) -> int: + """Get vocabulary size""" + if self.tokenizer is None: + raise ValueError("Tokenizer not loaded") + return len(self.tokenizer) + + def get_special_token_ids(self) -> Set[int]: + """Get a set of special token IDs that should be excluded""" + if self.tokenizer is None: + return set() + + special_ids = set() + + # Retrieve tokenizer's special token map + special_tokens_map = getattr(self.tokenizer, 'special_tokens_map', {}) + + # Collect IDs of all special tokens + for key, token in special_tokens_map.items(): + if isinstance(token, int): + special_ids.add(token) + elif isinstance(token, str): + token_id = self.tokenizer.convert_tokens_to_ids(token) + if token_id is not None: + special_ids.add(token_id) + + # Add commonly used special tokens + common_special_tokens = [ + "bos_token", "eos_token", "pad_token", "unk_token", + "sep_token", "cls_token", "mask_token" + ] + + for token_name in common_special_tokens: + token = getattr(self.tokenizer, token_name, None) + if token is not None: + if isinstance(token, str): + token_id = self.tokenizer.convert_tokens_to_ids(token) + if token_id is not None: + special_ids.add(token_id) + elif hasattr(token, 'content'): + # Handle special token objects + token_id = self.tokenizer.convert_tokens_to_ids(token.content) + if token_id is not None: + special_ids.add(token_id) + + logger.debug(f"Found {len(special_ids)} special token IDs: {sorted(list(special_ids))}") + return special_ids + + def generate_random_tokens(self, num_tokens: int, 
exclude_special: bool = True) -> List[int]: + """ + Generate a sequence of random token IDs + + Args: + num_tokens: Number of tokens to generate + exclude_special: Whether to exclude special tokens + + Returns: + List of random token IDs + """ + if self.tokenizer is None: + raise ValueError("Tokenizer not loaded") + + vocab_size = self.get_vocab_size() + + if exclude_special: + special_ids = self.get_special_token_ids() + # Create valid token range (excluding special tokens) + all_ids = set(range(vocab_size)) + valid_ids = sorted(list(all_ids - special_ids)) + + if not valid_ids: + logger.warning("No valid tokens after excluding special tokens, using all tokens") + valid_ids = list(range(vocab_size)) + else: + valid_ids = list(range(vocab_size)) + + # Random sampling + tokens = random.choices(valid_ids, k=num_tokens) + + logger.debug(f"Generated {num_tokens} random tokens (vocab_size={vocab_size}, " + f"exclude_special={exclude_special})") + + return tokens + + def tokens_to_text(self, tokens: List[int]) -> str: + """Convert token IDs to text (for debugging)""" + if self.tokenizer is None: + raise ValueError("Tokenizer not loaded") + return self.tokenizer.decode(tokens, skip_special_tokens=True) + + def generate_random_prompt(self, token_count: int, exclude_special: bool = True) -> str: + """ + Generate a random prompt text + + Args: + token_count: Number of tokens in the prompt + exclude_special: Whether to exclude special tokens + + Returns: + A random prompt string + """ + tokens = self.generate_random_tokens(token_count, exclude_special) + return self.tokens_to_text(tokens) + + def generate_random_prompts(self, num_prompts: int, token_count: int, + exclude_special: bool = True) -> List[str]: + """ + Generate multiple random prompts + + Args: + num_prompts: Number of prompts to generate + token_count: Number of tokens in each prompt + exclude_special: Whether to exclude special tokens + + Returns: + List of random prompt strings + """ + prompts = [] + for i in range(num_prompts): + prompt = self.generate_random_prompt(token_count, exclude_special) + prompts.append(prompt) + + logger.info(f"Generated {num_prompts} random prompts, {token_count} tokens each") + return prompts + + def validate_config(self) -> List[str]: + """ + Validate adapter configuration + Returns: List of error messages + """ + errors = [] + + # Validate model path + import os + if not os.path.exists(self.config.model_path): + errors.append(f"Model path does not exist: {self.config.model_path}") + + # Validate framework-specific configuration + errors.extend(self._validate_framework_config()) + + return errors + + @abc.abstractmethod + def _validate_framework_config(self) -> List[str]: + """ + Validate framework-specific configuration + Subclasses must implement this + """ + pass + + def get_peak_memory_usage(self) -> Optional[float]: + """ + Get peak GPU memory usage (GB) + Subclasses can override this to implement specific memory monitoring + """ + try: + import torch + if torch.cuda.is_available(): + # Get peak memory usage across all GPUs + max_memory = 0 + for i in range(torch.cuda.device_count()): + max_memory = max(max_memory, torch.cuda.max_memory_allocated(i)) + return max_memory / (1024 ** 3) # Convert to GB + except ImportError: + logger.warning("PyTorch not available, cannot get GPU memory usage") + + return None diff --git a/infinimetrics/inference/adapters/infinilm_adapter.py b/infinimetrics/inference/adapters/infinilm_adapter.py new file mode 100644 index 0000000..f80607b --- /dev/null +++ 
b/infinimetrics/inference/adapters/infinilm_adapter.py @@ -0,0 +1,613 @@ +#!/usr/bin/env python3 +""" +InfiniLM Adapter Implementation +""" + +import os +import re +import torch +import sys +import time +import logging +import random +from pathlib import Path +from typing import List, Tuple, Optional, Dict, Any +import subprocess +import threading + +# Try to import InfiniLM related modules +try: + # Add scripts directory to path + scripts_dir = Path.cwd() / "scripts" + if scripts_dir.exists(): + sys.path.insert(0, str(scripts_dir)) + + # Import jiuge module + from jiuge import JiugeForCauslLM + from libinfinicore_infer import DeviceType + from infer_task import InferTask, KVCache + + INFINILM_AVAILABLE = True + logger = logging.getLogger(__name__) + logger.info("InfiniLM modules imported successfully") + +except ImportError as e: + INFINILM_AVAILABLE = False + logger = logging.getLogger(__name__) + logger.error(f"Failed to import InfiniLM modules: {e}") + raise + +from adapter_base import InferAdapter +from infer_config import InferConfig, InferMode, DirectInferArgs +from utils.token_generator import TokenGenerator, create_token_generator + +class InfiniLMAdapter(InferAdapter): + """InfiniLM adapter implementation - Fixed version (using API correctly)""" + + def __init__(self, config: InferConfig): + super().__init__(config) + + # InfiniLM specific attributes + self.jiuge_model: Optional[JiugeForCauslLM] = None + self.token_generator: Optional[TokenGenerator] = None + + # Service related + self.server_process: Optional[subprocess.Popen] = None + self.server_port = 8000 + + logger.info(f"InfiniLMAdapter created for model: {config.model}") + logger.info(f"Model path: {config.model_path}") + + def load_model(self) -> None: + """Load real InfiniLM model (fixed API usage)""" + if not INFINILM_AVAILABLE: + logger.error("InfiniLM modules not available. 
Cannot load model.") + raise ImportError("InfiniLM modules not available") + + logger.info(f"Loading real InfiniLM model from: {self.config.model_path}") + + try: + # Determine device type + device_type = self._get_device_type() + + # Get tp size (from infer_args.parallel) + tp_size = self.config.infer_args.parallel.tp + + # ✅ Fix 1: Correctly call JiugeForCauslLM constructor + # Original API: JiugeForCauslLM(model_dir_path, device, ndev, max_tokens=None) + self.jiuge_model = JiugeForCauslLM( + self.config.model_path, + device_type, # ✅ Not device=device_type + tp_size, + max_tokens=self.config.infer_args.max_seq_len # ✅ This parameter is optional + ) + + # Get tokenizer + self.tokenizer = self.jiuge_model.tokenizer + + # Create token generator + self.token_generator = create_token_generator( + self.tokenizer, + exclude_special_tokens=True + ) + + self.model_loaded = True + logger.info("Real InfiniLM model loaded successfully") + logger.info(f"Tokenizer vocab size: {self.get_vocab_size()}") + logger.info(f"Model max context length: {self.jiuge_model.max_context_len()}") + logger.info(f"EOS token ID: {self.jiuge_model.eos_token_id}") + + except Exception as e: + logger.error(f"Failed to load real InfiniLM model: {e}", exc_info=True) + raise + + def unload_model(self) -> None: + """Unload model""" + if self.jiuge_model: + try: + self.jiuge_model.destroy_model_instance() + logger.info("InfiniLM model unloaded") + except Exception as e: + logger.warning(f"Error unloading model: {e}") + + self.jiuge_model = None + + self.model_loaded = False + self.tokenizer = None + self.token_generator = None + + def generate( + self, + prompts: List[str], + max_tokens: int, + temperature: float = 0.7, + top_p: float = 0.9, + top_k: int = 50 + ) -> Tuple[List[str], List[float], List[float]]: + """ + Real InfiniLM inference implementation (fixed API usage) + + Args: + prompts: List of input prompts + max_tokens: Maximum tokens to generate per prompt + temperature, top_p, top_k: Sampling parameters + + Returns: + (List of generated texts, Latency list (ms), TTFT list (ms)) + """ + if not self.model_loaded or not self.jiuge_model: + raise RuntimeError("Model not loaded") + + logger.info(f"Real InfiniLM batch inference for {len(prompts)} prompts") + logger.info(f" Max tokens per prompt: {max_tokens}") + logger.info(f" Temperature: {temperature}, Top-p: {top_p}, Top-k: {top_k}") + + # 1. Encode prompts + token_lists = [] + for i, prompt in enumerate(prompts): + tokens = self.tokenizer.encode(prompt) + token_lists.append(tokens) + if i == 0: # Record first prompt information + logger.info(f" First prompt: {len(tokens)} tokens") + logger.debug(f" First prompt preview: {prompt[:100]}...") + + # 2. 
Create InferTask and KVCache for each prompt + tasks = [] + kv_caches = [] + + for i, tokens in enumerate(token_lists): + try: + # ✅ Fix 2: Correctly create InferTask + # InferTask parameters: id, tokens, max_tokens, temperature, topk, topp, topa, end_tokens + # topa parameter might be needed, set to 0 for now + topa = 0 # Assuming no top-a sampling needed + + # ✅ Fix 3: end_tokens should be a list + if isinstance(self.jiuge_model.eos_token_id, list): + end_tokens = self.jiuge_model.eos_token_id + else: + end_tokens = [self.jiuge_model.eos_token_id] + + # ✅ Fix 4: Use max_seq_len from config, not max_context_len + max_seq_len = min( + self.config.infer_args.max_seq_len, + self.jiuge_model.max_context_len() + ) + + task = InferTask( + id=i, + tokens=tokens, + max_tokens=max_seq_len, + temperature=temperature, + topk=top_k, + topp=top_p, + end_tokens=end_tokens + ) + + # Create and bind KVCache + kv_cache = KVCache(self.jiuge_model) + task.bind_kvcache(kv_cache) + + tasks.append(task) + kv_caches.append(kv_cache) + + logger.debug(f" Created InferTask {i}: {len(tokens)} input tokens") + + except Exception as e: + logger.error(f"Failed to create InferTask for prompt {i}: {e}") + raise + + logger.info(f"Created {len(tasks)} InferTasks for batch inference") + + # 3. Execute batch inference + generated_texts = [] + latencies = [] + ttfts = [] + + # Pre-allocate result lists for each prompt + all_generated_tokens = [[] for _ in range(len(tasks))] + + try: + # Measure TTFT (first batch inference) + start_time = time.perf_counter() + output_tokens_batch = self.jiuge_model.batch_infer_one_round(tasks) + ttft = (time.perf_counter() - start_time) * 1000 + + # ✅ Fix 5: batch_infer_one_round returns List[List[int]] + # Each task returns a token list (may contain multiple tokens) + for i, task_output in enumerate(output_tokens_batch): + if task_output: # May have output + # Take first token (if multiple tokens returned, take first) + first_token = task_output[0] if isinstance(task_output, list) else task_output + all_generated_tokens[i].append(first_token) + + # Record TTFT for this task (all tasks use same TTFT since batch inference) + ttfts.append(ttft) + else: + all_generated_tokens[i].append(0) # Placeholder + ttfts.append(0.0) + + # 4. 
Continue generating remaining tokens (token by token) + total_generated = 1 # Already generated first token + + while total_generated < max_tokens: + # Update status for all tasks + active_tasks = [] + active_indices = [] + + for i, task in enumerate(tasks): + if len(all_generated_tokens[i]) > 0: + last_token = all_generated_tokens[i][-1] + + # ✅ Fix 6: Correctly check EOS + if isinstance(self.jiuge_model.eos_token_id, list): + is_eos = last_token in self.jiuge_model.eos_token_id + else: + is_eos = last_token == self.jiuge_model.eos_token_id + + if not is_eos and len(all_generated_tokens[i]) < max_tokens: + task.next(last_token) + active_tasks.append(task) + active_indices.append(i) + + # If no active tasks, stop generation + if not active_tasks: + logger.info("All tasks reached EOS or max tokens") + break + + # Batch inference for active tasks + iteration_start = time.perf_counter() + active_outputs = self.jiuge_model.batch_infer_one_round(active_tasks) + iteration_time = (time.perf_counter() - iteration_start) * 1000 + + # Process outputs + for idx, task_idx in enumerate(active_indices): + if idx < len(active_outputs) and active_outputs[idx]: + next_token = active_outputs[idx][0] if isinstance(active_outputs[idx], list) else active_outputs[idx] + all_generated_tokens[task_idx].append(next_token) + + total_generated += 1 + + if total_generated % 10 == 0: + logger.debug(f" Generated {total_generated}/{max_tokens} tokens") + + # 5. Calculate total latency and decode text + total_latency = (time.perf_counter() - start_time) * 1000 + + for i, generated_tokens in enumerate(all_generated_tokens): + # Calculate latency for this prompt (for batch inference, all prompts have same latency) + latencies.append(total_latency) + + # Decode text + if generated_tokens: + generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True) + generated_texts.append(generated_text) + + logger.debug(f" Prompt {i}: {len(generated_tokens)} tokens generated") + if i == 0 and generated_text: + logger.debug(f" First generated text preview: {generated_text[:100]}...") + else: + generated_texts.append("") + logger.warning(f" Prompt {i}: No tokens generated") + + except Exception as e: + logger.error(f"Error during batch inference: {e}", exc_info=True) + raise + + finally: + # 6. Clean up KVCaches + logger.info("Cleaning up KVCaches...") + for i, kv_cache in enumerate(kv_caches): + try: + if kv_cache and self.jiuge_model: + kv_cache.drop(self.jiuge_model) + except Exception as e: + logger.warning(f"Failed to drop KV cache {i}: {e}") + + # 7. 
Return results + logger.info(f"Inference completed: {len(generated_texts)} prompts processed") + if latencies: + avg_latency = sum(latencies) / len(latencies) + avg_ttft = sum(ttfts) / len(ttfts) if ttfts else 0 + logger.info(f" Avg latency: {avg_latency:.2f}ms") + logger.info(f" Avg TTFT: {avg_ttft:.2f}ms") + + return generated_texts, latencies, ttfts + + def batch_generate( + self, + batch_prompts: List[List[str]], + max_tokens: int, + temperature: float = 0.7, + top_p: float = 0.9, + top_k: int = 50 + ) -> Tuple[List[List[str]], List[List[float]], List[List[float]]]: + """ + Batch text generation (multiple batches) + + Note: For large batches, we may need to split to avoid OOM + """ + logger.info(f"Batch generating for {len(batch_prompts)} batches") + + all_texts = [] + all_latencies = [] + all_ttfts = [] + + for batch_idx, prompts in enumerate(batch_prompts): + logger.info(f"Processing batch {batch_idx+1}/{len(batch_prompts)} " + f"({len(prompts)} prompts)") + + # Check batch size to avoid OOM + max_batch_size = 8 # Safe value, can adjust based on GPU memory + if len(prompts) > max_batch_size: + logger.warning(f"Batch size {len(prompts)} too large, splitting...") + + # Split processing + split_texts = [] + split_latencies = [] + split_ttfts = [] + + for i in range(0, len(prompts), max_batch_size): + sub_prompts = prompts[i:i + max_batch_size] + logger.info(f" Processing sub-batch {i//max_batch_size + 1}") + + texts, latencies, ttfts = self.generate( + sub_prompts, max_tokens, temperature, top_p, top_k + ) + + split_texts.extend(texts) + split_latencies.extend(latencies) + split_ttfts.extend(ttfts) + + all_texts.append(split_texts) + all_latencies.append(split_latencies) + all_ttfts.append(split_ttfts) + else: + texts, latencies, ttfts = self.generate( + prompts, max_tokens, temperature, top_p, top_k + ) + + all_texts.append(texts) + all_latencies.append(latencies) + all_ttfts.append(ttfts) + + return all_texts, all_latencies, all_ttfts + + def get_peak_memory_usage(self) -> Optional[float]: + """Get peak memory usage (GB)""" + try: + if torch.cuda.is_available(): + torch.cuda.synchronize() + + # Get peak memory (bytes) + max_memory_bytes = 0 + for i in range(torch.cuda.device_count()): + device_max = torch.cuda.max_memory_allocated(i) + if device_max > max_memory_bytes: + max_memory_bytes = device_max + + # Convert to GB + max_memory_gb = max_memory_bytes / (1024 ** 3) + + # Also get current memory usage + current_memory_bytes = torch.cuda.memory_allocated() + current_memory_gb = current_memory_bytes / (1024 ** 3) + + logger.info(f"GPU memory - Peak: {max_memory_gb:.2f} GB, Current: {current_memory_gb:.2f} GB") + return max_memory_gb + + except ImportError: + logger.warning("PyTorch not available, cannot get GPU memory usage") + except Exception as e: + logger.warning(f"Failed to get GPU memory usage: {e}") + + # Try to get via nvidia-smi (fallback method) + try: + + result = subprocess.run( + ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,nounits,noheader"], + capture_output=True, + text=True, + timeout=5 + ) + + if result.returncode == 0: + # Parse output, e.g.: "1234, 24576\n" + lines = result.stdout.strip().split('\n') + max_memory_mb = 0 + + for line in lines: + if line: + used_str, total_str = line.split(',') + used_mb = float(used_str.strip()) + total_mb = float(total_str.strip()) + + if used_mb > max_memory_mb: + max_memory_mb = used_mb + + if max_memory_mb > 0: + max_memory_gb = max_memory_mb / 1024 + logger.info(f"GPU memory (nvidia-smi): 
{max_memory_gb:.2f} GB") + return max_memory_gb + + except Exception as e: + logger.debug(f"nvidia-smi fallback failed: {e}") + + return None + + def calculate_perplexity(self, test_data: List[str]) -> float: + """Calculate perplexity""" + if not self.model_loaded or not self.jiuge_model: + raise RuntimeError("Model not loaded") + + logger.info(f"Calculating perplexity for {len(test_data)} test samples") + + try: + # Convert text to token sequences + test_sequences = [] + for text in test_data: + tokens = self.tokenizer.encode(text) + if len(tokens) > self.config.infer_args.max_seq_len: + tokens = tokens[:self.config.infer_args.max_seq_len] + test_sequences.append(tokens) + + # Use jiuge model's perplexity method + # Note: Need to test batch_size to avoid OOM + batch_size = min(4, len(test_sequences)) + perplexity = self.jiuge_model.perplexity(test_sequences, batch_size=batch_size) + + logger.info(f"Perplexity calculated: {perplexity:.4f}") + return perplexity + + except Exception as e: + logger.error(f"Failed to calculate perplexity: {e}") + # Return a default value, don't interrupt tests + return 0.0 + + def launch_service(self, port: int = 8000) -> None: + """Launch InfiniLM inference service""" + logger.info(f"Launching InfiniLM service on port {port}") + + # Build launch command + cmd = [ + sys.executable, "scripts/launch_server.py", + "--model-path", self.config.model_path, + "--dev", "nvidia", + "--ndev", str(self.config.infer_args.parallel.tp), + "--max-batch", "4" + ] + + if port != 8000: + cmd.extend(["--port", str(port)]) + + # Start service process + try: + self.server_process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + bufsize=1, + universal_newlines=True + ) + + self.server_port = port + self.service_started = True + + # Start thread to read output + self._start_output_reader() + + logger.info(f"InfiniLM service started with PID: {self.server_process.pid}") + logger.info(f"Command: {' '.join(cmd)}") + + except Exception as e: + logger.error(f"Failed to launch InfiniLM service: {e}") + raise + + def stop_service(self) -> None: + """Stop inference service""" + if self.server_process: + logger.info("Stopping InfiniLM service") + + try: + self.server_process.terminate() + self.server_process.wait(timeout=10) + logger.info("InfiniLM service stopped") + except subprocess.TimeoutExpired: + logger.warning("Service did not stop gracefully, forcing kill") + self.server_process.kill() + + self.server_process = None + + self.service_started = False + + def is_service_ready(self, port: int = 8000) -> bool: + """Simplified service readiness check - only check port""" + if not self.service_started or not self.server_process: + logger.debug("Service not started or no server process") + return False + + # Check if process is alive + if self.server_process.poll() is not None: + logger.error(f"Server process died with return code: {self.server_process.returncode}") + return False + + # Only check if port is open + import socket + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(1) + result = sock.connect_ex(('localhost', port)) + sock.close() + + if result == 0: + logger.debug(f"Port {port} is open, service is ready") + return True + else: + logger.debug(f"Port {port} not open yet (result={result})") + return False + except Exception as e: + logger.debug(f"Port check failed: {e}") + return False + + def get_service_url(self) -> str: + """Get service URL""" + return f"http://localhost:{self.server_port}" + + def 
_get_device_type(self): + """Get device type based on configuration""" + gpu_platform = self.config.device.gpu_platform.lower() + + if gpu_platform == "nvidia": + return DeviceType.DEVICE_TYPE_NVIDIA + elif gpu_platform == "cpu": + return DeviceType.DEVICE_TYPE_CPU + else: + logger.warning(f"Unknown GPU platform: {gpu_platform}, using NVIDIA as default") + return DeviceType.DEVICE_TYPE_NVIDIA + + def _start_output_reader(self): + """Start output reading thread""" + def read_output(): + if self.server_process: + for line in self.server_process.stdout: + logger.info(f"[InfiniLM Server] {line.strip()}") + + self.output_thread = threading.Thread(target=read_output, daemon=True) + self.output_thread.start() + + def _validate_framework_config(self) -> List[str]: + """Validate InfiniLM specific configuration""" + errors = [] + + # Check if scripts directory exists + scripts_dir = Path("scripts") + if not scripts_dir.exists(): + errors.append("scripts directory not found in current directory") + else: + # Check for necessary script files + required_scripts = ["jiuge.py", "launch_server.py"] + for script in required_scripts: + if not (scripts_dir / script).exists(): + errors.append(f"Required script not found: {script}") + + # Check parallel configuration + if self.config.infer_args.parallel.tp <= 0: + errors.append("Tensor parallel size (tp) must be positive") + + # Check model directory + model_dir = Path(self.config.model_path) + if not model_dir.exists(): + errors.append(f"Model directory does not exist: {model_dir}") + else: + # Check for necessary model files + model_files = list(model_dir.glob("*.safetensors")) + list(model_dir.glob("*.bin")) + if not model_files: + errors.append(f"No model files found in {model_dir}") + + # Check config.json + config_file = model_dir / "config.json" + if not config_file.exists(): + errors.append(f"config.json not found in {model_dir}") + + return errors \ No newline at end of file diff --git a/infinimetrics/inference/create_test_trace.py b/infinimetrics/inference/create_test_trace.py new file mode 100644 index 0000000..a5525bc --- /dev/null +++ b/infinimetrics/inference/create_test_trace.py @@ -0,0 +1,30 @@ +import csv +import random +from pathlib import Path + +def create_test_trace(output_file: str = "test_trace.csv", num_requests: int = 50): + """Create a test trace file""" + + with open(output_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['request_id', 'arrival_timestamp_ms', 'input_token_num', 'output_token_num']) + + current_time = 0 + + for i in range(num_requests): + request_id = f"req-{i:04d}" + + # Random arrival interval (0–100ms) + interval = random.uniform(0, 100) + current_time += interval + + # Random token counts + input_tokens = random.choice([64, 128, 256, 512]) + output_tokens = random.choice([64, 128, 256]) + + writer.writerow([request_id, round(current_time, 2), input_tokens, output_tokens]) + + print(f"Created test trace file: {output_file} with {num_requests} requests") + +if __name__ == "__main__": + create_test_trace() diff --git a/infinimetrics/inference/direct_infer_runner.py b/infinimetrics/inference/direct_infer_runner.py new file mode 100644 index 0000000..25580c2 --- /dev/null +++ b/infinimetrics/inference/direct_infer_runner.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +""" +Direct Inference Runner Implementation +Launch real model for batch inference testing +""" + +import logging +import time +import json +from pathlib import Path +from typing import List, Dict, Any +from datetime import datetime 
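+# Measurement notes (matching the implementation below):
+#   - per-iteration throughput is derived from the batch-average latency:
+#       throughput_tokens_per_s = (batch_size * output_token_num * 1000) / avg_latency_ms
+#   - total_tokens reported in collect_metrics() counts warmup as well as
+#     measured iterations.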
+ +from infer_runner_base import InferRunnerBase, ScalarMetric, TimeseriesMetric +from infer_config import InferConfig, DirectInferArgs + +logger = logging.getLogger(__name__) + +class DirectInferRunner(InferRunnerBase): + """Direct Inference Runner""" + + def __init__(self, config: InferConfig, adapter): + super().__init__(config, adapter) + self.infer_args: DirectInferArgs = config.infer_args + + # GPU monitor + self.gpu_monitor = None + + logger.info(f"DirectInferRunner created for batch_size={self.infer_args.static_batch_size}") + + def setup(self) -> None: + """Set up direct inference environment""" + logger.info("Setting up direct inference environment") + + # Create GPU monitor + device_ids = self.config.device.device_ids + if self.config.device.cpu_only: + logger.info("CPU-only mode, GPU monitoring disabled") + self.gpu_monitor = None + else: + from utils.gpu_monitor import create_gpu_monitor + self.gpu_monitor = create_gpu_monitor( + gpu_platform=self.config.device.gpu_platform, + device_ids=device_ids + ) + + # Start GPU monitoring + if self.gpu_monitor: + self.gpu_monitor.start_monitoring() + logger.info(f"GPU monitoring started for devices: {device_ids}") + + # Validate configuration + if self.infer_args.static_batch_size <= 0: + raise ValueError(f"Invalid batch size: {self.infer_args.static_batch_size}") + + # Load model + logger.info("Loading model via adapter") + self.adapter.load_model() + + def execute(self) -> None: + """Execute direct inference test""" + logger.info("Executing direct inference benchmark") + + # Generate test prompts + prompts = self._generate_test_prompts() + total_prompts = len(prompts) + + logger.info(f"Generated {total_prompts} test prompts") + logger.info(f"Input tokens per prompt: {self.infer_args.prompt_token_num}") + logger.info(f"Output tokens per prompt: {self.infer_args.output_token_num}") + + # Warmup phase + if self.config.warmup_iterations > 0: + logger.info(f"Warmup phase ({self.config.warmup_iterations} iterations)") + + warmup_prompts = prompts[:self.config.warmup_iterations * self.infer_args.static_batch_size] + + # Perform warmup in batches + for i in range(0, len(warmup_prompts), self.infer_args.static_batch_size): + batch_end = min(i + self.infer_args.static_batch_size, len(warmup_prompts)) + batch_prompts = warmup_prompts[i:batch_end] + + if not batch_prompts: + break + + logger.debug(f"Warmup batch {i//self.infer_args.static_batch_size + 1}") + + try: + _, _, _ = self.adapter.generate( + batch_prompts, + self.infer_args.output_token_num, + self.infer_args.temperature, + self.infer_args.top_p, + self.infer_args.top_k + ) + except Exception as e: + logger.warning(f"Warmup batch failed: {e}") + + # Actual testing phase + logger.info(f"Measurement phase ({self.config.measured_iterations} iterations)") + + measurement_start = time.perf_counter() + + for i in range(self.config.measured_iterations): + # Calculate prompt indices for current batch + start_idx = (self.config.warmup_iterations + i) * self.infer_args.static_batch_size + end_idx = start_idx + self.infer_args.static_batch_size + + if start_idx >= total_prompts: + logger.warning(f"Iteration {i+1}: Not enough prompts, reusing earlier prompts") + start_idx = i * self.infer_args.static_batch_size % total_prompts + end_idx = start_idx + self.infer_args.static_batch_size + + batch_prompts = prompts[start_idx:end_idx] + + logger.info(f"Running measurement iteration {i+1}/{self.config.measured_iterations}") + + # Call adapter to generate + generated_texts, latencies, ttfts = 
self.adapter.generate( + batch_prompts, + self.infer_args.output_token_num, + self.infer_args.temperature, + self.infer_args.top_p, + self.infer_args.top_k + ) + + # Collect data + for latency in latencies: + self.result.add_latency(latency) + for ttft in ttfts: + self.result.add_ttft(ttft) + + # Calculate throughput (tokens/s) + if latencies: + avg_latency = sum(latencies) / len(latencies) + # Calculate total tokens in this batch + batch_tokens = len(batch_prompts) * self.infer_args.output_token_num + throughput = (batch_tokens * 1000) / avg_latency if avg_latency > 0 else 0 + self.result.add_throughput(throughput) + + logger.info(f" Iteration {i+1}: avg_latency={avg_latency:.2f}ms, throughput={throughput:.2f} tokens/s") + else: + logger.warning(f" Iteration {i+1}: No latency data collected") + + # Show progress + progress = (i + 1) / self.config.measured_iterations * 100 + logger.info(f"Measurement progress: {progress:.1f}%") + + measurement_end = time.perf_counter() + total_measurement_time = measurement_end - measurement_start + logger.info(f"Measurement phase completed in {total_measurement_time:.2f}s") + + def collect_metrics(self) -> None: + """Collect direct inference metrics""" + logger.info("Collecting direct inference metrics") + + # Get peak memory usage + peak_memory = 0.0 + if self.gpu_monitor: + try: + self.gpu_monitor.stop_monitoring() + peak_memory = self.gpu_monitor.get_peak_memory_gb() + logger.info(f"Peak GPU memory usage: {peak_memory:.6f} GB") + except Exception as e: + logger.warning(f"Failed to get peak memory from GPU monitor: {e}") + + # Save to result + self.result.peak_memory_usage = peak_memory + + # Calculate total tokens + total_iterations = self.config.warmup_iterations + self.config.measured_iterations + total_batches = total_iterations * self.infer_args.static_batch_size + self.result.total_tokens = total_batches * self.infer_args.output_token_num + + logger.info(f"Total tokens generated: {self.result.total_tokens}") + + # Calculate throughput statistics + if self.result.throughput_data: + avg_throughput = sum(self.result.throughput_data) / len(self.result.throughput_data) + logger.info(f"Average throughput: {avg_throughput:.2f} tokens/s") + + # Add ppl and accuracy placeholders + # 1. Calculate perplexity (if test dataset exists) + if self.config.test_dataset: + try: + perplexity = self._calculate_perplexity() + # Use correct ScalarMetric class + self.result.add_metric(ScalarMetric( + name="infer.ppl", + value=perplexity, + unit=None + )) + logger.info(f"Perplexity calculated: {perplexity:.4f}") + except Exception as e: + logger.warning(f"Failed to calculate perplexity: {e}") + # Add placeholder + self.result.add_metric(ScalarMetric( + name="infer.ppl", + value=0.0, # placeholder + unit=None + )) + else: + # No test dataset, add placeholder + self.result.add_metric(ScalarMetric( + name="infer.ppl", + value=0.0, # placeholder + unit=None + )) + + # 2. 
Add accuracy placeholder + self.result.add_metric(ScalarMetric( + name="infer.accuracy", + value=0.0, # TODO: Actually calculate accuracy + unit=None + )) + + # Calculate statistics + stats = self.calculate_statistics() + + if 'avg_latency' in stats: + logger.info(f"Average latency: {stats['avg_latency']:.2f} ms") + + if 'avg_ttft' in stats: + logger.info(f"Average TTFT: {stats['avg_ttft']:.2f} ms") + + def _calculate_perplexity(self) -> float: + """Calculate perplexity""" + if not hasattr(self.adapter, 'calculate_perplexity'): + logger.warning("Adapter does not support perplexity calculation") + return 0.0 + + # Load test data + test_data = self._load_test_data() + if not test_data: + logger.warning("No test data available for perplexity calculation") + return 0.0 + + try: + return self.adapter.calculate_perplexity(test_data) + except Exception as e: + logger.error(f"Error calculating perplexity: {e}") + return 0.0 + + def _load_test_data(self) -> List[str]: + """Load test data""" + test_file = self.config.test_dataset + if not test_file: + logger.warning("No test dataset specified") + return [] + + # Ensure Path is imported when using it + test_path = Path(test_file) + if not test_path.exists(): + logger.warning(f"Test dataset file not found: {test_file}") + return [] + + try: + import json + with open(test_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + # Extract text based on data format + test_texts = [] + if isinstance(data, list): + for item in data: + if isinstance(item, dict): + text = item.get('text') or item.get('content') or item.get('prompt') + if text: + test_texts.append(str(text)) + elif isinstance(item, str): + test_texts.append(item) + elif isinstance(data, dict): + # Could be multiple keys + for key, value in data.items(): + if isinstance(value, str): + test_texts.append(value) + elif isinstance(value, list): + test_texts.extend([str(v) for v in value if isinstance(v, str)]) + + logger.info(f"Loaded {len(test_texts)} test samples from {test_file}") + return test_texts[:100] # Limit quantity to avoid excessive computation time + + except Exception as e: + logger.error(f"Failed to load test data: {e}") + return [] + + def _generate_test_prompts(self) -> List[str]: + """Generate test prompts""" + prompts = [] + base_template = "Please provide a detailed explanation about {topic}. 
" + + topics = [ + "artificial intelligence and its applications in healthcare", + "machine learning algorithms and their use cases", + "deep learning and neural networks", + "natural language processing techniques", + "computer vision and image recognition", + "reinforcement learning and autonomous systems", + "quantum computing and its potential impact", + "blockchain technology and decentralized applications", + "Internet of Things and smart devices", + "cloud computing and distributed systems" + ] + + total_prompts_needed = (self.config.warmup_iterations + self.config.measured_iterations) * self.infer_args.static_batch_size + + for i in range(total_prompts_needed): + topic = topics[i % len(topics)] + + # Create prompt of specified length + base_prompt = base_template.format(topic=topic) + repeat_count = max(1, self.infer_args.prompt_token_num // len(base_prompt)) + + prompt = base_prompt * repeat_count + prompt = prompt[:self.infer_args.prompt_token_num] + + # Add unique identifier + import random + import string + random_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6)) + prompt += f" [Request {i+1}:{random_suffix}]" + + prompts.append(prompt) + + logger.info(f"Generated {len(prompts)} test prompts") + return prompts \ No newline at end of file diff --git a/infinimetrics/inference/infer_config.py b/infinimetrics/inference/infer_config.py new file mode 100644 index 0000000..bb242be --- /dev/null +++ b/infinimetrics/inference/infer_config.py @@ -0,0 +1,517 @@ +#!/usr/bin/env python3 +""" +Inference Configuration Manager +Parses config.json, identifies direct/service mode, identifies infinilm/vllm framework +""" + +import json +import logging +from pathlib import Path +from typing import Dict, Any, Optional, List +from dataclasses import dataclass +from enum import Enum +from datetime import datetime +import random +import string + +logger = logging.getLogger(__name__) + +class InferMode(Enum): + """Inference mode enumeration""" + DIRECT = "direct" + SERVICE = "service" + +class FrameworkType(Enum): + """Framework type enumeration""" + INFINILM = "infinilm" + VLLM = "vllm" + +@dataclass +class ParallelConfig: + """Parallel configuration""" + dp: int = 1 + tp: int = 1 + pp: int = 1 + sp: int = 1 + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'ParallelConfig': + """Create parallel configuration from dictionary""" + if not data: + return cls() + return cls( + dp=data.get("dp", 1), + tp=data.get("tp", 1), + pp=data.get("pp", 1), + sp=data.get("sp", 1) + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + "dp": self.dp, + "tp": self.tp, + "pp": self.pp, + "sp": self.sp + } + +@dataclass +class DeviceConfig: + """Device configuration""" + gpu_platform: str = "nvidia" + device_ids: List[int] = None + cpu_only: bool = False + + def __post_init__(self): + if self.device_ids is None: + self.device_ids = [0] + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'DeviceConfig': + """Create device configuration from dictionary""" + if not data: + return cls() + return cls( + gpu_platform=data.get("gpu_platform", "nvidia"), + device_ids=data.get("device_ids", [0]), + cpu_only=data.get("cpu_only", False) + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + "gpu_platform": self.gpu_platform, + "device_ids": self.device_ids, + "cpu_only": self.cpu_only + } + +@dataclass +class DirectInferArgs: + """Direct inference arguments""" + parallel: ParallelConfig + static_batch_size: int + 
prompt_token_num: int + output_token_num: int = 128 + max_seq_len: int = 4096 + temperature: float = 0.7 + top_p: float = 0.9 + top_k: int = 50 + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'DirectInferArgs': + """Create direct inference arguments from dictionary""" + return cls( + parallel=ParallelConfig.from_dict(data.get("parallel", {})), + static_batch_size=data.get("static_batch_size", 1), + prompt_token_num=data.get("prompt_token_num", 1024), + output_token_num=data.get("output_token_num", 128), + max_seq_len=data.get("max_seq_len", 4096), + temperature=data.get("temperature", 0.7), + top_p=data.get("top_p", 0.9), + top_k=data.get("top_k", 50) + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + "parallel": self.parallel.to_dict(), + "static_batch_size": self.static_batch_size, + "prompt_token_num": self.prompt_token_num, + "output_token_num": self.output_token_num, + "max_seq_len": self.max_seq_len, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k + } + +@dataclass +class ServiceInferArgs: + """Service inference arguments""" + parallel: ParallelConfig + request_trace: str + concurrency: int = 32 + max_seq_len: int = 4096 + stream: bool = True + timeout_ms: int = 30000 + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'ServiceInferArgs': + """Create service inference arguments from dictionary""" + return cls( + parallel=ParallelConfig.from_dict(data.get("parallel", {})), + request_trace=data.get("request_trace", ""), + concurrency=data.get("concurrency", 32), + max_seq_len=data.get("max_seq_len", 4096), + stream=data.get("stream", True), + timeout_ms=data.get("timeout_ms", 30000) + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + "parallel": self.parallel.to_dict(), + "request_trace": self.request_trace, + "concurrency": self.concurrency, + "max_seq_len": self.max_seq_len, + "stream": self.stream, + "timeout_ms": self.timeout_ms + } + +@dataclass +class InferConfig: + """Main inference configuration class""" + # Basic information + run_id: str + testcase: str + model: str + model_path: str + model_config: Optional[str] + + # Dataset + train_dataset: Optional[str] + validation_dataset: Optional[str] + test_dataset: Optional[str] + + # Output + output_dir: str + + # Execution mode + mode: InferMode + framework: FrameworkType + + # Device configuration + device: DeviceConfig + + # Inference arguments (varies by mode) + infer_args: Any # DirectInferArgs or ServiceInferArgs + + # Execution parameters + timeout_ms: int + warmup_iterations: int + measured_iterations: int + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any]) -> 'InferConfig': + """Create configuration object from dictionary - implemented with clear logic""" + # 1. Read outer level + outer_run_id = config_dict.get("run_id") + outer_testcase = config_dict.get("testcase") + config_data = config_dict.get("config", {}) + + # 2. Read inner level (for error checking) + inner_run_id = config_data.get("run_id") + inner_testcase = config_data.get("testcase") + + # 3. Process testcase + # Rule 2.1: If outer level has testcase → use it + if outer_testcase: + testcase = outer_testcase + logger.info(f"Using outer testcase: {testcase}") + + # Rule 2.2: If inner level has testcase → raise error + elif inner_testcase: + raise ValueError( + "testcase must be at the outer level, not inside 'config'. " + f"Found: '{inner_testcase}' inside 'config'. " + "Please move it to the outer level." 
+ ) + + # Rule 2.3: If no testcase at either level → raise error + else: + raise ValueError( + "testcase is required at the outer level of the config. " + "Example: {\"testcase\": \"infer.InfiniLM.Direct\", ...}" + ) + + testcase = outer_testcase + logger.info(f"Using testcase: {testcase}") + + # 4. Process run_id + # Rule 1.2: If inner level has run_id → raise error + if inner_run_id: + raise ValueError( + "run_id must be at the outer level, not inside 'config'. " + f"Found: '{inner_run_id}' inside 'config'. " + "Please move it to the outer level or remove it to auto-generate." + ) + + # Rule 1.1: If outer level has run_id → use it (add timestamp+random code to prevent overwriting) + elif outer_run_id: + run_id = cls._enhance_user_run_id(outer_run_id) + logger.info(f"Using enhanced user-provided run_id: {run_id}") + else: + # Rule 1.3: Auto-generate run_id + run_id = cls._generate_auto_run_id(testcase) + logger.info(f"Auto-generated run_id: {run_id}") + + # 5. Parse mode and framework from testcase + testcase_lower = testcase.lower() + + # Determine inference mode + if "service" in testcase_lower: + mode = InferMode.SERVICE + elif "direct" in testcase_lower: + mode = InferMode.DIRECT + else: + mode = InferMode.DIRECT + + # Determine framework + if "vllm" in testcase_lower: + framework = FrameworkType.VLLM + elif "infinilm" in testcase_lower: + framework = FrameworkType.INFINILM + else: + raise ValueError(f"Cannot determine framework from testcase: {testcase}") + + # 6. Parse model_path + model_path = config_data.get("model_path") + model_config = config_data.get("model_config") + + if not model_path and model_config: + model_path = str(Path(model_config).parent) + logger.info(f"Inferred model_path from model_config: {model_path}") + elif not model_path: + raise ValueError("Either model_path or model_config must be provided in config") + + # 7. Parse inference arguments + infer_args_dict = config_data.get("infer_args", {}) + if mode == InferMode.DIRECT: + infer_args = DirectInferArgs.from_dict(infer_args_dict) + else: + infer_args = ServiceInferArgs.from_dict(infer_args_dict) + + # 8. 
Parse device configuration + device_config = DeviceConfig.from_dict(config_data.get("device", {})) + + return cls( + run_id=run_id, + testcase=testcase, + model=config_data.get("model", "unknown"), + model_path=model_path, + model_config=model_config, + + train_dataset=config_data.get("train_dataset"), + validation_dataset=config_data.get("validation_dataset"), + test_dataset=config_data.get("test_dataset"), + + output_dir=config_data.get("output_dir", "./test_output"), + + mode=mode, + framework=framework, + device=device_config, + infer_args=infer_args, + + timeout_ms=config_data.get("timeout_ms", 30000)if mode == InferMode.SERVICE else None, + warmup_iterations=config_data.get("warmup_iterations", 10), + measured_iterations=config_data.get("measured_iterations", 100) + ) + + def _enhance_user_run_id(user_run_id: str) -> str: + """ + Enhance user-provided run_id by adding timestamp and random code to prevent overwriting + + Args: + user_run_id: User-provided run_id + + Returns: + Enhanced run_id: {user_run_id}.{timestamp}.{random8} + """ + # If already contains timestamp and random code, return directly (prevent duplicate addition) + import re + timestamp_pattern = r'\.\d{8}_\d{6}\.[a-z0-9]{8}$' + if re.search(timestamp_pattern, user_run_id): + logger.info(f"User run_id already contains timestamp and random code: {user_run_id}") + return user_run_id + + # Add timestamp and random code + from datetime import datetime + import random + import string + + # Clean user run_id + cleaned_user_id = user_run_id.strip().strip(".").replace("..", ".") + + # Timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # 8-character random code + random_suffix = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8)) + + # Combine + enhanced_run_id = f"{cleaned_user_id}.{timestamp}.{random_suffix}" + + logger.info(f"Enhanced user run_id: {user_run_id} -> {enhanced_run_id}") + return enhanced_run_id + + @staticmethod + def _generate_auto_run_id(testcase: str) -> str: + """ + Auto-generate run_id + Format: {testcase}.{timestamp}.{random8} + Example: infer.InfiniLM.Direct.20251210_143025.a1b2c3d4 + """ + # Clean testcase + cleaned_testcase = testcase.strip().strip(".").replace("..", ".") + + # Timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # 8-character random code + random_suffix = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8)) + + # Combine + run_id = f"{cleaned_testcase}.{timestamp}.{random_suffix}" + + return run_id + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary (for JSON output)""" + return { + "run_id": self.run_id, + "testcase": self.testcase, + "mode": self.mode.value, + "framework": self.framework.value, + "model": self.model, + "model_path": self.model_path, + "model_config": self.model_config, + "device": self.device.to_dict(), + "infer_args": self.infer_args.to_dict() if hasattr(self.infer_args, 'to_dict') else {}, + "warmup_iterations": self.warmup_iterations, + "measured_iterations": self.measured_iterations + } + if self.mode == InferMode.SERVICE and self.timeout_ms is not None: + result["timeout_ms"] = self.timeout_ms + + return result + +class InferConfigManager: + """Inference Configuration Manager""" + + @staticmethod + def load_config(config_file: str) -> InferConfig: + """Load configuration from config file""" + try: + with open(config_file, 'r', encoding='utf-8') as f: + config_data = json.load(f) + + # Support single config or config list + if isinstance(config_data, dict): + config_dict = 
config_data + elif isinstance(config_data, list) and len(config_data) > 0: + config_dict = config_data[0] + else: + raise ValueError(f"Invalid config format in {config_file}") + + # Create configuration object + config = InferConfig.from_dict(config_dict) + + # Validate configuration + errors = InferConfigManager.validate_config(config) + if errors: + error_msg = "Configuration validation errors:\n" + "\n".join(f" - {e}" for e in errors) + logger.error(error_msg) + raise ValueError(error_msg) + + return config + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse JSON config file: {e}") + raise + except Exception as e: + logger.error(f"Failed to load config from {config_file}: {e}") + raise + + @staticmethod + def validate_config(config: InferConfig) -> List[str]: + """Validate configuration effectiveness""" + errors = [] + + # Basic validation + if not config.run_id: + errors.append("run_id cannot be empty") + + if not config.testcase: + errors.append("testcase cannot be empty") + + if not config.model or config.model == "unknown": + errors.append("model name is required") + + # Model path validation + model_dir = Path(config.model_path) + if not model_dir.exists(): + errors.append(f"Model directory does not exist: {model_dir}") + else: + # Check config.json + config_file = model_dir / "config.json" + if not config_file.exists(): + errors.append(f"config.json not found in model directory: {model_dir}") + + # Mode-specific validation + if config.mode == InferMode.DIRECT: + if not isinstance(config.infer_args, DirectInferArgs): + errors.append("Direct mode requires DirectInferArgs") + else: + if config.infer_args.static_batch_size <= 0: + errors.append("Batch size must be positive") + if config.infer_args.prompt_token_num <= 0: + errors.append("Prompt token number must be positive") + + elif config.mode == InferMode.SERVICE: + if not isinstance(config.infer_args, ServiceInferArgs): + errors.append("Service mode requires ServiceInferArgs") + else: + if not config.infer_args.request_trace: + errors.append("Request trace is required for service mode") + if config.infer_args.concurrency <= 0: + errors.append("Concurrency must be positive") + + # Verify trace file exists + trace_path = Path(config.infer_args.request_trace) + if not trace_path.exists(): + errors.append(f"Trace file not found: {trace_path}") + + # Output directory validation + output_dir = Path(config.output_dir) + try: + output_dir.mkdir(parents=True, exist_ok=True) + test_file = output_dir / ".test_write" + test_file.write_text("test") + test_file.unlink() + except Exception as e: + errors.append(f"Output directory is not writable: {e}") + + return errors + + @staticmethod + def generate_auto_run_id(testcase: str) -> str: + """ + Auto-generate run_id (public method) + + Format: {testcase}.{timestamp}.{random8} + Example: infer.InfiniLM.Direct.20251210_143025.a1b2c3d4 + + Args: + testcase: testcase string + + Returns: + Generated run_id + """ + # Clean testcase + cleaned_testcase = testcase.strip().strip(".").replace("..", ".") + + # Timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # 8-character random code + random_suffix = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8)) + + # Combine + run_id = f"{cleaned_testcase}.{timestamp}.{random_suffix}" + + return run_id + + # Original private method calls public method + @staticmethod + def _generate_auto_run_id(testcase: str) -> str: + """Private method, calls public method (maintain backward compatibility)""" + return 
InferConfig.generate_auto_run_id(testcase) \ No newline at end of file diff --git a/infinimetrics/inference/infer_main.py b/infinimetrics/inference/infer_main.py new file mode 100644 index 0000000..2eab84b --- /dev/null +++ b/infinimetrics/inference/infer_main.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python3 +""" +Main entry point for inference evaluation +Usage: +cd ~/InfiniLM # or vLLM directory +python /path/to/infinimetrics/inference/infer_main.py --config config.json +""" + +import argparse +import os +import sys +import json +import logging +import traceback +from pathlib import Path + +# Add current directory to Python path +current_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, current_dir) + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(sys.stdout), + logging.FileHandler('infer_benchmark.log', mode='w') + ] +) + +logger = logging.getLogger(__name__) + +def setup_logging(verbose: bool): + """Set logging level""" + if verbose: + logging.getLogger().setLevel(logging.DEBUG) + logger.debug("Verbose logging enabled") + else: + logging.getLogger().setLevel(logging.INFO) + +def check_current_directory(): + """Check current directory and provide guidance""" + current_dir = os.getcwd() + logger.info(f"Current working directory: {current_dir}") + + # Check if we're in InfiniLM or vLLM directory + possible_frameworks = [] + + if os.path.exists("scripts/jiuge.py"): + possible_frameworks.append("InfiniLM") + + if os.path.exists("vllm") or "site-packages/vllm" in current_dir: + possible_frameworks.append("vLLM") + + if possible_frameworks: + logger.info(f"Detected framework(s): {', '.join(possible_frameworks)}") + else: + logger.warning("No known inference framework detected in current directory") + logger.warning("Please run this script from either:") + logger.warning(" 1. InfiniLM directory (contains scripts/jiuge.py)") + logger.warning(" 2. vLLM directory or vLLM installation directory") + +def load_config(config_file: str): + """Load configuration file""" + from infer_config import InferConfigManager + + if not os.path.exists(config_file): + logger.error(f"Config file not found: {config_file}") + sys.exit(1) + + try: + config = InferConfigManager.load_config(config_file) + logger.info(f"Configuration loaded successfully: {config.run_id}") + return config + except Exception as e: + logger.error(f"Failed to load configuration: {e}") + sys.exit(1) + +def check_dependencies(): + """Check for required dependencies""" + from infer_runner_factory import InferRunnerFactory + + dependencies = InferRunnerFactory.check_dependencies() + + logger.info("Dependency check:") + for dep, available in dependencies.items(): + status = "✓" if available else "✗" + logger.info(f" {status} {dep}") + + # Check for essential dependencies + if not dependencies["numpy"]: + logger.warning("NumPy is not installed. Some statistics may not be available.") + + return dependencies + +def main(): + """Main function""" + parser = argparse.ArgumentParser( + description="Unified Inference Benchmark Framework", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Usage examples: + 1. Run from InfiniLM directory: + cd ~/InfiniLM + python /path/to/infinimetrics/inference/infer_main.py --config config.json + + 2. Run from vLLM directory: + cd ~/vllm + python /path/to/infinimetrics/inference/infer_main.py --config config.json + + 3. 
Enable verbose logging: + python infer_main.py --config config.json --verbose + """ + ) + + parser.add_argument( + "--config", + type=str, + required=False, + help="Path to configuration file (JSON format)" + ) + + parser.add_argument( + "--validate-only", + action="store_true", + help="Only validate configuration, do not execute tests" + ) + + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose logging" + ) + + parser.add_argument( + "--check-deps", + action="store_true", + help="Check dependencies and exit" + ) + + parser.add_argument( + "--output-dir", + type=str, + help="Override output directory from configuration" + ) + + args = parser.parse_args() + + # Set logging level + setup_logging(args.verbose) + + logger.info("=" * 60) + logger.info("Unified Inference Benchmark Framework") + logger.info("=" * 60) + + # Check current directory + check_current_directory() + + # Check dependencies + if args.check_deps: + check_dependencies() + sys.exit(0) + + if not args.config: + parser.error("the following arguments are required: --config") + + # Load configuration + config = load_config(args.config) + + # Override output directory if specified + if args.output_dir: + config.output_dir = args.output_dir + logger.info(f"Output directory overridden: {config.output_dir}") + + # Validate configuration + from infer_config import InferConfigManager + errors = InferConfigManager.validate_config(config) + + if errors: + logger.warning("Configuration validation warnings:") + for error in errors: + logger.warning(f" - {error}") + + if args.validate_only: + logger.info("Configuration validation completed") + if not errors: + logger.info("✓ Configuration is valid") + else: + logger.warning("⚠ Configuration has warnings but may still work") + sys.exit(0) + + # Check dependencies + dependencies = check_dependencies() + + # Check if framework is available + if config.framework.value == "infinilm" and not dependencies["infinilm"]: + logger.error("InfiniLM not detected in current directory") + logger.error("Please run this script from InfiniLM directory") + sys.exit(1) + + if config.framework.value == "vllm" and not dependencies["vllm"]: + logger.error("vLLM not detected or not installed") + logger.error("Please install vLLM or run from vLLM directory") + sys.exit(1) + + try: + # Create Runner and Adapter + from infer_runner_factory import InferRunnerFactory + runner, adapter = InferRunnerFactory.create_runner_and_adapter(config) + + # Run benchmark + logger.info(f"Starting benchmark: {config.run_id}") + logger.info(f"Mode: {config.mode.value}, Framework: {config.framework.value}") + + result_file = runner.run() + + if not isinstance(result_file, str): + logger.error(f"Expected string result file path, got: {type(result_file)}") + if isinstance(result_file, dict): + # Emergency handling: print results directly + logger.info("Results (dict format):") + logger.info(json.dumps(result_file, indent=2)) + # Attempt to save to file + emergency_file = Path(config.output_dir) / "infer" / f"emergency_{config.run_id}_results.json" + with open(emergency_file, 'w') as f: + json.dump(result_file, f, indent=2) + result_file = str(emergency_file) + else: + raise TypeError(f"Result file must be string, got {type(result_file)}") + + logger.info("=" * 60) + logger.info("BENCHMARK COMPLETED SUCCESSFULLY") + logger.info("=" * 60) + logger.info(f"Results saved to: {result_file}") + + # Output result location + result_path = Path(result_file) + if result_path.exists(): + with open(result_path, 'r') as 
f: + result_data = json.load(f) + success = result_data.get("success", 0) + logger.info(f"Benchmark success status: {success}") + else: + logger.warning(f"Result file not found: {result_file}") + + return 0 + + except ImportError as e: + logger.error(f"Import error: {e}") + logger.error("This may be because:") + logger.error(" 1. You're not in the correct framework directory") + logger.error(" 2. The framework is not properly installed") + logger.error(" 3. The adapter implementation is missing") + return 1 + + except Exception as e: + logger.error(f"Benchmark failed with error: {e}") + logger.error(traceback.format_exc()) + return 1 + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/infinimetrics/inference/infer_runner_base.py b/infinimetrics/inference/infer_runner_base.py new file mode 100644 index 0000000..219d272 --- /dev/null +++ b/infinimetrics/inference/infer_runner_base.py @@ -0,0 +1,582 @@ +#!/usr/bin/env python3 +""" +Inference Runner Base Class +Defines common interfaces and template methods for all Runners +""" + +import abc +import csv +import json +import time +import logging +from pathlib import Path +from typing import Dict, Any, List, Optional, Tuple +from datetime import datetime +import re + +from infer_config import InferConfig + +logger = logging.getLogger(__name__) + +class Metric: + """Base metric class""" + + def __init__(self, name: str, metric_type: str, unit: Optional[str] = None): + self.name = name + self.type = metric_type + self.unit = unit + self.value = None + self.raw_data_url = None + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + result = { + "name": self.name, + "type": self.type, + "unit": self.unit + } + + if self.type == "scalar": + result["value"] = self.value + elif self.type == "timeseries": + result["raw_data_url"] = self.raw_data_url + + return result + +class ScalarMetric(Metric): + """Scalar metric""" + + def __init__(self, name: str, value: Any, unit: Optional[str] = None): + super().__init__(name, "scalar", unit) + self.value = value + + +class TimeseriesMetric(Metric): + """Time series metric""" + + def __init__(self, name: str, raw_data_url: str, unit: Optional[str] = None): + super().__init__(name, "timeseries", unit) + self.raw_data_url = raw_data_url + +class BenchmarkResult: + """Benchmark result container""" + + def __init__(self): + self.metrics: List[Metric] = [] + self.start_time: Optional[float] = None + self.end_time: Optional[float] = None + + # Performance data + self.latency_data: List[float] = [] # Latency data (milliseconds) + self.ttft_data: List[float] = [] # Time to first token data (milliseconds) + self.throughput_data: List[float] = [] # Throughput data + + # Additional data + self.peak_memory_usage: Optional[float] = None # Peak memory usage (GB) + self.total_tokens: int = 0 # Total token count + self.success_rate: float = 1.0 # Success rate + + def add_metric(self, metric: Metric): + """Add a metric""" + self.metrics.append(metric) + + def add_latency(self, latency_ms: float): + """Add latency data""" + self.latency_data.append(latency_ms) + + def add_ttft(self, ttft_ms: float): + """Add TTFT data""" + self.ttft_data.append(ttft_ms) + + def add_throughput(self, throughput: float): + """Add throughput data""" + self.throughput_data.append(throughput) + +class InferRunnerBase(abc.ABC): + """Inference Runner base class (Template Method Pattern)""" + + def __init__(self, config: InferConfig, adapter): + self.config = config + self.adapter = adapter + 
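+        # self.result below accumulates the raw per-request samples (latency, TTFT,
+        # throughput); run() drives setup()/execute()/collect_metrics() in order and
+        # then persists the CSV timeseries and the summary JSON.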
self.result = BenchmarkResult() + self.infer_dir: Optional[Path] = None + + def prepare_output_dir(self) -> Path: + """Prepare output directory""" + output_dir = Path(self.config.output_dir) + self.infer_dir = output_dir / "infer" + + # Create directory + self.infer_dir.mkdir(parents=True, exist_ok=True) + + logger.info(f"Output directory prepared: {self.infer_dir}") + return self.infer_dir + + def save_timeseries_data(self): + """Save time series data to CSV files""" + if not self.infer_dir: + raise ValueError("Output directory not prepared") + + # Save latency data + if self.result.latency_data: + # ✅ Clean filename + safe_run_id = self._sanitize_filename(self.config.run_id) + latency_file = self.infer_dir / f"{safe_run_id}_infer_latency.csv" + + with open(latency_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['timestamp', 'latency_ms']) + for i, latency in enumerate(self.result.latency_data): + writer.writerow([i, latency]) + + # Add different metrics based on mode + if self.config.mode.value == "direct": + metric_name = "infer.compute_latency" + else: + metric_name = "infer.e2e_latency" + + self.result.add_metric(TimeseriesMetric( + name=metric_name, + raw_data_url=f"./infer/{latency_file.name}", + unit="ms" + )) + + # Save TTFT data + if self.result.ttft_data: + safe_run_id = self._sanitize_filename(self.config.run_id) + ttft_file = self.infer_dir / f"{safe_run_id}_infer_ttft.csv" + + with open(ttft_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['timestamp', 'ttft_ms']) + for i, ttft in enumerate(self.result.ttft_data): + writer.writerow([i, ttft]) + + self.result.add_metric(TimeseriesMetric( + name="infer.ttft", + raw_data_url=f"./infer/{ttft_file.name}", + unit="ms" + )) + + # Save throughput data + if self.result.throughput_data: + safe_run_id = self._sanitize_filename(self.config.run_id) + throughput_file = self.infer_dir / f"{safe_run_id}_infer_throughput.csv" + + with open(throughput_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['timestamp', 'throughput']) + for i, throughput in enumerate(self.result.throughput_data): + writer.writerow([i, throughput]) + + if self.config.mode.value == "direct": + metric_name = "infer.direct_throughput_tps" + unit = "tokens/s/gpu" + else: + metric_name = "infer.response_per_second" + unit = "requests/s" + + self.result.add_metric(TimeseriesMetric( + name=metric_name, + raw_data_url=f"./infer/{throughput_file.name}", + unit=unit + )) + + def calculate_statistics(self) -> Dict[str, float]: + """Calculate statistical information""" + stats = {} + + try: + import numpy as np + + # Calculate latency statistics + if self.result.latency_data: + stats['avg_latency'] = np.mean(self.result.latency_data) + stats['p50_latency'] = np.percentile(self.result.latency_data, 50) + stats['p95_latency'] = np.percentile(self.result.latency_data, 95) + stats['p99_latency'] = np.percentile(self.result.latency_data, 99) + stats['min_latency'] = np.min(self.result.latency_data) + stats['max_latency'] = np.max(self.result.latency_data) + stats['std_latency'] = np.std(self.result.latency_data) + + # Calculate TTFT statistics + if self.result.ttft_data: + stats['avg_ttft'] = np.mean(self.result.ttft_data) + stats['p50_ttft'] = np.percentile(self.result.ttft_data, 50) + stats['p95_ttft'] = np.percentile(self.result.ttft_data, 95) + stats['p99_ttft'] = np.percentile(self.result.ttft_data, 99) + stats['min_ttft'] = np.min(self.result.ttft_data) + stats['max_ttft'] = np.max(self.result.ttft_data) + + 
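+                # Note: np.percentile above interpolates linearly between samples by
+                # default, e.g. np.percentile([10, 20, 30, 40], 95) evaluates to 38.5
+                # rather than 40.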
# Calculate throughput statistics + if self.result.throughput_data: + stats['avg_throughput'] = np.mean(self.result.throughput_data) + stats['max_throughput'] = np.max(self.result.throughput_data) + stats['min_throughput'] = np.min(self.result.throughput_data) + + # Calculate total time and throughput rate + if self.result.start_time and self.result.end_time: + total_duration = self.result.end_time - self.result.start_time + stats['total_duration'] = total_duration + + if self.config.mode.value == "direct": + if self.result.total_tokens > 0 and total_duration > 0: + stats['overall_throughput'] = self.result.total_tokens / total_duration + + elif self.config.mode.value == "service": + if len(self.result.latency_data) > 0 and total_duration > 0: + stats['requests_per_second'] = len(self.result.latency_data) / total_duration + + # Success rate + stats['success_rate'] = self.result.success_rate + + except ImportError: + logger.warning("NumPy not available, skipping statistics calculation") + + return stats + + def dump_json(self) -> str: + """Output JSON results - ensure string path is returned""" + if not self.infer_dir: + raise ValueError("Output directory not prepared") + + safe_run_id = self._sanitize_filename(self.config.run_id) + json_filename = f"{safe_run_id}_results.json" + + config_dict = { + "run_id": self.config.run_id, + "testcase": self.config.testcase, + "success": 1 if self.result.success_rate >= 0.95 else 0, + "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "config": { + "command": self._build_command_string(), + "framework": self.config.framework.value, + "model": self.config.model, + "model_config": self.config.model_config, + "train_dataset": self.config.train_dataset, + "validation_dataset": self.config.validation_dataset, + "test_dataset": self.config.test_dataset, + "infer_args": self.config.infer_args.to_dict() if hasattr(self.config.infer_args, 'to_dict') else {}, + "warmup_iterations": self.config.warmup_iterations, + "measured_iterations": self.config.measured_iterations + }, + "metrics": [] + } + + # ✅ 1. First add all existing metrics + for metric in self.result.metrics: + config_dict["metrics"].append(metric.to_dict()) + + # ✅ 2. 
Check and add missing metrics + required_metrics = { + "direct": [ + ("infer.peak_memory_usage", "scalar", "GB"), + ("infer.compute_latency", "timeseries", "ms"), + ("infer.ttft", "timeseries", "ms"), + ("infer.direct_throughput_tps", "timeseries", "tokens/s/gpu") + ], + "service": [ + ("infer.accuracy_mmlu", "scalar", None), + ("infer.peak_memory_usage", "scalar", "GB"), + ("infer.e2e_latency", "timeseries", "ms"), + ("infer.ttft", "timeseries", "ms"), + ("infer.response_per_second", "timeseries", None), + ("infer.compute_latency", "timeseries", "ms"), + ("infer.max_throughput_tps", "timeseries", "tokens/s/gpu"), + ("infer.success_rate", "scalar", "%") + ] + } + + mode = self.config.mode.value + if mode in required_metrics: + for metric_name, metric_type, unit in required_metrics[mode]: + # Check if already exists + if not any(m.get('name') == metric_name for m in config_dict['metrics']): + logger.debug(f"Adding missing metric: {metric_name}") + + if metric_name == "infer.peak_memory_usage": + # Special handling: try to get peak memory + peak_memory = self.result.peak_memory_usage + if peak_memory is None: + try: + peak_memory = self.adapter.get_peak_memory_usage() + except Exception as e: + logger.warning(f"Failed to get peak memory usage: {e}") + peak_memory = 0.0 + + # ✅ Ensure .to_dict() is called + config_dict["metrics"].append(ScalarMetric( + name=metric_name, + value=peak_memory, + unit=unit + ).to_dict()) + + elif metric_name == "infer.success_rate": + # Special handling: success rate + success_rate = self.result.success_rate + config_dict["metrics"].append(ScalarMetric( + name=metric_name, + value=success_rate * 100, # Convert to percentage + unit=unit + ).to_dict()) + + elif metric_type == "scalar": + # Other scalar metrics (placeholder) + config_dict["metrics"].append(ScalarMetric( + name=metric_name, + value=0.0, # Placeholder value + unit=unit + ).to_dict()) # ✅ Ensure .to_dict() is called + + elif metric_type == "timeseries": + # Time series metrics (create placeholder file) + file_created = self._ensure_metric_file(metric_name, safe_run_id) + if file_created: + raw_data_url = f"./infer/{safe_run_id}_{metric_name.replace('.', '_')}.csv" + config_dict["metrics"].append({ + "name": metric_name, + "type": "timeseries", + "raw_data_url": raw_data_url, + "unit": unit + }) + + # ✅ 3. Ensure no duplicate metrics + seen_names = set() + unique_metrics = [] + for metric in config_dict['metrics']: + name = metric.get('name') + if name and name not in seen_names: + seen_names.add(name) + unique_metrics.append(metric) + elif name: + logger.warning(f"Duplicate metric found: {name}") + + config_dict['metrics'] = unique_metrics + + # ✅ 4. Print debug information + logger.info(f"Total metrics in JSON: {len(config_dict['metrics'])}") + for metric in config_dict['metrics']: + metric_name = metric.get('name', 'unknown') + metric_type = metric.get('type', 'unknown') + logger.debug(f" - {metric_name} ({metric_type})") + + # ✅ 5. 
Save file + json_file = self.infer_dir / json_filename + with open(json_file, 'w', encoding='utf-8') as f: + json.dump(config_dict, f, indent=2, ensure_ascii=False) + + logger.info(f"Results saved to: {json_file}") + return str(json_file) + + def _ensure_metric_file(self, metric_name: str, safe_run_id: str) -> bool: + """Ensure metric file exists, create placeholder if not""" + if not self.infer_dir: + return False + + # Map metric name to filename + file_mapping = { + "infer.e2e_latency": f"{safe_run_id}_infer_latency.csv", + "infer.compute_latency": f"{safe_run_id}_infer_compute_latency.csv", + "infer.ttft": f"{safe_run_id}_infer_ttft.csv", + "infer.response_per_second": f"{safe_run_id}_infer_throughput.csv", + "infer.max_throughput_tps": f"{safe_run_id}_infer_max_throughput.csv", + "infer.direct_throughput_tps": f"{safe_run_id}_infer_direct_throughput.csv" + } + + if metric_name not in file_mapping: + return False + + filename = file_mapping[metric_name] + file_path = self.infer_dir / filename + + if not file_path.exists(): + try: + with open(file_path, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['timestamp', 'value']) + writer.writerow([0, 0.0]) # Placeholder data + logger.debug(f"Created placeholder file: {filename}") + return True + except Exception as e: + logger.warning(f"Failed to create placeholder file {filename}: {e}") + return False + + return True + + def _create_max_throughput_placeholder(self): + if not self.infer_dir: + return + + safe_run_id = self._sanitize_filename(self.config.run_id) + max_throughput_file = self.infer_dir / f"{safe_run_id}_infer_max_throughput.csv" + + with open(max_throughput_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['timestamp', 'throughput']) + writer.writerow([0, 0.0]) + + def _create_compute_latency_placeholder(self): + if not self.infer_dir: + return + + safe_run_id = self._sanitize_filename(self.config.run_id) + compute_latency_file = self.infer_dir / f"{safe_run_id}_infer_compute_latency.csv" + + with open(compute_latency_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['timestamp', 'latency_ms']) + writer.writerow([0, 0.0]) + + def _build_command_string(self) -> str: + """Build command string""" + cmd_parts = [] + + if self.config.framework.value == "infinilm": + cmd_parts.append("python scripts/jiuge.py --nvidia") + cmd_parts.append(self.config.model_path) + cmd_parts.append(str(self.config.infer_args.parallel.tp)) + + # Add batch size parameter (if jiuge.py supports it) + if self.config.mode.value == "direct": + if hasattr(self.config.infer_args, 'static_batch_size'): + cmd_parts.append(f"--batch-size {self.config.infer_args.static_batch_size}") + + else: # vllm + if self.config.mode.value == "direct": + cmd_parts.append("python -m vllm.benchmarks.benchmark_throughput") + cmd_parts.append(f"--model {self.config.model_path}") + if hasattr(self.config.infer_args, 'static_batch_size'): + cmd_parts.append(f"--batch-size {self.config.infer_args.static_batch_size}") + else: # service + cmd_parts.append("python -m vllm.entrypoints.api_server") + cmd_parts.append(f"--model {self.config.model_path}") + cmd_parts.append(f"--port 8000") + + return " ".join(cmd_parts) + + def _sanitize_filename(self, filename: str) -> str: + """ + Clean filename by removing special characters + + Args: + filename: Original filename + + Returns: + Cleaned filename + """ + # Replace special characters with underscore + sanitized = re.sub(r'[^\w\-_.]', '_', filename) + # Remove consecutive 
underscores + sanitized = re.sub(r'_+', '_', sanitized) + # Remove leading/trailing underscores + sanitized = sanitized.strip('_') + + # Ensure reasonable filename length + max_length = 255 + if len(sanitized) > max_length: + # Keep first 100 chars and last 155 chars (add ... in between) + prefix = sanitized[:100] + suffix = sanitized[-155:] if len(sanitized) > 255 else "" + sanitized = f"{prefix}...{suffix}" + + return sanitized + + def run(self) -> str: + """Run template method - ensure string path is returned""" + logger.info(f"Starting inference benchmark: {self.config.run_id}") + logger.info(f"Testcase: {self.config.testcase}") + logger.info(f"Framework: {self.config.framework.value}") + logger.info(f"Mode: {self.config.mode.value}") + + try: + # Prepare output directory + self.prepare_output_dir() + + # Record start time + self.result.start_time = time.time() + + # Template method: to be implemented by subclasses + self.setup() + self.execute() + self.collect_metrics() + + # Record end time + self.result.end_time = time.time() + + # Save data + self.save_timeseries_data() + + # Output JSON + result_file = self.dump_json() + + # ✅ Double check: ensure string is returned + if not isinstance(result_file, str): + logger.error(f"dump_json() returned {type(result_file)} instead of str") + # Try conversion + try: + result_file = str(result_file) + except: + # If conversion fails, create default file + safe_run_id = self._sanitize_filename(self.config.run_id) + default_file = self.infer_dir / f"{safe_run_id}_emergency_results.json" + result_file = str(default_file) + logger.warning(f"Created emergency result file: {result_file}") + + # Print statistics + self._print_statistics() + + logger.info(f"Benchmark completed successfully: {self.config.run_id}") + + # ✅ Final ensure string return + return str(result_file) + + except Exception as e: + logger.error(f"Benchmark failed: {e}", exc_info=True) + raise + + def _print_statistics(self): + """Print statistical information""" + stats = self.calculate_statistics() + + logger.info("=" * 60) + logger.info("BENCHMARK STATISTICS") + logger.info("=" * 60) + + if 'avg_latency' in stats: + logger.info(f"Average latency: {stats['avg_latency']:.2f} ms") + if 'p95_latency' in stats: + logger.info(f"P95 latency: {stats['p95_latency']:.2f} ms") + + if 'avg_ttft' in stats: + logger.info(f"Average TTFT: {stats['avg_ttft']:.2f} ms") + if 'p95_ttft' in stats: + logger.info(f"P95 TTFT: {stats['p95_ttft']:.2f} ms") + + if 'avg_throughput' in stats: + if self.config.mode.value == "direct": + logger.info(f"Average throughput: {stats['avg_throughput']:.2f} tokens/s/gpu") + else: + logger.info(f"Average throughput: {stats['avg_throughput']:.2f} requests/s") + + if 'success_rate' in stats: + logger.info(f"Success rate: {stats['success_rate']:.2%}") + + if 'total_duration' in stats: + logger.info(f"Total duration: {stats['total_duration']:.2f} s") + + logger.info("=" * 60) + + @abc.abstractmethod + def setup(self) -> None: + """Set up execution environment (to be implemented by subclasses)""" + pass + + @abc.abstractmethod + def execute(self) -> None: + """Execute inference test (to be implemented by subclasses)""" + pass + + @abc.abstractmethod + def collect_metrics(self) -> None: + """Collect performance metrics (to be implemented by subclasses)""" + pass \ No newline at end of file diff --git a/infinimetrics/inference/infer_runner_factory.py b/infinimetrics/inference/infer_runner_factory.py new file mode 100644 index 0000000..fb44743 --- /dev/null +++ 
b/infinimetrics/inference/infer_runner_factory.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +""" +Inference Runner Factory Class +Creates appropriate Runner and Adapter based on configuration +""" + +import logging +import sys +import os +from typing import Dict, Tuple + +from infer_config import InferConfig, InferMode, FrameworkType +from infer_runner_base import InferRunnerBase +from adapter_base import InferAdapter + +logger = logging.getLogger(__name__) + +class InferRunnerFactory: + """Inference Runner Factory""" + + @staticmethod + def create_runner_and_adapter(config: InferConfig) -> Tuple[InferRunnerBase, InferAdapter]: + """ + Create Runner and Adapter + + Returns: (runner, adapter) + """ + # Create adapter based on framework + adapter = InferRunnerFactory.create_adapter(config) + + # Create runner based on mode + runner = InferRunnerFactory.create_runner(config, adapter) + + return runner, adapter + + @staticmethod + def create_adapter(config: InferConfig) -> InferAdapter: + """Create adapter""" + logger.info(f"Creating adapter for framework: {config.framework.value}") + + if config.framework == FrameworkType.INFINILM: + try: + from adapters.infinilm_adapter import InfiniLMAdapter + adapter = InfiniLMAdapter(config) + logger.info("InfiniLMAdapter created successfully") + except ImportError as e: + logger.error(f"Failed to import InfiniLMAdapter: {e}") + raise ImportError("InfiniLMAdapter is not available. Please check if InfiniLM is installed.") + + elif config.framework == FrameworkType.VLLM: + try: + from adapters.vllm_adapter import VLLMAdapter + adapter = VLLMAdapter(config) + logger.info("VLLMAdapter created successfully") + except ImportError as e: + logger.error(f"Failed to import VLLMAdapter: {e}") + raise ImportError("VLLMAdapter is not available. 
Please check if vLLM is installed.") + + else: + raise ValueError(f"Unsupported framework: {config.framework}") + + # Validate adapter configuration + errors = adapter.validate_config() + if errors: + error_msg = "Adapter configuration errors:\n" + "\n".join(f" - {e}" for e in errors) + logger.warning(error_msg) + + return adapter + + @staticmethod + def create_runner(config: InferConfig, adapter: InferAdapter) -> InferRunnerBase: + """Create runner""" + logger.info(f"Creating runner for mode: {config.mode.value}") + + if config.mode == InferMode.DIRECT: + try: + from direct_infer_runner import DirectInferRunner + runner = DirectInferRunner(config, adapter) + logger.info("DirectInferRunner created successfully") + except ImportError as e: + logger.error(f"Failed to import DirectInferRunner: {e}") + raise + + elif config.mode == InferMode.SERVICE: + try: + from service_infer_runner import ServiceInferRunner + runner = ServiceInferRunner(config, adapter) + logger.info("ServiceInferRunner created successfully") + except ImportError as e: + logger.error(f"Failed to import ServiceInferRunner: {e}") + raise + + else: + raise ValueError(f"Unsupported inference mode: {config.mode}") + + return runner + + @staticmethod + def check_dependencies() -> Dict[str, bool]: + """Check dependencies""" + dependencies = { + "infinilm": False, + "vllm": False, + "numpy": False, + "torch": False + } + + # Check InfiniLM + try: + # Try importing InfiniLM related modules + import sys + import os + + # Check if in InfiniLM directory + if os.path.exists("scripts/jiuge.py"): + dependencies["infinilm"] = True + else: + # Try to determine via environment variable + infinilm_path = os.environ.get("INFINILM_PATH", "") + if infinilm_path and os.path.exists(os.path.join(infinilm_path, "scripts/jiuge.py")): + dependencies["infinilm"] = True + except: + pass + + # Check vLLM + try: + import vllm + dependencies["vllm"] = True + except ImportError: + pass + + # Check numpy + try: + import numpy + dependencies["numpy"] = True + except ImportError: + pass + + # Check torch + try: + import torch + dependencies["torch"] = True + except ImportError: + pass + + return dependencies diff --git a/infinimetrics/inference/service_infer_runner.py b/infinimetrics/inference/service_infer_runner.py new file mode 100644 index 0000000..20f37ad --- /dev/null +++ b/infinimetrics/inference/service_infer_runner.py @@ -0,0 +1,438 @@ +#!/usr/bin/env python3 +""" +Service Inference Runner Implementation +Start service and run trace testing +""" + +import asyncio +import logging +import time +import json +from pathlib import Path +from typing import Dict, Any, List, Optional +from utils.gpu_monitor import create_gpu_monitor + +from infer_runner_base import InferRunnerBase, TimeseriesMetric, ScalarMetric +from infer_config import InferConfig, ServiceInferArgs +from utils.trace_client import TraceClient, TraceClientConfig, RequestTrace +from utils.prompt_generator import create_prompt_generator + +logger = logging.getLogger(__name__) + +class ServiceInferRunner(InferRunnerBase): + """Service Inference Runner""" + + def __init__(self, config: InferConfig, adapter): + super().__init__(config, adapter) + self.infer_args: ServiceInferArgs = config.infer_args + + # Trace related + self.traces: List[RequestTrace] = [] + self.trace_stats: Dict[str, Any] = {} + + # Add GPU monitor + self.gpu_monitor = None + + logger.info(f"ServiceInferRunner created for trace: {self.infer_args.request_trace}") + logger.info(f"Concurrency: {self.infer_args.concurrency}") + 
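+        # concurrency caps the number of in-flight requests the trace client keeps
+        # open; max_seq_len is used below in _validate_trace_data() to sanity-check
+        # trace token counts against the model's limit.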
logger.info(f"Max sequence length: {self.infer_args.max_seq_len}") + + def setup(self) -> None: + """Set up service inference environment""" + logger.info("Setting up service inference environment") + + # 1. Create GPU monitor + device_ids = self.config.device.device_ids + if self.config.device.cpu_only: + logger.info("CPU-only mode, GPU monitoring disabled") + self.gpu_monitor = None + else: + self.gpu_monitor = create_gpu_monitor( + gpu_platform=self.config.device.gpu_platform, + device_ids=device_ids + ) + + # 2. Start GPU monitoring + if self.gpu_monitor: + self.gpu_monitor.start_monitoring() + logger.info(f"GPU monitoring started for devices: {device_ids}") + + # 3. Load trace file + self._load_trace_data() + + # 4. Launch inference service + logger.info(f"Launching inference service on port 8000") + + try: + # Use adapter to launch service + self.adapter.launch_service(port=8000) + + # Wait for service readiness + max_wait_time = 120 # Maximum 120 seconds + wait_interval = 3 # Check every 3 seconds + + logger.info("Waiting for service to be ready...") + for i in range(max_wait_time // wait_interval): + if self.adapter.is_service_ready(port=8000): + logger.info("Inference service is ready") + return + + logger.info(f" Waiting... ({i * wait_interval}s elapsed)") + time.sleep(wait_interval) + + raise TimeoutError("Inference service failed to start within timeout") + + except Exception as e: + logger.error(f"Failed to setup service: {e}") + + # Stop GPU monitoring + if self.gpu_monitor: + self.gpu_monitor.stop_monitoring() + + # Ensure service is stopped + try: + self.adapter.stop_service() + except: + pass + + raise + + def execute(self) -> None: + """Execute service inference test""" + logger.info("Executing service inference benchmark") + + # Run asynchronous trace test + asyncio.run(self._run_trace_async()) + + async def _run_trace_async(self): + """Asynchronously run trace test""" + try: + # 1. Create trace client configuration + client_config = TraceClientConfig( + api_url="http://localhost:8000", + model_name=self.config.model, + timeout_ms=self.infer_args.timeout_ms, + warmup_requests=min(10, len(self.traces) // 10) # 10% of requests for warmup + ) + + # 2. Create prompt generator + # First get tokenizer (if adapter has loaded model) + tokenizer = None + if hasattr(self.adapter, 'tokenizer') and self.adapter.tokenizer: + tokenizer = self.adapter.tokenizer + + prompt_generator = create_prompt_generator( + tokenizer=tokenizer, + method="random" # Use random tokens + ) + + # 3. Load trace data + self.traces = TraceClient.load_trace_file( + self.infer_args.request_trace, + prompt_generator + ) + + # 4. 
Use trace client to run test + async with TraceClient(client_config) as client: + # Run trace + processed_traces, stats = await client.run_trace( + traces=self.traces, + concurrency=self.infer_args.concurrency, + warmup_requests=client_config.warmup_requests + ) + + # Save results + self.traces = processed_traces + self.trace_stats = stats + + # Save trace results to CSV + if self.infer_dir: + client.save_results_to_csv( + processed_traces, + self.infer_dir, + self.config.run_id + ) + + except Exception as e: + logger.error(f"Trace test failed: {e}", exc_info=True) + raise + + finally: + # ✅ Stop GPU monitoring + if self.gpu_monitor: + self.gpu_monitor.stop_monitoring() + peak_memory_gb = self.gpu_monitor.get_peak_memory_gb() + logger.info(f"Peak GPU memory usage during test: {peak_memory_gb} GB") + self.result.peak_memory_usage = peak_memory_gb + + def collect_metrics(self) -> None: + """Collect service inference metrics""" + logger.info("Collecting service inference metrics") + + # Extract data from trace statistics + if self.trace_stats: + # TTFT data + ttfts = [] + for trace in self.traces: + if trace.success and trace.ttft is not None: + ttfts.append(trace.ttft) + self.result.add_ttft(trace.ttft) + + # E2E latency data + e2e_latencies = [] + for trace in self.traces: + if trace.success and trace.e2e_latency is not None: + e2e_latencies.append(trace.e2e_latency) + self.result.add_latency(trace.e2e_latency) + + # Throughput data (requests/s) + if self.trace_stats.get('total_duration', 0) > 0: + rps = self.trace_stats.get('requests_per_second', 0) + self.result.add_throughput(rps) + + # Success rate + success_rate = self.trace_stats.get('success_rate', 0) + self.result.success_rate = success_rate + + # Total tokens + total_tokens = self.trace_stats.get('total_tokens', 0) + self.result.total_tokens = total_tokens + + # Add scalar metrics + if 'avg_ttft' in self.trace_stats: + self.result.add_metric(ScalarMetric( + name="infer.avg_ttft", + value=self.trace_stats['avg_ttft'], + unit="ms" + )) + + if 'avg_e2e_latency' in self.trace_stats: + self.result.add_metric(ScalarMetric( + name="infer.avg_e2e_latency", + value=self.trace_stats['avg_e2e_latency'], + unit="ms" + )) + + if 'throughput_tps' in self.trace_stats: + self.result.add_metric(ScalarMetric( + name="infer.avg_throughput_tps", + value=self.trace_stats['throughput_tps'], + unit="tokens/s" + )) + + # Success rate metric + self.result.add_metric(ScalarMetric( + name="infer.success_rate", + value=success_rate * 100, # Convert to percentage + unit="%" + )) + + # Total requests + self.result.add_metric(ScalarMetric( + name="infer.total_requests", + value=self.trace_stats.get('total_requests', 0), + unit="requests" + )) + + # Record peak memory usage (if available) + peak_memory = self.adapter.get_peak_memory_usage() + if peak_memory: + self.result.peak_memory_usage = peak_memory + logger.info(f"Peak GPU memory usage: {peak_memory:.2f} GB") + + else: + logger.warning("No trace statistics available") + + def _load_trace_data(self): + """Load trace data""" + trace_file = self.infer_args.request_trace + + if not Path(trace_file).exists(): + raise FileNotFoundError(f"Trace file not found: {trace_file}") + + logger.info(f"Loading trace data from: {trace_file}") + + # Create temporary prompt generator + from utils.prompt_generator import create_prompt_generator + temp_prompt_generator = create_prompt_generator(method="random") + + # Use trace client's method to load trace file + self.traces = TraceClient.load_trace_file( + trace_file, + 
temp_prompt_generator + ) + + logger.info(f"Loaded {len(self.traces)} requests from trace file") + + # Validate trace data + self._validate_trace_data() + + def _validate_trace_data(self): + """Validate trace data""" + if not self.traces: + raise ValueError("No trace data loaded") + + # Check if maximum tokens exceed model limits + max_input_tokens = max(t.input_token_num for t in self.traces) + max_output_tokens = max(t.output_token_num for t in self.traces) + + if max_input_tokens > self.infer_args.max_seq_len: + logger.warning(f"Max input tokens ({max_input_tokens}) exceeds max_seq_len " + f"({self.infer_args.max_seq_len})") + + if max_output_tokens > self.infer_args.max_seq_len: + logger.warning(f"Max output tokens ({max_output_tokens}) exceeds max_seq_len " + f"({self.infer_args.max_seq_len})") + + # Check timestamp order + timestamps = [t.arrival_timestamp_ms for t in self.traces] + if not all(timestamps[i] <= timestamps[i+1] for i in range(len(timestamps)-1)): + logger.warning("Trace timestamps are not sorted. Sorting now...") + self.traces.sort(key=lambda x: x.arrival_timestamp_ms) + + def dump_json(self) -> str: + """Override dump_json to generate standard format metrics - fixed version""" + if not self.infer_dir: + raise ValueError("Output directory not prepared") + + # Use base class method to generate basic JSON + json_file = super().dump_json() + + # Read base class generated JSON + with open(json_file, 'r', encoding='utf-8') as f: + data = json.load(f) + + # Build standard metrics array + standard_metrics = [] + + # 1. accuracy_mmlu (placeholder) + standard_metrics.append({ + "name": "infer.accuracy_mmlu", + "type": "scalar", + "value": None, # Placeholder, needs actual calculation + "unit": None + }) + + # 2. e2e_latency (timeseries) + latency_file = self.infer_dir / f"{self.config.run_id}_infer_latency.csv" + if latency_file.exists(): + standard_metrics.append({ + "name": "infer.e2e_latency", + "type": "timeseries", + "raw_data_url": f"./infer/{latency_file.name}", + "unit": "ms" + }) + + # 3. ttft (timeseries) + ttft_file = self.infer_dir / f"{self.config.run_id}_infer_ttft.csv" + if ttft_file.exists(): + standard_metrics.append({ + "name": "infer.ttft", + "type": "timeseries", + "raw_data_url": f"./infer/{ttft_file.name}", + "unit": "ms" + }) + + # 4. peak_memory_usage (scalar) - Use GPU monitor to get real data + # Get peak memory usage + peak_memory = None + + if self.gpu_monitor: + try: + peak_memory = self.gpu_monitor.get_peak_memory_gb() + logger.info(f"Real peak GPU memory usage: {peak_memory} GB") + except Exception as e: + logger.warning(f"Failed to get peak memory from GPU monitor: {e}") + peak_memory = 0.0 + else: + logger.warning("GPU monitor not available, using 0.0 GB") + + standard_metrics.append({ + "name": "infer.peak_memory_usage", + "type": "scalar", + "value": peak_memory, + "unit": "GB" + }) + + # 5. response_per_second (timeseries) + response_file = self.infer_dir / f"{self.config.run_id}_infer_throughput.csv" + if response_file.exists(): + standard_metrics.append({ + "name": "infer.response_per_second", + "type": "timeseries", + "raw_data_url": f"./infer/{response_file.name}", + "unit": None + }) + else: + # Add placeholder if no file + standard_metrics.append({ + "name": "infer.response_per_second", + "type": "timeseries", + "raw_data_url": None, + "unit": None + }) + + # 6. 
compute_latency (timeseries - placeholder) + # Service mode may not have compute_latency file, add placeholder + compute_latency_file = self.infer_dir / f"{self.config.run_id}_infer_compute_latency.csv" + if compute_latency_file.exists(): + standard_metrics.append({ + "name": "infer.compute_latency", + "type": "timeseries", + "raw_data_url": f"./infer/{compute_latency_file.name}", + "unit": "ms" + }) + else: + standard_metrics.append({ + "name": "infer.compute_latency", + "type": "timeseries", + "raw_data_url": None, + "unit": "ms" + }) + + # 7. max_throughput_tps (timeseries) + max_throughput_file = self.infer_dir / f"{self.config.run_id}_infer_max_throughput.csv" + if max_throughput_file.exists(): + standard_metrics.append({ + "name": "infer.max_throughput_tps", + "type": "timeseries", + "raw_data_url": f"./infer/{max_throughput_file.name}", + "unit": "tokens/s/gpu" + }) + else: + # Calculate max throughput from throughput data + if hasattr(self.result, 'throughput_data') and self.result.throughput_data: + max_throughput = max(self.result.throughput_data) + standard_metrics.append({ + "name": "infer.max_throughput_tps", + "type": "scalar", + "value": max_throughput, + "unit": "tokens/s/gpu" + }) + else: + standard_metrics.append({ + "name": "infer.max_throughput_tps", + "type": "scalar", + "value": 0.0, + "unit": "tokens/s/gpu" + }) + + # Update metrics in data + data["metrics"] = standard_metrics + + # Save back to file + with open(json_file, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False) + + logger.info(f"Service metrics saved to: {json_file}") + return str(json_file) + + def cleanup(self): + """Clean up resources""" + logger.info("Cleaning up service inference resources") + + # Stop inference service + try: + self.adapter.stop_service() + except Exception as e: + logger.warning(f"Error stopping service: {e}") \ No newline at end of file diff --git a/infinimetrics/inference/utils/gpu_monitor.py b/infinimetrics/inference/utils/gpu_monitor.py new file mode 100644 index 0000000..a7286ca --- /dev/null +++ b/infinimetrics/inference/utils/gpu_monitor.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +""" +GPU Monitoring Module - Revised Version +""" +import subprocess +import threading +import time +from abc import ABC, abstractmethod +import logging + +logger = logging.getLogger(__name__) + +class GPUMonitor(ABC): + """Abstract base class for GPU monitoring""" + + def __init__(self, device_ids=None): + """ + Initialize GPU monitor + + Args: + device_ids: List of GPU device IDs to monitor. If None, monitor all GPUs. 
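+
+        Example (illustrative usage, assuming an NVIDIA platform)::
+
+            monitor = create_gpu_monitor("nvidia", device_ids=[0, 1])
+            monitor.start_monitoring()
+            # ... run the benchmark workload ...
+            monitor.stop_monitoring()
+            peak_gb = monitor.get_peak_memory_gb()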
+ """ + self.device_ids = device_ids + self.peak_memory_mib = 0 # Peak memory usage (MiB) + self.monitor_thread = None + self._stop_monitoring_flag = False # Renamed variable for fix + self.poll_interval = 0.5 # Polling interval (seconds) + + logger.info(f"GPUMonitor initialized for devices: {device_ids}") + + @abstractmethod + def get_current_memory_usage(self): + """Get current GPU memory usage for all devices""" + pass + + def start_monitoring(self): + """Start monitoring GPU memory usage""" + if self.monitor_thread and self.monitor_thread.is_alive(): + logger.warning("Monitoring already started") + return + + self._stop_monitoring_flag = False + self.peak_memory_mib = 0 + + def monitor_loop(): + logger.info("GPU monitoring started") + while not self._stop_monitoring_flag: + try: + current_mem = self.get_current_memory_usage() + if current_mem: + current_peak = max(current_mem) + if current_peak > self.peak_memory_mib: + self.peak_memory_mib = current_peak + logger.debug(f"New peak memory: {self.peak_memory_mib} MiB") + except Exception as e: + logger.debug(f"Error getting GPU memory: {e}") + + time.sleep(self.poll_interval) + + logger.info("GPU monitoring stopped") + + self.monitor_thread = threading.Thread(target=monitor_loop, daemon=True) + self.monitor_thread.start() + + def stop_monitoring(self): + """Stop monitoring GPU memory usage""" + self._stop_monitoring_flag = True + if self.monitor_thread: + self.monitor_thread.join(timeout=5.0) + if self.monitor_thread.is_alive(): + logger.warning("Monitor thread did not stop gracefully") + + def get_peak_memory_gb(self): + """Get peak memory usage in GB""" + return round(self.peak_memory_mib / 1024.0, 6) + + def get_peak_memory_mib(self): + """Get peak memory usage in MiB""" + return self.peak_memory_mib + + def get_peak_memory_per_device(self): + """Get peak memory usage per device""" + total_gb = self.get_peak_memory_gb() + if self.device_ids: + per_device = total_gb / len(self.device_ids) + return {device_id: per_device for device_id in self.device_ids} + else: + return {"all": total_gb} + +class NVIDIAGPUMonitor(GPUMonitor): + """NVIDIA GPU monitor implementation""" + + def get_current_memory_usage(self): + try: + cmd = ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"] + + if self.device_ids: + device_str = ",".join(str(d) for d in self.device_ids) + cmd.extend(["--id", device_str]) + + out = subprocess.check_output( + cmd, + text=True, + stderr=subprocess.DEVNULL + ) + + lines = [l.strip() for l in out.splitlines() if l.strip()] + memory_values = [int(x) for x in lines if x.isdigit()] + + if memory_values: + logger.debug(f"Current GPU memory usage: {memory_values} MiB") + return memory_values + else: + return [] + + except subprocess.CalledProcessError as e: + logger.error(f"nvidia-smi command failed: {e}") + return [] + except FileNotFoundError: + logger.error("nvidia-smi not found. 
Is NVIDIA driver installed?") + return [] + except Exception as e: + logger.error(f"Error getting GPU memory: {e}") + return [] + +class MockGPUMonitor(GPUMonitor): + """Mock GPU monitor (for testing or CPU mode)""" + + def get_current_memory_usage(self): + """Return mock memory usage data""" + import random + if self.device_ids: + return [random.randint(100, 1000) for _ in self.device_ids] + else: + return [random.randint(100, 1000)] + +def create_gpu_monitor(gpu_platform="nvidia", device_ids=None): + """ + Factory function to create GPU monitor + + Args: + gpu_platform: GPU platform name, "nvidia" or others + device_ids: List of device IDs to monitor + + Returns: + Instance of GPUMonitor + """ + platform_lower = gpu_platform.lower() + + if platform_lower == "nvidia": + logger.info(f"Creating NVIDIA GPU monitor for devices: {device_ids}") + return NVIDIAGPUMonitor(device_ids) + elif platform_lower == "mock" or platform_lower == "test": + logger.info(f"Creating Mock GPU monitor for devices: {device_ids}") + return MockGPUMonitor(device_ids) + else: + logger.warning(f"Unsupported GPU platform: {gpu_platform}, using mock monitor") + return MockGPUMonitor(device_ids) diff --git a/infinimetrics/inference/utils/prompt_generator.py b/infinimetrics/inference/utils/prompt_generator.py new file mode 100644 index 0000000..7589bad --- /dev/null +++ b/infinimetrics/inference/utils/prompt_generator.py @@ -0,0 +1,543 @@ +# utils/prompt_generator.py (Extended Version) +#!/usr/bin/env python3 +""" +Prompt Generator +Generate test prompts based on input_token_num +Supports multiple generation methods and configurations +""" + +import random +import string +import json +from pathlib import Path +from typing import Callable, Optional, Dict, Any, List, Union +import logging + +logger = logging.getLogger(__name__) + +# Preset prompt template library +PRESET_TEMPLATES = { + "ai_qa": [ + "Explain the concept of {topic} in simple terms.", + "What are the main applications of {topic} in today's world?", + "Describe the history and development of {topic}.", + "Compare and contrast {topic} with similar technologies.", + "What are the ethical considerations surrounding {topic}?", + "How does {topic} impact our daily lives?", + "What are the future trends in {topic}?", + "What are the key challenges in {topic} research?" + ], + + "general_qa": [ + "Tell me about {topic}.", + "What is {topic}?", + "Can you explain {topic}?", + "I need information about {topic}.", + "Please provide details about {topic}.", + "Help me understand {topic}." + ], + + "technical": [ + "Discuss the technical implementation of {topic}.", + "What are the algorithms used in {topic}?", + "Explain the architecture of {topic} systems.", + "What are the performance considerations for {topic}?", + "Describe the scalability challenges in {topic}." 
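+        # Every template exposes a single {topic} placeholder that generate_prompt()
+        # fills via str.format(topic=...).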
+ ] +} + +# Preset topic library +PRESET_TOPICS = { + "ai_ml": [ + "artificial intelligence", + "machine learning", + "deep learning", + "natural language processing", + "computer vision", + "reinforcement learning", + "neural networks", + "transformers", + "large language models", + "generative AI" + ], + + "tech": [ + "cloud computing", + "blockchain technology", + "quantum computing", + "Internet of Things", + "edge computing", + "distributed systems", + "cybersecurity", + "databases", + "software engineering" + ], + + "science": [ + "climate change", + "genetic engineering", + "space exploration", + "renewable energy", + "quantum physics", + "biotechnology", + "nanotechnology" + ] +} + + +class PromptGenerator: + """Prompt Generator Class (New)""" + + def __init__( + self, + method: str = "template", # template, random, file, fixed + template_name: str = "ai_qa", + topic_name: str = "ai_ml", + fixed_prompt: Optional[str] = None, + prompt_file: Optional[str] = None, + tokenizer = None, + chars_per_token: int = 4 + ): + """ + Initialize prompt generator + """ + self.method = method + self.template_name = template_name + self.topic_name = topic_name + self.fixed_prompt = fixed_prompt + self.prompt_file = prompt_file + self.tokenizer = tokenizer + self.chars_per_token = chars_per_token + + # Load templates and topics + self.templates = PRESET_TEMPLATES.get(template_name, PRESET_TEMPLATES["ai_qa"]) + self.topics = PRESET_TOPICS.get(topic_name, PRESET_TOPICS["ai_ml"]) + + # Load prompts from file (if needed) + self.file_prompts = [] + if method == "file" and prompt_file and Path(prompt_file).exists(): + self._load_prompts_from_file() + + def _load_prompts_from_file(self): + """Load prompts from file""" + try: + with open(self.prompt_file, 'r', encoding='utf-8') as f: + content = f.read().strip() + + if self.prompt_file.endswith('.json'): + data = json.loads(content) + if isinstance(data, list): + self.file_prompts = [str(item) for item in data] + elif isinstance(data, dict): + # Try to extract all string values + for value in data.values(): + if isinstance(value, str): + self.file_prompts.append(value) + elif isinstance(value, list): + self.file_prompts.extend([str(v) for v in value if isinstance(v, str)]) + else: + # Text file, one prompt per line + self.file_prompts = [line.strip() for line in content.split('\n') if line.strip()] + + logger.info(f"Loaded {len(self.file_prompts)} prompts from {self.prompt_file}") + + except Exception as e: + logger.error(f"Failed to load prompts from file: {e}") + self.file_prompts = [] + + def generate(self, token_num: int, prompt_id: int = 0) -> str: + """Generate single prompt (compatible with original interface)""" + return self.generate_prompt(token_num, prompt_id) + + def generate_prompt(self, token_num: int, prompt_id: int = 0) -> str: + """ + Generate single prompt + + Args: + token_num: Required number of tokens + prompt_id: Prompt ID (for generating different prompts) + + Returns: + Prompt text + """ + if self.method == "fixed" and self.fixed_prompt: + # Use fixed prompt + return self._adjust_length(self.fixed_prompt, token_num) + + elif self.method == "file" and self.file_prompts: + # Select prompt from file + prompt_idx = prompt_id % len(self.file_prompts) + base_prompt = self.file_prompts[prompt_idx] + return self._adjust_length(base_prompt, token_num) + + elif self.method == "random": + # Random generation + return self._generate_random_prompt(token_num) + + else: # Default to template method + # Select template and topic + template_idx = 
prompt_id % len(self.templates) + topic_idx = prompt_id % len(self.topics) + + template = self.templates[template_idx] + topic = self.topics[topic_idx] + + # Generate base prompt + base_prompt = template.format(topic=topic) + + # Adjust length + return self._adjust_length(base_prompt, token_num) + + def _adjust_length(self, prompt: str, token_num: int) -> str: + """Adjust prompt length to specified token count""" + if self.tokenizer: + # Use tokenizer for precise control + try: + tokens = self.tokenizer.encode(prompt) + + if len(tokens) >= token_num: + # Truncate + tokens = tokens[:token_num] + return self.tokenizer.decode(tokens, skip_special_tokens=True) + else: + # Repeat until desired length is reached + repeat_count = (token_num + len(tokens) - 1) // len(tokens) + repeated_tokens = tokens * repeat_count + repeated_tokens = repeated_tokens[:token_num] + return self.tokenizer.decode(repeated_tokens, skip_special_tokens=True) + except Exception as e: + logger.warning(f"Tokenizer length adjustment failed: {e}, using fallback") + # Fallback to character-level adjustment + return self._adjust_length_fallback(prompt, token_num) + else: + # Fallback method + return self._adjust_length_fallback(prompt, token_num) + + def _adjust_length_fallback(self, prompt: str, token_num: int) -> str: + """Fallback method: character-level length adjustment""" + estimated_chars = token_num * self.chars_per_token + + if len(prompt) >= estimated_chars: + # Truncate + return prompt[:estimated_chars] + else: + # Repeat until desired length is reached + repeat_count = (estimated_chars + len(prompt) - 1) // len(prompt) + repeated_prompt = prompt * repeat_count + return repeated_prompt[:estimated_chars] + + def _generate_random_prompt(self, token_num: int) -> str: + """Generate random prompt""" + if self.tokenizer: + # Randomly select token IDs + try: + vocab_size = self.tokenizer.vocab_size if hasattr(self.tokenizer, 'vocab_size') else 32000 + token_ids = [random.randint(0, vocab_size-1) for _ in range(token_num)] + return self.tokenizer.decode(token_ids, skip_special_tokens=True) + except Exception as e: + logger.warning(f"Tokenizer random generation failed: {e}, using fallback") + # Fallback to character-level generation + return self._generate_random_text(token_num) + else: + # Fallback method + return self._generate_random_text(token_num) + + def _generate_random_text(self, token_num: int) -> str: + """Generate random text (fallback method)""" + total_chars = token_num * self.chars_per_token + + # Use letters, digits, and common punctuation + chars = string.ascii_letters + string.digits + ' .,!?;:\n-' + random_text = ''.join(random.choices(chars, k=total_chars)) + + return random_text + + def generate_prompts(self, num_prompts: int, token_num: int) -> List[str]: + """ + Generate multiple prompts + + Args: + num_prompts: Number of prompts to generate + token_num: Token count for each prompt + + Returns: + List of prompts + """ + prompts = [] + for i in range(num_prompts): + prompt = self.generate_prompt(token_num, i) + + # Add unique identifier + unique_suffix = f" [Request {i+1}:{self._generate_unique_suffix()}]" + + # Ensure total length doesn't exceed token limit after adding suffix + if self.tokenizer: + try: + base_tokens = self.tokenizer.encode(prompt) + suffix_tokens = self.tokenizer.encode(unique_suffix) + + if len(base_tokens) + len(suffix_tokens) <= token_num: + prompt += unique_suffix + else: + # Adjust base prompt length to accommodate suffix + adjusted_token_num = token_num - len(suffix_tokens) + if 
adjusted_token_num > 0: + prompt = self.generate_prompt(adjusted_token_num, i) + unique_suffix + except Exception as e: + logger.warning(f"Tokenizer suffix adjustment failed: {e}") + # Simple append without precise control + prompt += unique_suffix + else: + # Simple estimation + suffix_chars = len(unique_suffix) + base_chars = token_num * self.chars_per_token - suffix_chars + if base_chars > 0: + prompt = prompt[:base_chars] + unique_suffix + + prompts.append(prompt) + + logger.info(f"Generated {len(prompts)} prompts using method: {self.method}") + if prompts: + logger.debug(f"First prompt preview: {prompts[0][:100]}...") + + return prompts + + def _generate_unique_suffix(self) -> str: + """Generate unique identifier suffix""" + return ''.join(random.choices(string.ascii_letters + string.digits, k=6)) + + +# ==================== Original Functions (Maintaining Compatibility) ==================== + +def create_prompt_generator( + tokenizer=None, + method: str = "random", + **kwargs +) -> Callable[[int], str]: + """ + Create prompt generator function (compatible with original interface) + + New recommended usage: Create PromptGenerator class instance directly + + Args: + tokenizer: Optional tokenizer for precise token counting + method: Generation method, "random" or "template" + **kwargs: Additional parameters passed to specific generator + + Returns: + Function: (token_num) -> prompt_text + """ + logger.warning("Using deprecated create_prompt_generator function. " + "Consider using PromptGenerator class directly.") + + if method == "random": + return _create_random_prompt_generator(tokenizer, **kwargs) + elif method == "template": + return _create_template_prompt_generator(**kwargs) + else: + raise ValueError(f"Unknown prompt generation method: {method}") + + +def _create_random_prompt_generator( + tokenizer=None, + vocab_size: int = 32000, + exclude_special_tokens: bool = True, + **kwargs +) -> Callable[[int], str]: + """Create random token generator (compatible with original interface)""" + + # Extract parameters from kwargs + chars_per_token = kwargs.get('chars_per_token', 4) + + def generate_random_prompt(token_num: int) -> str: + """Generate random prompt with specified token count""" + nonlocal tokenizer, vocab_size, exclude_special_tokens, chars_per_token + + if tokenizer is not None: + # Use tokenizer for precise token control + # Generate random token ID sequence + try: + if exclude_special_tokens and hasattr(tokenizer, 'special_tokens_map'): + # Get special tokens + special_tokens = tokenizer.special_tokens_map.values() + special_token_ids = set(tokenizer.convert_tokens_to_ids(special_tokens)) + + # Generate non-special tokens + valid_token_ids = [] + for token_id in range(vocab_size): + if token_id not in special_token_ids: + valid_token_ids.append(token_id) + + if valid_token_ids: + token_ids = random.choices(valid_token_ids, k=token_num) + else: + token_ids = random.choices(list(range(vocab_size)), k=token_num) + else: + # Simple random token ID selection + token_ids = random.choices(list(range(vocab_size)), k=token_num) + + # Decode to text + prompt = tokenizer.decode(token_ids, skip_special_tokens=True) + return prompt + + except Exception as e: + # If tokenizer method fails, fall back to random text generation + logger.warning(f"Tokenizer-based prompt generation failed: {e}. 
" + f"Falling back to random text generation.") + tokenizer = None # Mark tokenizer as unavailable + + # Method 1: No tokenizer, generate random text (estimate token count) + total_chars = token_num * chars_per_token + + # Generate random text + chars = string.ascii_letters + string.digits + ' .,!?;:\n' + random_text = ''.join(random.choices(chars, k=total_chars)) + + return random_text + + return generate_random_prompt + + +def _create_template_prompt_generator(**kwargs) -> Callable[[int], str]: + """Create template prompt generator (compatible with original interface)""" + templates = kwargs.get('templates', [ + "Explain the concept of artificial intelligence in simple terms. ", + "What are the main applications of machine learning in today's world? ", + "Describe the history and development of deep learning. ", + "Compare and contrast natural language processing with computer vision. ", + ]) + + def generate_template_prompt(token_num: int) -> str: + """Generate prompt using template""" + # Select a template + template = random.choice(templates) + + # Repeat template until desired length is reached + estimated_template_tokens = 15 + repeat_count = max(1, token_num // estimated_template_tokens) + + prompt = template * repeat_count + + # Truncate to approximate length (simple handling) + estimated_chars = token_num * 4 # Assume average 4 characters per token + if len(prompt) > estimated_chars: + prompt = prompt[:estimated_chars] + + return prompt + + return generate_template_prompt + + +# ==================== New Helper Functions ==================== + +def create_prompt_generator_from_config( + config: Dict[str, Any], + tokenizer=None +) -> PromptGenerator: + """ + Create PromptGenerator instance from configuration (recommended new way) + + Args: + config: Configuration dictionary containing prompt-related settings + tokenizer: Optional tokenizer + + Returns: + PromptGenerator instance + """ + # Extract configuration parameters + prompt_config = config.get("prompt_config", {}) + + return PromptGenerator( + method=prompt_config.get("method", "template"), + template_name=prompt_config.get("template_name", "ai_qa"), + topic_name=prompt_config.get("topic_name", "ai_ml"), + fixed_prompt=prompt_config.get("fixed_prompt"), + prompt_file=prompt_config.get("prompt_file"), + tokenizer=tokenizer, + chars_per_token=prompt_config.get("chars_per_token", 4) + ) + + +def load_prompts_from_file(file_path: str) -> List[str]: + """Load prompt list from file (general function)""" + if not Path(file_path).exists(): + logger.error(f"Prompt file not found: {file_path}") + return [] + + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read().strip() + + if file_path.endswith('.json'): + data = json.loads(content) + if isinstance(data, list): + return [str(item) for item in data] + elif isinstance(data, dict): + # Try to extract all string values + prompts = [] + for value in data.values(): + if isinstance(value, str): + prompts.append(value) + elif isinstance(value, list): + prompts.extend([str(v) for v in value if isinstance(v, str)]) + return prompts + else: + # Text file, one prompt per line + return [line.strip() for line in content.split('\n') if line.strip()] + + except Exception as e: + logger.error(f"Failed to load prompts from file {file_path}: {e}") + + return [] + + +# ==================== Test Functions ==================== + +def test_prompt_generator(): + """Test prompt generator""" + print("Testing prompt generators...") + + # Test original functions (maintain compatibility) 
+ print("\n1. Testing legacy functions:") + + # Test random generator (no tokenizer) + print("\n Random generator (no tokenizer):") + random_gen = create_prompt_generator(method="random", chars_per_token=3) + prompt1 = random_gen(10) # Generate prompt with 10 tokens + print(f" Generated prompt (10 tokens): {prompt1[:50]}...") + + # Test template generator + print("\n Template generator:") + template_gen = create_prompt_generator(method="template") + prompt2 = template_gen(20) # Generate prompt with 20 tokens + print(f" Generated prompt (20 tokens): {prompt2[:50]}...") + + # Test new PromptGenerator class + print("\n2. Testing new PromptGenerator class:") + + config = { + "prompt_config": { + "method": "template", + "template_name": "ai_qa", + "topic_name": "ai_ml", + "chars_per_token": 4 + } + } + + generator = create_prompt_generator_from_config(config) + + # Generate single prompt + single_prompt = generator.generate(15) + print(f" Single prompt (15 tokens): {single_prompt[:50]}...") + + # Generate multiple prompts + prompts = generator.generate_prompts(3, 10) + print(f" Generated {len(prompts)} prompts:") + for i, prompt in enumerate(prompts): + print(f" Prompt {i+1}: {prompt[:50]}...") + + print("\nPrompt generators test completed.") + + +if __name__ == "__main__": + test_prompt_generator() \ No newline at end of file diff --git a/infinimetrics/inference/utils/token_generator.py b/infinimetrics/inference/utils/token_generator.py new file mode 100644 index 0000000..13c443a --- /dev/null +++ b/infinimetrics/inference/utils/token_generator.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 +""" +Random Token Generation Utility +Used for performance testing to generate random token sequences +""" +import random +import logging +from typing import Dict, Any, List, Set, Optional +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + +@dataclass +class TokenGeneratorConfig: + """Configuration for token generator""" + exclude_special_tokens: bool = True + min_token_id: int = 0 + max_token_id: Optional[int] = None + custom_excluded_ids: Set[int] = None + + def __post_init__(self): + if self.custom_excluded_ids is None: + self.custom_excluded_ids = set() + +class TokenGenerator: + """Random token generator""" + + def __init__(self, tokenizer, config: Optional[TokenGeneratorConfig] = None): + self.tokenizer = tokenizer + self.config = config or TokenGeneratorConfig() + + # Initialize token information + self._init_token_info() + + def _init_token_info(self): + """Initialize token information""" + # Get vocabulary size + if hasattr(self.tokenizer, 'vocab_size'): + self.vocab_size = self.tokenizer.vocab_size + elif hasattr(self.tokenizer, 'get_vocab_size'): + self.vocab_size = self.tokenizer.get_vocab_size() + else: + # Try to get via len + try: + self.vocab_size = len(self.tokenizer) + except: + self.vocab_size = 32000 # Default value + logger.warning(f"Cannot determine vocab_size, using default: {self.vocab_size}") + + # Set maximum token ID + if self.config.max_token_id is None: + self.config.max_token_id = self.vocab_size - 1 + + # Get special token IDs + self.special_token_ids = self._get_special_token_ids() + + # Calculate valid token range + self.valid_token_ids = self._get_valid_token_ids() + + logger.info(f"TokenGenerator initialized: vocab_size={self.vocab_size}, " + f"valid_tokens={len(self.valid_token_ids)}, " + f"special_tokens={len(self.special_token_ids)}") + + def _get_special_token_ids(self) -> Set[int]: + """Get set of special token IDs""" + special_ids = set() + + 
# Get special tokens from tokenizer
+        special_tokens_map = getattr(self.tokenizer, 'special_tokens_map', {})
+
+        # Process special token mappings
+        for token in special_tokens_map.values():
+            token_id = self._convert_to_token_id(token)
+            if token_id is not None:
+                special_ids.add(token_id)
+
+        # Check common special token attributes
+        common_special_attrs = [
+            'bos_token', 'eos_token', 'pad_token', 'unk_token',
+            'sep_token', 'cls_token', 'mask_token'
+        ]
+
+        for attr_name in common_special_attrs:
+            token = getattr(self.tokenizer, attr_name, None)
+            if token is not None:
+                token_id = self._convert_to_token_id(token)
+                if token_id is not None:
+                    special_ids.add(token_id)
+
+        # Add custom excluded tokens
+        special_ids.update(self.config.custom_excluded_ids)
+
+        return special_ids
+
+    def _convert_to_token_id(self, token) -> Optional[int]:
+        """Convert token to ID"""
+        if isinstance(token, int):
+            return token
+        elif isinstance(token, str):
+            try:
+                return self.tokenizer.convert_tokens_to_ids(token)
+            except Exception:
+                return None
+        elif hasattr(token, 'content'):  # Handle special token objects
+            try:
+                return self.tokenizer.convert_tokens_to_ids(token.content)
+            except Exception:
+                return None
+        return None
+
+    def _get_valid_token_ids(self) -> List[int]:
+        """Get list of valid token IDs (excluding special tokens)"""
+        if not self.config.exclude_special_tokens:
+            # If not excluding special tokens, return all tokens
+            return list(range(self.config.min_token_id, self.config.max_token_id + 1))
+
+        # Exclude special tokens
+        all_ids = set(range(self.config.min_token_id, self.config.max_token_id + 1))
+        valid_ids = sorted(list(all_ids - self.special_token_ids))
+
+        if not valid_ids:
+            logger.warning("No valid tokens after exclusion, using all tokens")
+            valid_ids = list(range(self.config.min_token_id, self.config.max_token_id + 1))
+
+        return valid_ids
+
+    def generate_tokens(self, num_tokens: int) -> List[int]:
+        """
+        Generate random token sequence
+
+        Args:
+            num_tokens: Number of tokens to generate
+
+        Returns:
+            List of token IDs
+        """
+        if not self.valid_token_ids:
+            raise ValueError("No valid tokens available for generation")
+
+        tokens = random.choices(self.valid_token_ids, k=num_tokens)
+
+        logger.debug(f"Generated {num_tokens} random tokens from {len(self.valid_token_ids)} valid tokens")
+        return tokens
+
+    def generate_token_batch(self, batch_size: int, tokens_per_sample: int) -> List[List[int]]:
+        """
+        Generate batch of token sequences
+
+        Args:
+            batch_size: Batch size
+            tokens_per_sample: Number of tokens per sample
+
+        Returns:
+            List of token ID batches
+        """
+        batch = []
+        for _ in range(batch_size):
+            tokens = self.generate_tokens(tokens_per_sample)
+            batch.append(tokens)
+
+        logger.info(f"Generated {batch_size} samples, {tokens_per_sample} tokens each")
+        return batch
+
+    def tokens_to_text(self, tokens: List[int]) -> str:
+        """Convert token IDs to text"""
+        return self.tokenizer.decode(tokens, skip_special_tokens=True)
+
+    def batch_to_text(self, batch_tokens: List[List[int]]) -> List[str]:
+        """Batch convert token IDs to text"""
+        texts = []
+        for tokens in batch_tokens:
+            text = self.tokens_to_text(tokens)
+            texts.append(text)
+        return texts
+
+    def get_token_info(self) -> Dict[str, Any]:
+        """Get token information statistics"""
+        return {
+            "vocab_size": self.vocab_size,
+            "valid_token_count": len(self.valid_token_ids),
+            "special_token_count": len(self.special_token_ids),
+            "min_token_id": min(self.valid_token_ids) if self.valid_token_ids else 0,
+            "max_token_id": 
max(self.valid_token_ids) if self.valid_token_ids else 0, + "exclude_special": self.config.exclude_special_tokens + } + +def create_token_generator(tokenizer, **kwargs): + """ + Convenience function to create token generator + + Args: + tokenizer: Tokenizer object + **kwargs: Parameters passed to TokenGeneratorConfig + + Returns: + TokenGenerator instance + """ + config = TokenGeneratorConfig(**kwargs) + return TokenGenerator(tokenizer, config) + +# Unit tests +if __name__ == "__main__": + # Mock a simple tokenizer for testing + class MockTokenizer: + vocab_size = 50000 + special_tokens_map = { + "bos_token": 0, + "eos_token": 1, + "pad_token": 2, + "unk_token": 3 + } + + def convert_tokens_to_ids(self, token): + if isinstance(token, str): + # Simple mapping + return hash(token) % 100 + return token + + def decode(self, tokens, skip_special_tokens=True): + return f"Decoded text with {len(tokens)} tokens" + + # Test token generator + tokenizer = MockTokenizer() + generator = TokenGenerator(tokenizer) + + print("Token Info:", generator.get_token_info()) + + # Generate some tokens + tokens = generator.generate_tokens(10) + print(f"Generated tokens: {tokens}") + print(f"Decoded: {generator.tokens_to_text(tokens)}") + + # Generate batch + batch = generator.generate_token_batch(3, 5) + print(f"Generated batch: {batch}") \ No newline at end of file diff --git a/infinimetrics/inference/utils/trace_client.py b/infinimetrics/inference/utils/trace_client.py new file mode 100644 index 0000000..be4a181 --- /dev/null +++ b/infinimetrics/inference/utils/trace_client.py @@ -0,0 +1,527 @@ +# utils/trace_client.py +#!/usr/bin/env python3 +""" +Trace client implementation +Reads requests from a trace file and sends them to the inference service according to time intervals. 
+"""
+
+import asyncio
+import aiohttp
+import csv
+import json
+import time
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Any
+from dataclasses import dataclass
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class RequestTrace:
+    """Single request entry in the trace file"""
+    request_id: str
+    arrival_timestamp_ms: float # timestamp in milliseconds (relative or absolute)
+    input_token_num: int
+    output_token_num: int
+
+    # runtime computed fields
+    actual_prompt: Optional[str] = None
+    start_time: Optional[float] = None
+    ttft: Optional[float] = None # Time To First Token (ms)
+    e2e_latency: Optional[float] = None # End-to-end latency (ms)
+    total_tokens: int = 0
+    success: bool = False
+    error: Optional[str] = None
+
+@dataclass
+class TraceClientConfig:
+    """Trace client configuration"""
+    api_url: str
+    model_name: str
+    timeout_ms: int = 30000
+    max_retries: int = 3
+    warmup_requests: int = 10
+
+class TraceClient:
+    """Trace client"""
+
+    def __init__(self, config: TraceClientConfig):
+        self.config = config
+        self.session: Optional[aiohttp.ClientSession] = None
+        self.request_counter = 0
+        self.semaphore: Optional[asyncio.Semaphore] = None
+
+        # performance statistics
+        self.total_requests = 0
+        self.successful_requests = 0
+        self.failed_requests = 0
+
+    async def __aenter__(self):
+        """Async context manager entry"""
+        timeout = aiohttp.ClientTimeout(total=self.config.timeout_ms/1000 + 10)
+        self.session = aiohttp.ClientSession(timeout=timeout)
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit"""
+        if self.session:
+            await self.session.close()
+
+    @staticmethod
+    def load_trace_file(trace_file: str, prompt_generator) -> List[RequestTrace]:
+        """
+        Load trace file
+
+        Args:
+            trace_file: Path to the trace file
+            prompt_generator: Function to generate actual prompts
+
+        Returns:
+            List of request traces
+        """
+        traces = []
+
+        try:
+            with open(trace_file, 'r', encoding='utf-8') as f:
+                # Auto-detect format: explicit extension first, then content sniffing
+                first_line = f.readline().strip()
+                f.seek(0)
+
+                if trace_file.endswith('.csv') or (not first_line.startswith(('[', '{')) and ',' in first_line):
+                    # CSV format
+                    reader = csv.DictReader(f)
+                    for row in reader:
+                        # support variations of column names
+                        request_id = row.get('request_id') or row.get('RequestID') or f"req-{len(traces):04d}"
+
+                        # parse timestamp
+                        timestamp_str = row.get('arrival_timestamp_ms') or row.get('timestamp') or '0'
+                        try:
+                            arrival_timestamp_ms = float(timestamp_str)
+                        except ValueError:
+                            logger.warning(f"Invalid timestamp for {request_id}: {timestamp_str}, using 0")
+                            arrival_timestamp_ms = 0
+
+                        # parse token counts
+                        try:
+                            input_token_num = int(row.get('input_token_num') or row.get('input_tokens') or 128)
+                            output_token_num = int(row.get('output_token_num') or row.get('output_tokens') or 128)
+                        except ValueError:
+                            logger.warning(f"Invalid token numbers for {request_id}, using defaults")
+                            input_token_num = 128
+                            output_token_num = 128
+
+                        # generate prompt content
+                        actual_prompt = prompt_generator(input_token_num)
+
+                        trace = RequestTrace(
+                            request_id=request_id,
+                            arrival_timestamp_ms=arrival_timestamp_ms,
+                            input_token_num=input_token_num,
+                            output_token_num=output_token_num,
+                            actual_prompt=actual_prompt
+                        )
+                        traces.append(trace)
+
+                elif trace_file.endswith('.json') or first_line.startswith('[') or first_line.startswith('{'):
+                    # JSON format
+                    data = json.load(f)
+
+                    if isinstance(data, dict):
+                        data = [data]
+
+                    for i, item in enumerate(data):
+                        request_id = 
item.get('request_id') or f"req-{i:04d}" + + arrival_timestamp_ms = float(item.get('arrival_timestamp_ms', 0)) + input_token_num = int(item.get('input_token_num', 128)) + output_token_num = int(item.get('output_token_num', 128)) + + actual_prompt = prompt_generator(input_token_num) + + trace = RequestTrace( + request_id=request_id, + arrival_timestamp_ms=arrival_timestamp_ms, + input_token_num=input_token_num, + output_token_num=output_token_num, + actual_prompt=actual_prompt + ) + traces.append(trace) + + else: + raise ValueError(f"Unsupported trace file format: {trace_file}") + + logger.info(f"Loaded {len(traces)} requests from trace file: {trace_file}") + + # sort by timestamp + traces.sort(key=lambda x: x.arrival_timestamp_ms) + + # print debug info for first few entries + if traces: + logger.info(f"First request: ID={traces[0].request_id}, " + f"time={traces[0].arrival_timestamp_ms}ms, " + f"input={traces[0].input_token_num}, " + f"output={traces[0].output_token_num}") + if len(traces) > 1: + logger.info(f"Last request: ID={traces[-1].request_id}, " + f"time={traces[-1].arrival_timestamp_ms}ms") + + return traces + + except Exception as e: + logger.error(f"Failed to load trace file {trace_file}: {e}") + raise + + async def send_request( + self, + trace: RequestTrace, + semaphore: asyncio.Semaphore + ) -> RequestTrace: + """ + Send a single request and record metrics + + Args: + trace: Request trace + semaphore: Concurrency control semaphore + + Returns: + Updated request trace with performance metrics + """ + async with semaphore: + self.total_requests += 1 + request_start = time.perf_counter() + ttft = None + total_tokens = 0 + + try: + # Simulate request arrival timing (assuming timestamps are relative) + if trace.arrival_timestamp_ms > 0: + await asyncio.sleep(trace.arrival_timestamp_ms / 1000) + + # record start time + trace.start_time = time.perf_counter() + + # construct request payload + payload = { + "model": self.config.model_name, + "messages": [ + { + "role": "user", + "content": trace.actual_prompt + } + ], + "max_tokens": trace.output_token_num, + "temperature": 0.7, + "top_p": 0.9, + "stream": True # use streaming to measure TTFT + } + + logger.debug(f"Sending request {trace.request_id}: " + f"input={trace.input_token_num}, " + f"output={trace.output_token_num}") + + # send request + async with self.session.post( + f"{self.config.api_url}/chat/completions", + json=payload, + headers={"Content-Type": "application/json"} + ) as response: + + if response.status != 200: + error_msg = f"HTTP {response.status}: {await response.text()}" + trace.error = error_msg + trace.success = False + self.failed_requests += 1 + + logger.error(f"Request {trace.request_id} failed: {error_msg}") + return trace + + # process streaming response + first_token_received = False + first_token_time = None + + async for line in response.content: + if line: + line = line.decode('utf-8').strip() + if line.startswith('data: '): + data_str = line[6:] + + if data_str == '[DONE]': + break + + try: + data = json.loads(data_str) + if 'choices' in data and len(data['choices']) > 0: + choice = data['choices'][0] + + # check for content + if 'delta' in choice and 'content' in choice['delta']: + content = choice['delta']['content'] + + if not first_token_received: + first_token_time = time.perf_counter() + ttft = (first_token_time - trace.start_time) * 1000 + first_token_received = True + logger.debug(f"First token received for {trace.request_id}: " + f"TTFT={ttft:.2f}ms") + + # count tokens (rough estimate) + if 
content.strip(): + total_tokens += 1 + + # check finish condition + if choice.get('finish_reason'): + break + + except json.JSONDecodeError: + logger.warning(f"Invalid JSON in stream: {line}") + continue + + # compute e2e latency + e2e_latency = (time.perf_counter() - trace.start_time) * 1000 + + # update trace + trace.ttft = ttft if ttft else e2e_latency + trace.e2e_latency = e2e_latency + trace.total_tokens = total_tokens + trace.success = True + + self.successful_requests += 1 + + logger.debug(f"Request {trace.request_id} completed: " + f"TTFT={trace.ttft:.2f}ms, " + f"E2E={trace.e2e_latency:.2f}ms, " + f"tokens={total_tokens}") + + except asyncio.TimeoutError: + error_msg = f"Timeout after {self.config.timeout_ms}ms" + trace.error = error_msg + trace.success = False + self.failed_requests += 1 + logger.error(f"Request {trace.request_id} timeout: {error_msg}") + + except Exception as e: + error_msg = str(e) + trace.error = error_msg + trace.success = False + self.failed_requests += 1 + logger.error(f"Request {trace.request_id} failed: {error_msg}") + + return trace + + async def run_trace( + self, + traces: List[RequestTrace], + concurrency: int = 32, + warmup_requests: int = 10 + ) -> Tuple[List[RequestTrace], Dict[str, Any]]: + """ + Run all requests in the trace + + Args: + traces: List of request traces + concurrency: Max concurrency + warmup_requests: Number of warmup requests + + Returns: + (updated traces, statistics dictionary) + """ + # reset counters + self.total_requests = 0 + self.successful_requests = 0 + self.failed_requests = 0 + + logger.info(f"Starting trace run: {len(traces)} requests, concurrency={concurrency}") + + # warmup phase + if warmup_requests > 0: + logger.info(f"Warmup phase: {warmup_requests} requests") + + warmup_traces = traces[:min(warmup_requests, len(traces))] + semaphore = asyncio.Semaphore(concurrency) + warmup_tasks = [] + + for trace in warmup_traces: + # Warm-up requests do not calculate arrival interval + warmup_trace = RequestTrace( + request_id=f"warmup-{trace.request_id}", + arrival_timestamp_ms=0, + input_token_num=trace.input_token_num, + output_token_num=min(10, trace.output_token_num), # Warm-up generates fewer tokens + actual_prompt=trace.actual_prompt + ) + + task = self.send_request(warmup_trace, semaphore) + warmup_tasks.append(task) + + warmup_results = await asyncio.gather(*warmup_tasks, return_exceptions=True) + + # Handle exceptions + for i, result in enumerate(warmup_results): + if isinstance(result, Exception): + logger.warning(f"Warmup request {i} failed: {result}") + + logger.info("Warmup completed") + + # Measurement phase + logger.info(f"Measurement phase: {len(traces)} requests") + + # Create semaphore to control concurrency + self.semaphore = asyncio.Semaphore(concurrency) + + # Record start time + test_start_time = time.perf_counter() + + # Send all requests + tasks = [] + for trace in traces: + task = self.send_request(trace, self.semaphore) + tasks.append(task) + + # Wait for all requests to complete + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Record end time + test_end_time = time.perf_counter() + total_test_duration = test_end_time - test_start_time + + # Process results + processed_traces = [] + for i, result in enumerate(results): + if isinstance(result, Exception): + logger.error(f"Request {i} raised exception: {result}") + if i < len(traces): + failed_trace = traces[i] + failed_trace.error = str(result) + failed_trace.success = False + processed_traces.append(failed_trace) + else: + 
processed_traces.append(result) + + # Calculate statistics + stats = self._calculate_statistics(processed_traces, total_test_duration) + + # Log summary + logger.info(f"Trace run completed: " + f"{stats['success_rate']:.2%} success rate, " + f"{stats['avg_ttft']:.2f}ms avg TTFT, " + f"{stats['avg_e2e_latency']:.2f}ms avg E2E latency") + + return processed_traces, stats + + def _calculate_statistics( + self, + traces: List[RequestTrace], + total_duration: float + ) -> Dict[str, Any]: + stats = {} + + # Filter successful requests + successful_traces = [t for t in traces if t.success] + + if not successful_traces: + logger.warning("No successful requests in trace run") + return stats + + # Basic statistics + stats['total_requests'] = len(traces) + stats['successful_requests'] = len(successful_traces) + stats['failed_requests'] = len(traces) - len(successful_traces) + stats['success_rate'] = len(successful_traces) / len(traces) if traces else 0 + + # Time statistics + stats['total_duration'] = total_duration + stats['requests_per_second'] = len(traces) / total_duration if total_duration > 0 else 0 + + # TTFT statistics + ttfts = [t.ttft for t in successful_traces if t.ttft is not None] + if ttfts: + stats['avg_ttft'] = np.mean(ttfts) + stats['p50_ttft'] = np.percentile(ttfts, 50) + stats['p95_ttft'] = np.percentile(ttfts, 95) + stats['p99_ttft'] = np.percentile(ttfts, 99) + stats['min_ttft'] = np.min(ttfts) + stats['max_ttft'] = np.max(ttfts) + stats['std_ttft'] = np.std(ttfts) + + # E2E latency statistics + e2e_latencies = [t.e2e_latency for t in successful_traces if t.e2e_latency is not None] + if e2e_latencies: + stats['avg_e2e_latency'] = np.mean(e2e_latencies) + stats['p50_e2e_latency'] = np.percentile(e2e_latencies, 50) + stats['p95_e2e_latency'] = np.percentile(e2e_latencies, 95) + stats['p99_e2e_latency'] = np.percentile(e2e_latencies, 99) + stats['min_e2e_latency'] = np.min(e2e_latencies) + stats['max_e2e_latency'] = np.max(e2e_latencies) + stats['std_e2e_latency'] = np.std(e2e_latencies) + + # Token statistics + total_tokens = sum(t.total_tokens for t in successful_traces) + stats['total_tokens'] = total_tokens + stats['avg_tokens_per_request'] = total_tokens / len(successful_traces) if successful_traces else 0 + + # Throughput + if total_duration > 0: + stats['throughput_tps'] = total_tokens / total_duration # tokens per second + + # Input/output token statistics + input_tokens = sum(t.input_token_num for t in successful_traces) + output_tokens = sum(t.output_token_num for t in successful_traces) + stats['total_input_tokens'] = input_tokens + stats['total_output_tokens'] = output_tokens + + return stats + + def save_results_to_csv( + self, + traces: List[RequestTrace], + output_dir: Path, + run_id: str + ): + # Save detailed results + detailed_file = output_dir / f"{run_id}_trace_detailed.csv" + with open(detailed_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'request_id', 'arrival_timestamp_ms', 'input_token_num', + 'output_token_num', 'ttft_ms', 'e2e_latency_ms', + 'total_tokens', 'success', 'error' + ]) + + for trace in traces: + writer.writerow([ + trace.request_id, + trace.arrival_timestamp_ms, + trace.input_token_num, + trace.output_token_num, + trace.ttft if trace.ttft is not None else '', + trace.e2e_latency if trace.e2e_latency is not None else '', + trace.total_tokens, + trace.success, + trace.error or '' + ]) + + # Save time-series files (for plotting) + if traces: + # TTFT time series + ttft_file = output_dir / f"{run_id}_trace_ttft.csv" + 
with open(ttft_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['request_index', 'ttft_ms']) + for i, trace in enumerate(traces): + if trace.ttft is not None: + writer.writerow([i, trace.ttft]) + + # E2E latency time series + latency_file = output_dir / f"{run_id}_trace_latency.csv" + with open(latency_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['request_index', 'e2e_latency_ms']) + for i, trace in enumerate(traces): + if trace.e2e_latency is not None: + writer.writerow([i, trace.e2e_latency]) + + logger.info(f"Trace results saved to {output_dir}") + return detailed_file From 225c458560fe73843876315d1834328cb00942a9 Mon Sep 17 00:00:00 2001 From: sunjqa1 Date: Thu, 11 Dec 2025 15:51:52 +0800 Subject: [PATCH 2/6] fix: correct service command string in _build_command_string --- .../configs/real_infinilm_service_small.json | 38 ++++++++++++++ .../configs/test_direct_infinilm.json | 36 ++++++++++++++ .../configs/test_direct_infinilm_real.json | 39 +++++++++++++++ .../configs/test_direct_infinilm_v2.json | 41 ++++++++++++++++ infinimetrics/inference/infer_runner_base.py | 49 ++++++++++++++++--- 5 files changed, 195 insertions(+), 8 deletions(-) create mode 100644 infinimetrics/inference/configs/real_infinilm_service_small.json create mode 100644 infinimetrics/inference/configs/test_direct_infinilm.json create mode 100644 infinimetrics/inference/configs/test_direct_infinilm_real.json create mode 100644 infinimetrics/inference/configs/test_direct_infinilm_v2.json diff --git a/infinimetrics/inference/configs/real_infinilm_service_small.json b/infinimetrics/inference/configs/real_infinilm_service_small.json new file mode 100644 index 0000000..0252565 --- /dev/null +++ b/infinimetrics/inference/configs/real_infinilm_service_small.json @@ -0,0 +1,38 @@ +{ + "run_id": "my.custom.runid", + "testcase": "infer.InfiniLM.Service", + "config": { + "model": "Qwen3-1.7B", + "model_path":"/var/qy_home/sunjinge/models/Qwen3-1.7B", + "model_config": "/var/qy_home/sunjinge/models/Qwen3-1.7B", + + "device": { + "gpu_platform": "nvidia", + "device_ids": [0], + "cpu_only": false + }, + + "train_dataset": null, + "validation_dataset": null, + "test_dataset": null, + "output_dir": "./test_output_service", + + "infer_args": { + "parallel": { + "dp": 1, + "tp": 1, + "pp": 1, + "sp": 1 + }, + "request_trace": "./test_trace.csv", + "concurrency": 4, + "max_seq_len": 2048, + "stream": true, + "timeout_ms": 30000 + }, + + "timeout_ms": 60000, + "warmup_iterations": 10, + "measured_iterations": 50 + } +} diff --git a/infinimetrics/inference/configs/test_direct_infinilm.json b/infinimetrics/inference/configs/test_direct_infinilm.json new file mode 100644 index 0000000..63f8222 --- /dev/null +++ b/infinimetrics/inference/configs/test_direct_infinilm.json @@ -0,0 +1,36 @@ +{ + "config": { + "run_id": "infer.infinilm.direct.test.2024120601", + "testcase": "infer.InfiniLM.Direct", + "model": "Qwen3-1.7B", + "model_path": "/home/sunjinge/model/Qwen3-1.7B", + "model_config": "/home/sunjinge/model/Qwen3-1.7B/config.json", + "device": { + "gpu_platform": "nvidia", + "device_ids": [0], + "cpu_only": false + }, + "train_dataset": null, + "validation_dataset": null, + "test_dataset": null, + "output_dir": "./test_output", + "infer_args": { + "parallel": { + "dp": 1, + "tp": 1, + "pp": 1, + "sp": 1 + }, + "static_batch_size": 1, + "prompt_token_num": 100, + "output_token_num": 50, + "max_seq_len": 2048, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50 + }, + "timeout_ms": 30000, + 
"warmup_iterations": 2, + "measured_iterations": 5 + } +} diff --git a/infinimetrics/inference/configs/test_direct_infinilm_real.json b/infinimetrics/inference/configs/test_direct_infinilm_real.json new file mode 100644 index 0000000..2c575f5 --- /dev/null +++ b/infinimetrics/inference/configs/test_direct_infinilm_real.json @@ -0,0 +1,39 @@ +{ + "run_id": "my.custom.runid", + "testcase": "infer.InfiniLM.Direct", + "config": { + "model": "Qwen3-1.7B", + "model_path": "/var/qy_home/sunjinge/models/Qwen3-1.7B", + "model_config": "/var/qy_home/sunjinge/models/Qwen3-1.7B", + + "device": { + "gpu_platform": "nvidia", + "device_ids": [0], + "cpu_only": false + }, + + "train_dataset": null, + "validation_dataset": null, + "test_dataset": "./test_perplexity_data.json", + "output_dir": "./test_output_real", + + "infer_args": { + "parallel": { + "dp": 1, + "tp": 1, + "pp": 1, + "sp": 1 + }, + "static_batch_size": 4, + "prompt_token_num": 128, + "output_token_num": 128, + "max_seq_len": 512, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50 + }, + + "warmup_iterations": 1, + "measured_iterations": 2 + } +} diff --git a/infinimetrics/inference/configs/test_direct_infinilm_v2.json b/infinimetrics/inference/configs/test_direct_infinilm_v2.json new file mode 100644 index 0000000..50bdbce --- /dev/null +++ b/infinimetrics/inference/configs/test_direct_infinilm_v2.json @@ -0,0 +1,41 @@ +{ + "config": { + "run_id": "infer.infinilm.direct.test.v2", + "testcase": "infer.InfiniLM.Direct", + + "model": "Qwen3-1.7B", + "model_path": "/home/sunjinge/model/Qwen3-1.7B", + "model_config": "/home/sunjinge/model/Qwen3-1.7B/config.json", + + "device": { + "gpu_platform": "nvidia", + "device_ids": [0], + "cpu_only": false + }, + + "train_dataset": null, + "validation_dataset": null, + "test_dataset": null, + "output_dir": "./test_output", + + "infer_args": { + "parallel": { + "dp": 1, + "tp": 1, + "pp": 1, + "sp": 1 + }, + "static_batch_size": 1, + "prompt_token_num": 100, + "output_token_num": 50, + "max_seq_len": 2048, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50 + }, + + "timeout_ms": 30000, + "warmup_iterations": 2, + "measured_iterations": 5 + } +} diff --git a/infinimetrics/inference/infer_runner_base.py b/infinimetrics/inference/infer_runner_base.py index 219d272..3999b7e 100644 --- a/infinimetrics/inference/infer_runner_base.py +++ b/infinimetrics/inference/infer_runner_base.py @@ -432,15 +432,39 @@ def _build_command_string(self) -> str: cmd_parts = [] if self.config.framework.value == "infinilm": - cmd_parts.append("python scripts/jiuge.py --nvidia") - cmd_parts.append(self.config.model_path) - cmd_parts.append(str(self.config.infer_args.parallel.tp)) - - # Add batch size parameter (if jiuge.py supports it) if self.config.mode.value == "direct": - if hasattr(self.config.infer_args, 'static_batch_size'): - cmd_parts.append(f"--batch-size {self.config.infer_args.static_batch_size}") + # Direct mode: use jiuge.py for direct inference + cmd_parts.append("python scripts/jiuge.py --nvidia") + cmd_parts.append(self.config.model_path) + cmd_parts.append(str(self.config.infer_args.parallel.tp)) + + # Add batch size parameter (if jiuge.py supports it) + if self.config.mode.value == "direct": + if hasattr(self.config.infer_args, 'static_batch_size'): + cmd_parts.append(f"--batch-size {self.config.infer_args.static_batch_size}") + else: # service mode + # Service mode: use launch_server.py to start inference service + cmd_parts.append("python scripts/launch_server.py") + cmd_parts.append(f"--model-path 
{self.config.model_path}") + cmd_parts.append(f"--dev nvidia") + cmd_parts.append(f"--ndev {self.config.infer_args.parallel.tp}") + + # Add common arguments + if hasattr(self.config.infer_args, 'max_batch'): + cmd_parts.append(f"--max-batch {self.config.infer_args.max_batch}") + elif hasattr(self.config.infer_args, 'max_seq_len'): + cmd_parts.append(f"--max-tokens {self.config.infer_args.max_seq_len}") + + # Add trace testing command (if trace is provided) + if hasattr(self.config.infer_args, 'request_trace'): + trace_cmd = ( + f"# Trace test: python trace_client.py " + f"--trace {self.config.infer_args.request_trace} " + f"--concurrency {self.config.infer_args.concurrency}" + ) + cmd_parts.append(trace_cmd) + else: # vllm if self.config.mode.value == "direct": cmd_parts.append("python -m vllm.benchmarks.benchmark_throughput") @@ -451,6 +475,15 @@ def _build_command_string(self) -> str: cmd_parts.append("python -m vllm.entrypoints.api_server") cmd_parts.append(f"--model {self.config.model_path}") cmd_parts.append(f"--port 8000") + cmd_parts.append(f"--tensor-parallel-size {self.config.infer_args.parallel.tp}") + + # Add trace serving command + if hasattr(self.config.infer_args, 'request_trace'): + trace_cmd = ( + f"# Trace test: python -m vllm.benchmarks.benchmark_serving " + f"--trace {self.config.infer_args.request_trace}" + ) + cmd_parts.append(trace_cmd) return " ".join(cmd_parts) @@ -579,4 +612,4 @@ def execute(self) -> None: @abc.abstractmethod def collect_metrics(self) -> None: """Collect performance metrics (to be implemented by subclasses)""" - pass \ No newline at end of file + pass From 01b71993dd1b9ff87bfc420d619af927fb9ca912 Mon Sep 17 00:00:00 2001 From: sunjinge <94775646+zzhfz@users.noreply.github.com> Date: Thu, 11 Dec 2025 16:09:17 +0800 Subject: [PATCH 3/6] Delete infinimetrics/inference/configs/real_infinilm_service_small.json --- .../configs/real_infinilm_service_small.json | 38 ------------------- 1 file changed, 38 deletions(-) delete mode 100644 infinimetrics/inference/configs/real_infinilm_service_small.json diff --git a/infinimetrics/inference/configs/real_infinilm_service_small.json b/infinimetrics/inference/configs/real_infinilm_service_small.json deleted file mode 100644 index 0252565..0000000 --- a/infinimetrics/inference/configs/real_infinilm_service_small.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "run_id": "my.custom.runid", - "testcase": "infer.InfiniLM.Service", - "config": { - "model": "Qwen3-1.7B", - "model_path":"/var/qy_home/sunjinge/models/Qwen3-1.7B", - "model_config": "/var/qy_home/sunjinge/models/Qwen3-1.7B", - - "device": { - "gpu_platform": "nvidia", - "device_ids": [0], - "cpu_only": false - }, - - "train_dataset": null, - "validation_dataset": null, - "test_dataset": null, - "output_dir": "./test_output_service", - - "infer_args": { - "parallel": { - "dp": 1, - "tp": 1, - "pp": 1, - "sp": 1 - }, - "request_trace": "./test_trace.csv", - "concurrency": 4, - "max_seq_len": 2048, - "stream": true, - "timeout_ms": 30000 - }, - - "timeout_ms": 60000, - "warmup_iterations": 10, - "measured_iterations": 50 - } -} From cae31d143a35f1df63418337fe340a6dd8fd864b Mon Sep 17 00:00:00 2001 From: sunjinge <94775646+zzhfz@users.noreply.github.com> Date: Thu, 11 Dec 2025 16:09:59 +0800 Subject: [PATCH 4/6] Delete infinimetrics/inference/configs/test_direct_infinilm_real.json --- .../configs/test_direct_infinilm_real.json | 39 ------------------- 1 file changed, 39 deletions(-) delete mode 100644 infinimetrics/inference/configs/test_direct_infinilm_real.json diff 
--git a/infinimetrics/inference/configs/test_direct_infinilm_real.json b/infinimetrics/inference/configs/test_direct_infinilm_real.json deleted file mode 100644 index 2c575f5..0000000 --- a/infinimetrics/inference/configs/test_direct_infinilm_real.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "run_id": "my.custom.runid", - "testcase": "infer.InfiniLM.Direct", - "config": { - "model": "Qwen3-1.7B", - "model_path": "/var/qy_home/sunjinge/models/Qwen3-1.7B", - "model_config": "/var/qy_home/sunjinge/models/Qwen3-1.7B", - - "device": { - "gpu_platform": "nvidia", - "device_ids": [0], - "cpu_only": false - }, - - "train_dataset": null, - "validation_dataset": null, - "test_dataset": "./test_perplexity_data.json", - "output_dir": "./test_output_real", - - "infer_args": { - "parallel": { - "dp": 1, - "tp": 1, - "pp": 1, - "sp": 1 - }, - "static_batch_size": 4, - "prompt_token_num": 128, - "output_token_num": 128, - "max_seq_len": 512, - "temperature": 0.7, - "top_p": 0.9, - "top_k": 50 - }, - - "warmup_iterations": 1, - "measured_iterations": 2 - } -} From 9d011b8c81175d1a35e1fe6f8766f637bac04425 Mon Sep 17 00:00:00 2001 From: sunjinge <94775646+zzhfz@users.noreply.github.com> Date: Thu, 11 Dec 2025 16:10:19 +0800 Subject: [PATCH 5/6] Delete infinimetrics/inference/configs/test_direct_infinilm_v2.json --- .../configs/test_direct_infinilm_v2.json | 41 ------------------- 1 file changed, 41 deletions(-) delete mode 100644 infinimetrics/inference/configs/test_direct_infinilm_v2.json diff --git a/infinimetrics/inference/configs/test_direct_infinilm_v2.json b/infinimetrics/inference/configs/test_direct_infinilm_v2.json deleted file mode 100644 index 50bdbce..0000000 --- a/infinimetrics/inference/configs/test_direct_infinilm_v2.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "config": { - "run_id": "infer.infinilm.direct.test.v2", - "testcase": "infer.InfiniLM.Direct", - - "model": "Qwen3-1.7B", - "model_path": "/home/sunjinge/model/Qwen3-1.7B", - "model_config": "/home/sunjinge/model/Qwen3-1.7B/config.json", - - "device": { - "gpu_platform": "nvidia", - "device_ids": [0], - "cpu_only": false - }, - - "train_dataset": null, - "validation_dataset": null, - "test_dataset": null, - "output_dir": "./test_output", - - "infer_args": { - "parallel": { - "dp": 1, - "tp": 1, - "pp": 1, - "sp": 1 - }, - "static_batch_size": 1, - "prompt_token_num": 100, - "output_token_num": 50, - "max_seq_len": 2048, - "temperature": 0.7, - "top_p": 0.9, - "top_k": 50 - }, - - "timeout_ms": 30000, - "warmup_iterations": 2, - "measured_iterations": 5 - } -} From 266abdfa1ffe25e4fe3e86ea61e7ad3578689a6e Mon Sep 17 00:00:00 2001 From: sunjinge <94775646+zzhfz@users.noreply.github.com> Date: Thu, 11 Dec 2025 16:10:37 +0800 Subject: [PATCH 6/6] Delete infinimetrics/inference/configs/test_direct_infinilm.json --- .../configs/test_direct_infinilm.json | 36 ------------------- 1 file changed, 36 deletions(-) delete mode 100644 infinimetrics/inference/configs/test_direct_infinilm.json diff --git a/infinimetrics/inference/configs/test_direct_infinilm.json b/infinimetrics/inference/configs/test_direct_infinilm.json deleted file mode 100644 index 63f8222..0000000 --- a/infinimetrics/inference/configs/test_direct_infinilm.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "config": { - "run_id": "infer.infinilm.direct.test.2024120601", - "testcase": "infer.InfiniLM.Direct", - "model": "Qwen3-1.7B", - "model_path": "/home/sunjinge/model/Qwen3-1.7B", - "model_config": "/home/sunjinge/model/Qwen3-1.7B/config.json", - "device": { - "gpu_platform": "nvidia", - 
"device_ids": [0], - "cpu_only": false - }, - "train_dataset": null, - "validation_dataset": null, - "test_dataset": null, - "output_dir": "./test_output", - "infer_args": { - "parallel": { - "dp": 1, - "tp": 1, - "pp": 1, - "sp": 1 - }, - "static_batch_size": 1, - "prompt_token_num": 100, - "output_token_num": 50, - "max_seq_len": 2048, - "temperature": 0.7, - "top_p": 0.9, - "top_k": 50 - }, - "timeout_ms": 30000, - "warmup_iterations": 2, - "measured_iterations": 5 - } -}