From b40f2a3df9279d54cf2528bd17a0181390f8b145 Mon Sep 17 00:00:00 2001 From: sunjqa1 Date: Thu, 11 Dec 2025 14:49:01 +0800 Subject: [PATCH 1/6] feat(inference): Add InfiniLM inference benchmark framework --- infinimetrics/inference/.gitkeep | 0 infinimetrics/inference/__init__.py | 37 ++ infinimetrics/inference/adapter_base.py | 272 ++++++++ .../inference/adapters/infinilm_adapter.py | 613 ++++++++++++++++++ infinimetrics/inference/create_test_trace.py | 30 + .../inference/direct_infer_runner.py | 326 ++++++++++ infinimetrics/inference/infer_config.py | 517 +++++++++++++++ infinimetrics/inference/infer_main.py | 262 ++++++++ infinimetrics/inference/infer_runner_base.py | 582 +++++++++++++++++ .../inference/infer_runner_factory.py | 146 +++++ .../inference/service_infer_runner.py | 438 +++++++++++++ infinimetrics/inference/utils/gpu_monitor.py | 158 +++++ .../inference/utils/prompt_generator.py | 543 ++++++++++++++++ .../inference/utils/token_generator.py | 235 +++++++ infinimetrics/inference/utils/trace_client.py | 527 +++++++++++++++ 15 files changed, 4686 insertions(+) delete mode 100644 infinimetrics/inference/.gitkeep create mode 100644 infinimetrics/inference/__init__.py create mode 100644 infinimetrics/inference/adapter_base.py create mode 100644 infinimetrics/inference/adapters/infinilm_adapter.py create mode 100644 infinimetrics/inference/create_test_trace.py create mode 100644 infinimetrics/inference/direct_infer_runner.py create mode 100644 infinimetrics/inference/infer_config.py create mode 100644 infinimetrics/inference/infer_main.py create mode 100644 infinimetrics/inference/infer_runner_base.py create mode 100644 infinimetrics/inference/infer_runner_factory.py create mode 100644 infinimetrics/inference/service_infer_runner.py create mode 100644 infinimetrics/inference/utils/gpu_monitor.py create mode 100644 infinimetrics/inference/utils/prompt_generator.py create mode 100644 infinimetrics/inference/utils/token_generator.py create mode 100644 infinimetrics/inference/utils/trace_client.py diff --git a/infinimetrics/inference/.gitkeep b/infinimetrics/inference/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/infinimetrics/inference/__init__.py b/infinimetrics/inference/__init__.py new file mode 100644 index 0000000..6d1812f --- /dev/null +++ b/infinimetrics/inference/__init__.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +""" +infinimetrics inference package +Unified Reasoning Evaluation Framework +""" + +__version__ = "1.0.0" +__author__ = "Infinimetrics Team" + +from .infer_config import ( + InferConfig, InferConfigManager, + InferMode, FrameworkType, + DirectInferArgs, ServiceInferArgs +) +from .infer_runner_base import ( + InferRunnerBase, BenchmarkResult, + Metric, ScalarMetric, TimeseriesMetric +) +from .adapter_base import InferAdapter +from .infer_runner_factory import InferRunnerFactory + +__all__ = [ + # config + "InferConfig", "InferConfigManager", + "InferMode", "FrameworkType", + "DirectInferArgs", "ServiceInferArgs", + + # Runner + "InferRunnerBase", "BenchmarkResult", + "Metric", "ScalarMetric", "TimeseriesMetric", + + # adapter + "InferAdapter", + + # factory + "InferRunnerFactory", +] diff --git a/infinimetrics/inference/adapter_base.py b/infinimetrics/inference/adapter_base.py new file mode 100644 index 0000000..ee96549 --- /dev/null +++ b/infinimetrics/inference/adapter_base.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +""" +Inference Adapter Base Class +Defines a unified interface for different frameworks (InfiniLM/vLLM) +""" + +import abc 
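+# Contract note: generate() is expected to return three parallel lists of equal
+# length -- the generated texts, per-prompt end-to-end latencies in milliseconds,
+# and per-prompt TTFT values in milliseconds -- and concrete load_model()
+# implementations are expected to set self.tokenizer and self.model_loaded before
+# generate() is used.  Illustrative call site (SomeAdapter is a hypothetical
+# concrete subclass, not part of this patch):
+#
+#     adapter = SomeAdapter(config)
+#     adapter.load_model()
+#     texts, latencies_ms, ttfts_ms = adapter.generate(["hello"], max_tokens=16)
+#     adapter.unload_model()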
+import logging +import random +from typing import List, Tuple, Optional, Dict, Any, Set +from typing import Set + +from infer_config import InferConfig + +logger = logging.getLogger(__name__) + +class InferAdapter(abc.ABC): + """Base class for inference adapters""" + + def __init__(self, config: InferConfig): + self.config = config + self.model_loaded = False + self.service_started = False + self.tokenizer = None + + @abc.abstractmethod + def load_model(self) -> None: + """ + Load model + Subclasses must implement the actual model loading logic + """ + pass + + @abc.abstractmethod + def unload_model(self) -> None: + """ + Unload model + Subclasses must implement model unloading and resource cleanup + """ + pass + + @abc.abstractmethod + def generate( + self, + prompts: List[str], + max_tokens: int, + temperature: float = 0.7, + top_p: float = 0.9, + top_k: int = 50 + ) -> Tuple[List[str], List[float], List[float]]: + """ + Generate text + Returns: (list of generated texts, list of latencies (ms), list of TTFT values (ms)) + """ + pass + + @abc.abstractmethod + def batch_generate( + self, + batch_prompts: List[List[str]], + max_tokens: int, + temperature: float = 0.7, + top_p: float = 0.9, + top_k: int = 50 + ) -> Tuple[List[List[str]], List[List[float]], List[List[float]]]: + """ + Batch text generation + Returns: (batch of generated texts, batch of latencies, batch of TTFT values) + """ + pass + + @abc.abstractmethod + def calculate_perplexity(self, test_data: List[str]) -> float: + """ + Calculate perplexity + """ + pass + + @abc.abstractmethod + def launch_service(self, port: int = 8000) -> None: + """ + Launch inference service + """ + pass + + @abc.abstractmethod + def stop_service(self) -> None: + """ + Stop inference service + """ + pass + + @abc.abstractmethod + def is_service_ready(self, port: int = 8000) -> bool: + """ + Check whether the service is ready + """ + pass + + @abc.abstractmethod + def get_service_url(self) -> str: + """ + Get service URL + """ + pass + + def get_vocab_size(self) -> int: + """Get vocabulary size""" + if self.tokenizer is None: + raise ValueError("Tokenizer not loaded") + return len(self.tokenizer) + + def get_special_token_ids(self) -> Set[int]: + """Get a set of special token IDs that should be excluded""" + if self.tokenizer is None: + return set() + + special_ids = set() + + # Retrieve tokenizer's special token map + special_tokens_map = getattr(self.tokenizer, 'special_tokens_map', {}) + + # Collect IDs of all special tokens + for key, token in special_tokens_map.items(): + if isinstance(token, int): + special_ids.add(token) + elif isinstance(token, str): + token_id = self.tokenizer.convert_tokens_to_ids(token) + if token_id is not None: + special_ids.add(token_id) + + # Add commonly used special tokens + common_special_tokens = [ + "bos_token", "eos_token", "pad_token", "unk_token", + "sep_token", "cls_token", "mask_token" + ] + + for token_name in common_special_tokens: + token = getattr(self.tokenizer, token_name, None) + if token is not None: + if isinstance(token, str): + token_id = self.tokenizer.convert_tokens_to_ids(token) + if token_id is not None: + special_ids.add(token_id) + elif hasattr(token, 'content'): + # Handle special token objects + token_id = self.tokenizer.convert_tokens_to_ids(token.content) + if token_id is not None: + special_ids.add(token_id) + + logger.debug(f"Found {len(special_ids)} special token IDs: {sorted(list(special_ids))}") + return special_ids + + def generate_random_tokens(self, num_tokens: int, 
exclude_special: bool = True) -> List[int]: + """ + Generate a sequence of random token IDs + + Args: + num_tokens: Number of tokens to generate + exclude_special: Whether to exclude special tokens + + Returns: + List of random token IDs + """ + if self.tokenizer is None: + raise ValueError("Tokenizer not loaded") + + vocab_size = self.get_vocab_size() + + if exclude_special: + special_ids = self.get_special_token_ids() + # Create valid token range (excluding special tokens) + all_ids = set(range(vocab_size)) + valid_ids = sorted(list(all_ids - special_ids)) + + if not valid_ids: + logger.warning("No valid tokens after excluding special tokens, using all tokens") + valid_ids = list(range(vocab_size)) + else: + valid_ids = list(range(vocab_size)) + + # Random sampling + tokens = random.choices(valid_ids, k=num_tokens) + + logger.debug(f"Generated {num_tokens} random tokens (vocab_size={vocab_size}, " + f"exclude_special={exclude_special})") + + return tokens + + def tokens_to_text(self, tokens: List[int]) -> str: + """Convert token IDs to text (for debugging)""" + if self.tokenizer is None: + raise ValueError("Tokenizer not loaded") + return self.tokenizer.decode(tokens, skip_special_tokens=True) + + def generate_random_prompt(self, token_count: int, exclude_special: bool = True) -> str: + """ + Generate a random prompt text + + Args: + token_count: Number of tokens in the prompt + exclude_special: Whether to exclude special tokens + + Returns: + A random prompt string + """ + tokens = self.generate_random_tokens(token_count, exclude_special) + return self.tokens_to_text(tokens) + + def generate_random_prompts(self, num_prompts: int, token_count: int, + exclude_special: bool = True) -> List[str]: + """ + Generate multiple random prompts + + Args: + num_prompts: Number of prompts to generate + token_count: Number of tokens in each prompt + exclude_special: Whether to exclude special tokens + + Returns: + List of random prompt strings + """ + prompts = [] + for i in range(num_prompts): + prompt = self.generate_random_prompt(token_count, exclude_special) + prompts.append(prompt) + + logger.info(f"Generated {num_prompts} random prompts, {token_count} tokens each") + return prompts + + def validate_config(self) -> List[str]: + """ + Validate adapter configuration + Returns: List of error messages + """ + errors = [] + + # Validate model path + import os + if not os.path.exists(self.config.model_path): + errors.append(f"Model path does not exist: {self.config.model_path}") + + # Validate framework-specific configuration + errors.extend(self._validate_framework_config()) + + return errors + + @abc.abstractmethod + def _validate_framework_config(self) -> List[str]: + """ + Validate framework-specific configuration + Subclasses must implement this + """ + pass + + def get_peak_memory_usage(self) -> Optional[float]: + """ + Get peak GPU memory usage (GB) + Subclasses can override this to implement specific memory monitoring + """ + try: + import torch + if torch.cuda.is_available(): + # Get peak memory usage across all GPUs + max_memory = 0 + for i in range(torch.cuda.device_count()): + max_memory = max(max_memory, torch.cuda.max_memory_allocated(i)) + return max_memory / (1024 ** 3) # Convert to GB + except ImportError: + logger.warning("PyTorch not available, cannot get GPU memory usage") + + return None diff --git a/infinimetrics/inference/adapters/infinilm_adapter.py b/infinimetrics/inference/adapters/infinilm_adapter.py new file mode 100644 index 0000000..f80607b --- /dev/null +++ 
b/infinimetrics/inference/adapters/infinilm_adapter.py @@ -0,0 +1,613 @@ +#!/usr/bin/env python3 +""" +InfiniLM Adapter Implementation +""" + +import os +import re +import torch +import sys +import time +import logging +import random +from pathlib import Path +from typing import List, Tuple, Optional, Dict, Any +import subprocess +import threading + +# Try to import InfiniLM related modules +try: + # Add scripts directory to path + scripts_dir = Path.cwd() / "scripts" + if scripts_dir.exists(): + sys.path.insert(0, str(scripts_dir)) + + # Import jiuge module + from jiuge import JiugeForCauslLM + from libinfinicore_infer import DeviceType + from infer_task import InferTask, KVCache + + INFINILM_AVAILABLE = True + logger = logging.getLogger(__name__) + logger.info("InfiniLM modules imported successfully") + +except ImportError as e: + INFINILM_AVAILABLE = False + logger = logging.getLogger(__name__) + logger.error(f"Failed to import InfiniLM modules: {e}") + raise + +from adapter_base import InferAdapter +from infer_config import InferConfig, InferMode, DirectInferArgs +from utils.token_generator import TokenGenerator, create_token_generator + +class InfiniLMAdapter(InferAdapter): + """InfiniLM adapter implementation - Fixed version (using API correctly)""" + + def __init__(self, config: InferConfig): + super().__init__(config) + + # InfiniLM specific attributes + self.jiuge_model: Optional[JiugeForCauslLM] = None + self.token_generator: Optional[TokenGenerator] = None + + # Service related + self.server_process: Optional[subprocess.Popen] = None + self.server_port = 8000 + + logger.info(f"InfiniLMAdapter created for model: {config.model}") + logger.info(f"Model path: {config.model_path}") + + def load_model(self) -> None: + """Load real InfiniLM model (fixed API usage)""" + if not INFINILM_AVAILABLE: + logger.error("InfiniLM modules not available. 
Cannot load model.") + raise ImportError("InfiniLM modules not available") + + logger.info(f"Loading real InfiniLM model from: {self.config.model_path}") + + try: + # Determine device type + device_type = self._get_device_type() + + # Get tp size (from infer_args.parallel) + tp_size = self.config.infer_args.parallel.tp + + # ✅ Fix 1: Correctly call JiugeForCauslLM constructor + # Original API: JiugeForCauslLM(model_dir_path, device, ndev, max_tokens=None) + self.jiuge_model = JiugeForCauslLM( + self.config.model_path, + device_type, # ✅ Not device=device_type + tp_size, + max_tokens=self.config.infer_args.max_seq_len # ✅ This parameter is optional + ) + + # Get tokenizer + self.tokenizer = self.jiuge_model.tokenizer + + # Create token generator + self.token_generator = create_token_generator( + self.tokenizer, + exclude_special_tokens=True + ) + + self.model_loaded = True + logger.info("Real InfiniLM model loaded successfully") + logger.info(f"Tokenizer vocab size: {self.get_vocab_size()}") + logger.info(f"Model max context length: {self.jiuge_model.max_context_len()}") + logger.info(f"EOS token ID: {self.jiuge_model.eos_token_id}") + + except Exception as e: + logger.error(f"Failed to load real InfiniLM model: {e}", exc_info=True) + raise + + def unload_model(self) -> None: + """Unload model""" + if self.jiuge_model: + try: + self.jiuge_model.destroy_model_instance() + logger.info("InfiniLM model unloaded") + except Exception as e: + logger.warning(f"Error unloading model: {e}") + + self.jiuge_model = None + + self.model_loaded = False + self.tokenizer = None + self.token_generator = None + + def generate( + self, + prompts: List[str], + max_tokens: int, + temperature: float = 0.7, + top_p: float = 0.9, + top_k: int = 50 + ) -> Tuple[List[str], List[float], List[float]]: + """ + Real InfiniLM inference implementation (fixed API usage) + + Args: + prompts: List of input prompts + max_tokens: Maximum tokens to generate per prompt + temperature, top_p, top_k: Sampling parameters + + Returns: + (List of generated texts, Latency list (ms), TTFT list (ms)) + """ + if not self.model_loaded or not self.jiuge_model: + raise RuntimeError("Model not loaded") + + logger.info(f"Real InfiniLM batch inference for {len(prompts)} prompts") + logger.info(f" Max tokens per prompt: {max_tokens}") + logger.info(f" Temperature: {temperature}, Top-p: {top_p}, Top-k: {top_k}") + + # 1. Encode prompts + token_lists = [] + for i, prompt in enumerate(prompts): + tokens = self.tokenizer.encode(prompt) + token_lists.append(tokens) + if i == 0: # Record first prompt information + logger.info(f" First prompt: {len(tokens)} tokens") + logger.debug(f" First prompt preview: {prompt[:100]}...") + + # 2. 
Create InferTask and KVCache for each prompt + tasks = [] + kv_caches = [] + + for i, tokens in enumerate(token_lists): + try: + # ✅ Fix 2: Correctly create InferTask + # InferTask parameters: id, tokens, max_tokens, temperature, topk, topp, topa, end_tokens + # topa parameter might be needed, set to 0 for now + topa = 0 # Assuming no top-a sampling needed + + # ✅ Fix 3: end_tokens should be a list + if isinstance(self.jiuge_model.eos_token_id, list): + end_tokens = self.jiuge_model.eos_token_id + else: + end_tokens = [self.jiuge_model.eos_token_id] + + # ✅ Fix 4: Use max_seq_len from config, not max_context_len + max_seq_len = min( + self.config.infer_args.max_seq_len, + self.jiuge_model.max_context_len() + ) + + task = InferTask( + id=i, + tokens=tokens, + max_tokens=max_seq_len, + temperature=temperature, + topk=top_k, + topp=top_p, + end_tokens=end_tokens + ) + + # Create and bind KVCache + kv_cache = KVCache(self.jiuge_model) + task.bind_kvcache(kv_cache) + + tasks.append(task) + kv_caches.append(kv_cache) + + logger.debug(f" Created InferTask {i}: {len(tokens)} input tokens") + + except Exception as e: + logger.error(f"Failed to create InferTask for prompt {i}: {e}") + raise + + logger.info(f"Created {len(tasks)} InferTasks for batch inference") + + # 3. Execute batch inference + generated_texts = [] + latencies = [] + ttfts = [] + + # Pre-allocate result lists for each prompt + all_generated_tokens = [[] for _ in range(len(tasks))] + + try: + # Measure TTFT (first batch inference) + start_time = time.perf_counter() + output_tokens_batch = self.jiuge_model.batch_infer_one_round(tasks) + ttft = (time.perf_counter() - start_time) * 1000 + + # ✅ Fix 5: batch_infer_one_round returns List[List[int]] + # Each task returns a token list (may contain multiple tokens) + for i, task_output in enumerate(output_tokens_batch): + if task_output: # May have output + # Take first token (if multiple tokens returned, take first) + first_token = task_output[0] if isinstance(task_output, list) else task_output + all_generated_tokens[i].append(first_token) + + # Record TTFT for this task (all tasks use same TTFT since batch inference) + ttfts.append(ttft) + else: + all_generated_tokens[i].append(0) # Placeholder + ttfts.append(0.0) + + # 4. 
Continue generating remaining tokens (token by token) + total_generated = 1 # Already generated first token + + while total_generated < max_tokens: + # Update status for all tasks + active_tasks = [] + active_indices = [] + + for i, task in enumerate(tasks): + if len(all_generated_tokens[i]) > 0: + last_token = all_generated_tokens[i][-1] + + # ✅ Fix 6: Correctly check EOS + if isinstance(self.jiuge_model.eos_token_id, list): + is_eos = last_token in self.jiuge_model.eos_token_id + else: + is_eos = last_token == self.jiuge_model.eos_token_id + + if not is_eos and len(all_generated_tokens[i]) < max_tokens: + task.next(last_token) + active_tasks.append(task) + active_indices.append(i) + + # If no active tasks, stop generation + if not active_tasks: + logger.info("All tasks reached EOS or max tokens") + break + + # Batch inference for active tasks + iteration_start = time.perf_counter() + active_outputs = self.jiuge_model.batch_infer_one_round(active_tasks) + iteration_time = (time.perf_counter() - iteration_start) * 1000 + + # Process outputs + for idx, task_idx in enumerate(active_indices): + if idx < len(active_outputs) and active_outputs[idx]: + next_token = active_outputs[idx][0] if isinstance(active_outputs[idx], list) else active_outputs[idx] + all_generated_tokens[task_idx].append(next_token) + + total_generated += 1 + + if total_generated % 10 == 0: + logger.debug(f" Generated {total_generated}/{max_tokens} tokens") + + # 5. Calculate total latency and decode text + total_latency = (time.perf_counter() - start_time) * 1000 + + for i, generated_tokens in enumerate(all_generated_tokens): + # Calculate latency for this prompt (for batch inference, all prompts have same latency) + latencies.append(total_latency) + + # Decode text + if generated_tokens: + generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True) + generated_texts.append(generated_text) + + logger.debug(f" Prompt {i}: {len(generated_tokens)} tokens generated") + if i == 0 and generated_text: + logger.debug(f" First generated text preview: {generated_text[:100]}...") + else: + generated_texts.append("") + logger.warning(f" Prompt {i}: No tokens generated") + + except Exception as e: + logger.error(f"Error during batch inference: {e}", exc_info=True) + raise + + finally: + # 6. Clean up KVCaches + logger.info("Cleaning up KVCaches...") + for i, kv_cache in enumerate(kv_caches): + try: + if kv_cache and self.jiuge_model: + kv_cache.drop(self.jiuge_model) + except Exception as e: + logger.warning(f"Failed to drop KV cache {i}: {e}") + + # 7. 
Return results + logger.info(f"Inference completed: {len(generated_texts)} prompts processed") + if latencies: + avg_latency = sum(latencies) / len(latencies) + avg_ttft = sum(ttfts) / len(ttfts) if ttfts else 0 + logger.info(f" Avg latency: {avg_latency:.2f}ms") + logger.info(f" Avg TTFT: {avg_ttft:.2f}ms") + + return generated_texts, latencies, ttfts + + def batch_generate( + self, + batch_prompts: List[List[str]], + max_tokens: int, + temperature: float = 0.7, + top_p: float = 0.9, + top_k: int = 50 + ) -> Tuple[List[List[str]], List[List[float]], List[List[float]]]: + """ + Batch text generation (multiple batches) + + Note: For large batches, we may need to split to avoid OOM + """ + logger.info(f"Batch generating for {len(batch_prompts)} batches") + + all_texts = [] + all_latencies = [] + all_ttfts = [] + + for batch_idx, prompts in enumerate(batch_prompts): + logger.info(f"Processing batch {batch_idx+1}/{len(batch_prompts)} " + f"({len(prompts)} prompts)") + + # Check batch size to avoid OOM + max_batch_size = 8 # Safe value, can adjust based on GPU memory + if len(prompts) > max_batch_size: + logger.warning(f"Batch size {len(prompts)} too large, splitting...") + + # Split processing + split_texts = [] + split_latencies = [] + split_ttfts = [] + + for i in range(0, len(prompts), max_batch_size): + sub_prompts = prompts[i:i + max_batch_size] + logger.info(f" Processing sub-batch {i//max_batch_size + 1}") + + texts, latencies, ttfts = self.generate( + sub_prompts, max_tokens, temperature, top_p, top_k + ) + + split_texts.extend(texts) + split_latencies.extend(latencies) + split_ttfts.extend(ttfts) + + all_texts.append(split_texts) + all_latencies.append(split_latencies) + all_ttfts.append(split_ttfts) + else: + texts, latencies, ttfts = self.generate( + prompts, max_tokens, temperature, top_p, top_k + ) + + all_texts.append(texts) + all_latencies.append(latencies) + all_ttfts.append(ttfts) + + return all_texts, all_latencies, all_ttfts + + def get_peak_memory_usage(self) -> Optional[float]: + """Get peak memory usage (GB)""" + try: + if torch.cuda.is_available(): + torch.cuda.synchronize() + + # Get peak memory (bytes) + max_memory_bytes = 0 + for i in range(torch.cuda.device_count()): + device_max = torch.cuda.max_memory_allocated(i) + if device_max > max_memory_bytes: + max_memory_bytes = device_max + + # Convert to GB + max_memory_gb = max_memory_bytes / (1024 ** 3) + + # Also get current memory usage + current_memory_bytes = torch.cuda.memory_allocated() + current_memory_gb = current_memory_bytes / (1024 ** 3) + + logger.info(f"GPU memory - Peak: {max_memory_gb:.2f} GB, Current: {current_memory_gb:.2f} GB") + return max_memory_gb + + except ImportError: + logger.warning("PyTorch not available, cannot get GPU memory usage") + except Exception as e: + logger.warning(f"Failed to get GPU memory usage: {e}") + + # Try to get via nvidia-smi (fallback method) + try: + + result = subprocess.run( + ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,nounits,noheader"], + capture_output=True, + text=True, + timeout=5 + ) + + if result.returncode == 0: + # Parse output, e.g.: "1234, 24576\n" + lines = result.stdout.strip().split('\n') + max_memory_mb = 0 + + for line in lines: + if line: + used_str, total_str = line.split(',') + used_mb = float(used_str.strip()) + total_mb = float(total_str.strip()) + + if used_mb > max_memory_mb: + max_memory_mb = used_mb + + if max_memory_mb > 0: + max_memory_gb = max_memory_mb / 1024 + logger.info(f"GPU memory (nvidia-smi): 
{max_memory_gb:.2f} GB") + return max_memory_gb + + except Exception as e: + logger.debug(f"nvidia-smi fallback failed: {e}") + + return None + + def calculate_perplexity(self, test_data: List[str]) -> float: + """Calculate perplexity""" + if not self.model_loaded or not self.jiuge_model: + raise RuntimeError("Model not loaded") + + logger.info(f"Calculating perplexity for {len(test_data)} test samples") + + try: + # Convert text to token sequences + test_sequences = [] + for text in test_data: + tokens = self.tokenizer.encode(text) + if len(tokens) > self.config.infer_args.max_seq_len: + tokens = tokens[:self.config.infer_args.max_seq_len] + test_sequences.append(tokens) + + # Use jiuge model's perplexity method + # Note: Need to test batch_size to avoid OOM + batch_size = min(4, len(test_sequences)) + perplexity = self.jiuge_model.perplexity(test_sequences, batch_size=batch_size) + + logger.info(f"Perplexity calculated: {perplexity:.4f}") + return perplexity + + except Exception as e: + logger.error(f"Failed to calculate perplexity: {e}") + # Return a default value, don't interrupt tests + return 0.0 + + def launch_service(self, port: int = 8000) -> None: + """Launch InfiniLM inference service""" + logger.info(f"Launching InfiniLM service on port {port}") + + # Build launch command + cmd = [ + sys.executable, "scripts/launch_server.py", + "--model-path", self.config.model_path, + "--dev", "nvidia", + "--ndev", str(self.config.infer_args.parallel.tp), + "--max-batch", "4" + ] + + if port != 8000: + cmd.extend(["--port", str(port)]) + + # Start service process + try: + self.server_process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + bufsize=1, + universal_newlines=True + ) + + self.server_port = port + self.service_started = True + + # Start thread to read output + self._start_output_reader() + + logger.info(f"InfiniLM service started with PID: {self.server_process.pid}") + logger.info(f"Command: {' '.join(cmd)}") + + except Exception as e: + logger.error(f"Failed to launch InfiniLM service: {e}") + raise + + def stop_service(self) -> None: + """Stop inference service""" + if self.server_process: + logger.info("Stopping InfiniLM service") + + try: + self.server_process.terminate() + self.server_process.wait(timeout=10) + logger.info("InfiniLM service stopped") + except subprocess.TimeoutExpired: + logger.warning("Service did not stop gracefully, forcing kill") + self.server_process.kill() + + self.server_process = None + + self.service_started = False + + def is_service_ready(self, port: int = 8000) -> bool: + """Simplified service readiness check - only check port""" + if not self.service_started or not self.server_process: + logger.debug("Service not started or no server process") + return False + + # Check if process is alive + if self.server_process.poll() is not None: + logger.error(f"Server process died with return code: {self.server_process.returncode}") + return False + + # Only check if port is open + import socket + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(1) + result = sock.connect_ex(('localhost', port)) + sock.close() + + if result == 0: + logger.debug(f"Port {port} is open, service is ready") + return True + else: + logger.debug(f"Port {port} not open yet (result={result})") + return False + except Exception as e: + logger.debug(f"Port check failed: {e}") + return False + + def get_service_url(self) -> str: + """Get service URL""" + return f"http://localhost:{self.server_port}" + + def 
_get_device_type(self): + """Get device type based on configuration""" + gpu_platform = self.config.device.gpu_platform.lower() + + if gpu_platform == "nvidia": + return DeviceType.DEVICE_TYPE_NVIDIA + elif gpu_platform == "cpu": + return DeviceType.DEVICE_TYPE_CPU + else: + logger.warning(f"Unknown GPU platform: {gpu_platform}, using NVIDIA as default") + return DeviceType.DEVICE_TYPE_NVIDIA + + def _start_output_reader(self): + """Start output reading thread""" + def read_output(): + if self.server_process: + for line in self.server_process.stdout: + logger.info(f"[InfiniLM Server] {line.strip()}") + + self.output_thread = threading.Thread(target=read_output, daemon=True) + self.output_thread.start() + + def _validate_framework_config(self) -> List[str]: + """Validate InfiniLM specific configuration""" + errors = [] + + # Check if scripts directory exists + scripts_dir = Path("scripts") + if not scripts_dir.exists(): + errors.append("scripts directory not found in current directory") + else: + # Check for necessary script files + required_scripts = ["jiuge.py", "launch_server.py"] + for script in required_scripts: + if not (scripts_dir / script).exists(): + errors.append(f"Required script not found: {script}") + + # Check parallel configuration + if self.config.infer_args.parallel.tp <= 0: + errors.append("Tensor parallel size (tp) must be positive") + + # Check model directory + model_dir = Path(self.config.model_path) + if not model_dir.exists(): + errors.append(f"Model directory does not exist: {model_dir}") + else: + # Check for necessary model files + model_files = list(model_dir.glob("*.safetensors")) + list(model_dir.glob("*.bin")) + if not model_files: + errors.append(f"No model files found in {model_dir}") + + # Check config.json + config_file = model_dir / "config.json" + if not config_file.exists(): + errors.append(f"config.json not found in {model_dir}") + + return errors \ No newline at end of file diff --git a/infinimetrics/inference/create_test_trace.py b/infinimetrics/inference/create_test_trace.py new file mode 100644 index 0000000..a5525bc --- /dev/null +++ b/infinimetrics/inference/create_test_trace.py @@ -0,0 +1,30 @@ +import csv +import random +from pathlib import Path + +def create_test_trace(output_file: str = "test_trace.csv", num_requests: int = 50): + """Create a test trace file""" + + with open(output_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['request_id', 'arrival_timestamp_ms', 'input_token_num', 'output_token_num']) + + current_time = 0 + + for i in range(num_requests): + request_id = f"req-{i:04d}" + + # Random arrival interval (0–100ms) + interval = random.uniform(0, 100) + current_time += interval + + # Random token counts + input_tokens = random.choice([64, 128, 256, 512]) + output_tokens = random.choice([64, 128, 256]) + + writer.writerow([request_id, round(current_time, 2), input_tokens, output_tokens]) + + print(f"Created test trace file: {output_file} with {num_requests} requests") + +if __name__ == "__main__": + create_test_trace() diff --git a/infinimetrics/inference/direct_infer_runner.py b/infinimetrics/inference/direct_infer_runner.py new file mode 100644 index 0000000..25580c2 --- /dev/null +++ b/infinimetrics/inference/direct_infer_runner.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +""" +Direct Inference Runner Implementation +Launch real model for batch inference testing +""" + +import logging +import time +import json +from pathlib import Path +from typing import List, Dict, Any +from datetime import datetime 
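+# Measurement notes (matching the implementation below):
+#   - per-iteration throughput is derived from the batch-average latency:
+#       throughput_tokens_per_s = (batch_size * output_token_num * 1000) / avg_latency_ms
+#   - total_tokens reported in collect_metrics() counts warmup as well as
+#     measured iterations.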
+ +from infer_runner_base import InferRunnerBase, ScalarMetric, TimeseriesMetric +from infer_config import InferConfig, DirectInferArgs + +logger = logging.getLogger(__name__) + +class DirectInferRunner(InferRunnerBase): + """Direct Inference Runner""" + + def __init__(self, config: InferConfig, adapter): + super().__init__(config, adapter) + self.infer_args: DirectInferArgs = config.infer_args + + # GPU monitor + self.gpu_monitor = None + + logger.info(f"DirectInferRunner created for batch_size={self.infer_args.static_batch_size}") + + def setup(self) -> None: + """Set up direct inference environment""" + logger.info("Setting up direct inference environment") + + # Create GPU monitor + device_ids = self.config.device.device_ids + if self.config.device.cpu_only: + logger.info("CPU-only mode, GPU monitoring disabled") + self.gpu_monitor = None + else: + from utils.gpu_monitor import create_gpu_monitor + self.gpu_monitor = create_gpu_monitor( + gpu_platform=self.config.device.gpu_platform, + device_ids=device_ids + ) + + # Start GPU monitoring + if self.gpu_monitor: + self.gpu_monitor.start_monitoring() + logger.info(f"GPU monitoring started for devices: {device_ids}") + + # Validate configuration + if self.infer_args.static_batch_size <= 0: + raise ValueError(f"Invalid batch size: {self.infer_args.static_batch_size}") + + # Load model + logger.info("Loading model via adapter") + self.adapter.load_model() + + def execute(self) -> None: + """Execute direct inference test""" + logger.info("Executing direct inference benchmark") + + # Generate test prompts + prompts = self._generate_test_prompts() + total_prompts = len(prompts) + + logger.info(f"Generated {total_prompts} test prompts") + logger.info(f"Input tokens per prompt: {self.infer_args.prompt_token_num}") + logger.info(f"Output tokens per prompt: {self.infer_args.output_token_num}") + + # Warmup phase + if self.config.warmup_iterations > 0: + logger.info(f"Warmup phase ({self.config.warmup_iterations} iterations)") + + warmup_prompts = prompts[:self.config.warmup_iterations * self.infer_args.static_batch_size] + + # Perform warmup in batches + for i in range(0, len(warmup_prompts), self.infer_args.static_batch_size): + batch_end = min(i + self.infer_args.static_batch_size, len(warmup_prompts)) + batch_prompts = warmup_prompts[i:batch_end] + + if not batch_prompts: + break + + logger.debug(f"Warmup batch {i//self.infer_args.static_batch_size + 1}") + + try: + _, _, _ = self.adapter.generate( + batch_prompts, + self.infer_args.output_token_num, + self.infer_args.temperature, + self.infer_args.top_p, + self.infer_args.top_k + ) + except Exception as e: + logger.warning(f"Warmup batch failed: {e}") + + # Actual testing phase + logger.info(f"Measurement phase ({self.config.measured_iterations} iterations)") + + measurement_start = time.perf_counter() + + for i in range(self.config.measured_iterations): + # Calculate prompt indices for current batch + start_idx = (self.config.warmup_iterations + i) * self.infer_args.static_batch_size + end_idx = start_idx + self.infer_args.static_batch_size + + if start_idx >= total_prompts: + logger.warning(f"Iteration {i+1}: Not enough prompts, reusing earlier prompts") + start_idx = i * self.infer_args.static_batch_size % total_prompts + end_idx = start_idx + self.infer_args.static_batch_size + + batch_prompts = prompts[start_idx:end_idx] + + logger.info(f"Running measurement iteration {i+1}/{self.config.measured_iterations}") + + # Call adapter to generate + generated_texts, latencies, ttfts = 
self.adapter.generate( + batch_prompts, + self.infer_args.output_token_num, + self.infer_args.temperature, + self.infer_args.top_p, + self.infer_args.top_k + ) + + # Collect data + for latency in latencies: + self.result.add_latency(latency) + for ttft in ttfts: + self.result.add_ttft(ttft) + + # Calculate throughput (tokens/s) + if latencies: + avg_latency = sum(latencies) / len(latencies) + # Calculate total tokens in this batch + batch_tokens = len(batch_prompts) * self.infer_args.output_token_num + throughput = (batch_tokens * 1000) / avg_latency if avg_latency > 0 else 0 + self.result.add_throughput(throughput) + + logger.info(f" Iteration {i+1}: avg_latency={avg_latency:.2f}ms, throughput={throughput:.2f} tokens/s") + else: + logger.warning(f" Iteration {i+1}: No latency data collected") + + # Show progress + progress = (i + 1) / self.config.measured_iterations * 100 + logger.info(f"Measurement progress: {progress:.1f}%") + + measurement_end = time.perf_counter() + total_measurement_time = measurement_end - measurement_start + logger.info(f"Measurement phase completed in {total_measurement_time:.2f}s") + + def collect_metrics(self) -> None: + """Collect direct inference metrics""" + logger.info("Collecting direct inference metrics") + + # Get peak memory usage + peak_memory = 0.0 + if self.gpu_monitor: + try: + self.gpu_monitor.stop_monitoring() + peak_memory = self.gpu_monitor.get_peak_memory_gb() + logger.info(f"Peak GPU memory usage: {peak_memory:.6f} GB") + except Exception as e: + logger.warning(f"Failed to get peak memory from GPU monitor: {e}") + + # Save to result + self.result.peak_memory_usage = peak_memory + + # Calculate total tokens + total_iterations = self.config.warmup_iterations + self.config.measured_iterations + total_batches = total_iterations * self.infer_args.static_batch_size + self.result.total_tokens = total_batches * self.infer_args.output_token_num + + logger.info(f"Total tokens generated: {self.result.total_tokens}") + + # Calculate throughput statistics + if self.result.throughput_data: + avg_throughput = sum(self.result.throughput_data) / len(self.result.throughput_data) + logger.info(f"Average throughput: {avg_throughput:.2f} tokens/s") + + # Add ppl and accuracy placeholders + # 1. Calculate perplexity (if test dataset exists) + if self.config.test_dataset: + try: + perplexity = self._calculate_perplexity() + # Use correct ScalarMetric class + self.result.add_metric(ScalarMetric( + name="infer.ppl", + value=perplexity, + unit=None + )) + logger.info(f"Perplexity calculated: {perplexity:.4f}") + except Exception as e: + logger.warning(f"Failed to calculate perplexity: {e}") + # Add placeholder + self.result.add_metric(ScalarMetric( + name="infer.ppl", + value=0.0, # placeholder + unit=None + )) + else: + # No test dataset, add placeholder + self.result.add_metric(ScalarMetric( + name="infer.ppl", + value=0.0, # placeholder + unit=None + )) + + # 2. 
Add accuracy placeholder + self.result.add_metric(ScalarMetric( + name="infer.accuracy", + value=0.0, # TODO: Actually calculate accuracy + unit=None + )) + + # Calculate statistics + stats = self.calculate_statistics() + + if 'avg_latency' in stats: + logger.info(f"Average latency: {stats['avg_latency']:.2f} ms") + + if 'avg_ttft' in stats: + logger.info(f"Average TTFT: {stats['avg_ttft']:.2f} ms") + + def _calculate_perplexity(self) -> float: + """Calculate perplexity""" + if not hasattr(self.adapter, 'calculate_perplexity'): + logger.warning("Adapter does not support perplexity calculation") + return 0.0 + + # Load test data + test_data = self._load_test_data() + if not test_data: + logger.warning("No test data available for perplexity calculation") + return 0.0 + + try: + return self.adapter.calculate_perplexity(test_data) + except Exception as e: + logger.error(f"Error calculating perplexity: {e}") + return 0.0 + + def _load_test_data(self) -> List[str]: + """Load test data""" + test_file = self.config.test_dataset + if not test_file: + logger.warning("No test dataset specified") + return [] + + # Ensure Path is imported when using it + test_path = Path(test_file) + if not test_path.exists(): + logger.warning(f"Test dataset file not found: {test_file}") + return [] + + try: + import json + with open(test_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + # Extract text based on data format + test_texts = [] + if isinstance(data, list): + for item in data: + if isinstance(item, dict): + text = item.get('text') or item.get('content') or item.get('prompt') + if text: + test_texts.append(str(text)) + elif isinstance(item, str): + test_texts.append(item) + elif isinstance(data, dict): + # Could be multiple keys + for key, value in data.items(): + if isinstance(value, str): + test_texts.append(value) + elif isinstance(value, list): + test_texts.extend([str(v) for v in value if isinstance(v, str)]) + + logger.info(f"Loaded {len(test_texts)} test samples from {test_file}") + return test_texts[:100] # Limit quantity to avoid excessive computation time + + except Exception as e: + logger.error(f"Failed to load test data: {e}") + return [] + + def _generate_test_prompts(self) -> List[str]: + """Generate test prompts""" + prompts = [] + base_template = "Please provide a detailed explanation about {topic}. 
" + + topics = [ + "artificial intelligence and its applications in healthcare", + "machine learning algorithms and their use cases", + "deep learning and neural networks", + "natural language processing techniques", + "computer vision and image recognition", + "reinforcement learning and autonomous systems", + "quantum computing and its potential impact", + "blockchain technology and decentralized applications", + "Internet of Things and smart devices", + "cloud computing and distributed systems" + ] + + total_prompts_needed = (self.config.warmup_iterations + self.config.measured_iterations) * self.infer_args.static_batch_size + + for i in range(total_prompts_needed): + topic = topics[i % len(topics)] + + # Create prompt of specified length + base_prompt = base_template.format(topic=topic) + repeat_count = max(1, self.infer_args.prompt_token_num // len(base_prompt)) + + prompt = base_prompt * repeat_count + prompt = prompt[:self.infer_args.prompt_token_num] + + # Add unique identifier + import random + import string + random_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6)) + prompt += f" [Request {i+1}:{random_suffix}]" + + prompts.append(prompt) + + logger.info(f"Generated {len(prompts)} test prompts") + return prompts \ No newline at end of file diff --git a/infinimetrics/inference/infer_config.py b/infinimetrics/inference/infer_config.py new file mode 100644 index 0000000..bb242be --- /dev/null +++ b/infinimetrics/inference/infer_config.py @@ -0,0 +1,517 @@ +#!/usr/bin/env python3 +""" +Inference Configuration Manager +Parses config.json, identifies direct/service mode, identifies infinilm/vllm framework +""" + +import json +import logging +from pathlib import Path +from typing import Dict, Any, Optional, List +from dataclasses import dataclass +from enum import Enum +from datetime import datetime +import random +import string + +logger = logging.getLogger(__name__) + +class InferMode(Enum): + """Inference mode enumeration""" + DIRECT = "direct" + SERVICE = "service" + +class FrameworkType(Enum): + """Framework type enumeration""" + INFINILM = "infinilm" + VLLM = "vllm" + +@dataclass +class ParallelConfig: + """Parallel configuration""" + dp: int = 1 + tp: int = 1 + pp: int = 1 + sp: int = 1 + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'ParallelConfig': + """Create parallel configuration from dictionary""" + if not data: + return cls() + return cls( + dp=data.get("dp", 1), + tp=data.get("tp", 1), + pp=data.get("pp", 1), + sp=data.get("sp", 1) + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + "dp": self.dp, + "tp": self.tp, + "pp": self.pp, + "sp": self.sp + } + +@dataclass +class DeviceConfig: + """Device configuration""" + gpu_platform: str = "nvidia" + device_ids: List[int] = None + cpu_only: bool = False + + def __post_init__(self): + if self.device_ids is None: + self.device_ids = [0] + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'DeviceConfig': + """Create device configuration from dictionary""" + if not data: + return cls() + return cls( + gpu_platform=data.get("gpu_platform", "nvidia"), + device_ids=data.get("device_ids", [0]), + cpu_only=data.get("cpu_only", False) + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + "gpu_platform": self.gpu_platform, + "device_ids": self.device_ids, + "cpu_only": self.cpu_only + } + +@dataclass +class DirectInferArgs: + """Direct inference arguments""" + parallel: ParallelConfig + static_batch_size: int + 
prompt_token_num: int + output_token_num: int = 128 + max_seq_len: int = 4096 + temperature: float = 0.7 + top_p: float = 0.9 + top_k: int = 50 + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'DirectInferArgs': + """Create direct inference arguments from dictionary""" + return cls( + parallel=ParallelConfig.from_dict(data.get("parallel", {})), + static_batch_size=data.get("static_batch_size", 1), + prompt_token_num=data.get("prompt_token_num", 1024), + output_token_num=data.get("output_token_num", 128), + max_seq_len=data.get("max_seq_len", 4096), + temperature=data.get("temperature", 0.7), + top_p=data.get("top_p", 0.9), + top_k=data.get("top_k", 50) + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + "parallel": self.parallel.to_dict(), + "static_batch_size": self.static_batch_size, + "prompt_token_num": self.prompt_token_num, + "output_token_num": self.output_token_num, + "max_seq_len": self.max_seq_len, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k + } + +@dataclass +class ServiceInferArgs: + """Service inference arguments""" + parallel: ParallelConfig + request_trace: str + concurrency: int = 32 + max_seq_len: int = 4096 + stream: bool = True + timeout_ms: int = 30000 + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'ServiceInferArgs': + """Create service inference arguments from dictionary""" + return cls( + parallel=ParallelConfig.from_dict(data.get("parallel", {})), + request_trace=data.get("request_trace", ""), + concurrency=data.get("concurrency", 32), + max_seq_len=data.get("max_seq_len", 4096), + stream=data.get("stream", True), + timeout_ms=data.get("timeout_ms", 30000) + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + "parallel": self.parallel.to_dict(), + "request_trace": self.request_trace, + "concurrency": self.concurrency, + "max_seq_len": self.max_seq_len, + "stream": self.stream, + "timeout_ms": self.timeout_ms + } + +@dataclass +class InferConfig: + """Main inference configuration class""" + # Basic information + run_id: str + testcase: str + model: str + model_path: str + model_config: Optional[str] + + # Dataset + train_dataset: Optional[str] + validation_dataset: Optional[str] + test_dataset: Optional[str] + + # Output + output_dir: str + + # Execution mode + mode: InferMode + framework: FrameworkType + + # Device configuration + device: DeviceConfig + + # Inference arguments (varies by mode) + infer_args: Any # DirectInferArgs or ServiceInferArgs + + # Execution parameters + timeout_ms: int + warmup_iterations: int + measured_iterations: int + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any]) -> 'InferConfig': + """Create configuration object from dictionary - implemented with clear logic""" + # 1. Read outer level + outer_run_id = config_dict.get("run_id") + outer_testcase = config_dict.get("testcase") + config_data = config_dict.get("config", {}) + + # 2. Read inner level (for error checking) + inner_run_id = config_data.get("run_id") + inner_testcase = config_data.get("testcase") + + # 3. Process testcase + # Rule 2.1: If outer level has testcase → use it + if outer_testcase: + testcase = outer_testcase + logger.info(f"Using outer testcase: {testcase}") + + # Rule 2.2: If inner level has testcase → raise error + elif inner_testcase: + raise ValueError( + "testcase must be at the outer level, not inside 'config'. " + f"Found: '{inner_testcase}' inside 'config'. " + "Please move it to the outer level." 
+ ) + + # Rule 2.3: If no testcase at either level → raise error + else: + raise ValueError( + "testcase is required at the outer level of the config. " + "Example: {\"testcase\": \"infer.InfiniLM.Direct\", ...}" + ) + + testcase = outer_testcase + logger.info(f"Using testcase: {testcase}") + + # 4. Process run_id + # Rule 1.2: If inner level has run_id → raise error + if inner_run_id: + raise ValueError( + "run_id must be at the outer level, not inside 'config'. " + f"Found: '{inner_run_id}' inside 'config'. " + "Please move it to the outer level or remove it to auto-generate." + ) + + # Rule 1.1: If outer level has run_id → use it (add timestamp+random code to prevent overwriting) + elif outer_run_id: + run_id = cls._enhance_user_run_id(outer_run_id) + logger.info(f"Using enhanced user-provided run_id: {run_id}") + else: + # Rule 1.3: Auto-generate run_id + run_id = cls._generate_auto_run_id(testcase) + logger.info(f"Auto-generated run_id: {run_id}") + + # 5. Parse mode and framework from testcase + testcase_lower = testcase.lower() + + # Determine inference mode + if "service" in testcase_lower: + mode = InferMode.SERVICE + elif "direct" in testcase_lower: + mode = InferMode.DIRECT + else: + mode = InferMode.DIRECT + + # Determine framework + if "vllm" in testcase_lower: + framework = FrameworkType.VLLM + elif "infinilm" in testcase_lower: + framework = FrameworkType.INFINILM + else: + raise ValueError(f"Cannot determine framework from testcase: {testcase}") + + # 6. Parse model_path + model_path = config_data.get("model_path") + model_config = config_data.get("model_config") + + if not model_path and model_config: + model_path = str(Path(model_config).parent) + logger.info(f"Inferred model_path from model_config: {model_path}") + elif not model_path: + raise ValueError("Either model_path or model_config must be provided in config") + + # 7. Parse inference arguments + infer_args_dict = config_data.get("infer_args", {}) + if mode == InferMode.DIRECT: + infer_args = DirectInferArgs.from_dict(infer_args_dict) + else: + infer_args = ServiceInferArgs.from_dict(infer_args_dict) + + # 8. 
Parse device configuration + device_config = DeviceConfig.from_dict(config_data.get("device", {})) + + return cls( + run_id=run_id, + testcase=testcase, + model=config_data.get("model", "unknown"), + model_path=model_path, + model_config=model_config, + + train_dataset=config_data.get("train_dataset"), + validation_dataset=config_data.get("validation_dataset"), + test_dataset=config_data.get("test_dataset"), + + output_dir=config_data.get("output_dir", "./test_output"), + + mode=mode, + framework=framework, + device=device_config, + infer_args=infer_args, + + timeout_ms=config_data.get("timeout_ms", 30000)if mode == InferMode.SERVICE else None, + warmup_iterations=config_data.get("warmup_iterations", 10), + measured_iterations=config_data.get("measured_iterations", 100) + ) + + def _enhance_user_run_id(user_run_id: str) -> str: + """ + Enhance user-provided run_id by adding timestamp and random code to prevent overwriting + + Args: + user_run_id: User-provided run_id + + Returns: + Enhanced run_id: {user_run_id}.{timestamp}.{random8} + """ + # If already contains timestamp and random code, return directly (prevent duplicate addition) + import re + timestamp_pattern = r'\.\d{8}_\d{6}\.[a-z0-9]{8}$' + if re.search(timestamp_pattern, user_run_id): + logger.info(f"User run_id already contains timestamp and random code: {user_run_id}") + return user_run_id + + # Add timestamp and random code + from datetime import datetime + import random + import string + + # Clean user run_id + cleaned_user_id = user_run_id.strip().strip(".").replace("..", ".") + + # Timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # 8-character random code + random_suffix = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8)) + + # Combine + enhanced_run_id = f"{cleaned_user_id}.{timestamp}.{random_suffix}" + + logger.info(f"Enhanced user run_id: {user_run_id} -> {enhanced_run_id}") + return enhanced_run_id + + @staticmethod + def _generate_auto_run_id(testcase: str) -> str: + """ + Auto-generate run_id + Format: {testcase}.{timestamp}.{random8} + Example: infer.InfiniLM.Direct.20251210_143025.a1b2c3d4 + """ + # Clean testcase + cleaned_testcase = testcase.strip().strip(".").replace("..", ".") + + # Timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # 8-character random code + random_suffix = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8)) + + # Combine + run_id = f"{cleaned_testcase}.{timestamp}.{random_suffix}" + + return run_id + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary (for JSON output)""" + return { + "run_id": self.run_id, + "testcase": self.testcase, + "mode": self.mode.value, + "framework": self.framework.value, + "model": self.model, + "model_path": self.model_path, + "model_config": self.model_config, + "device": self.device.to_dict(), + "infer_args": self.infer_args.to_dict() if hasattr(self.infer_args, 'to_dict') else {}, + "warmup_iterations": self.warmup_iterations, + "measured_iterations": self.measured_iterations + } + if self.mode == InferMode.SERVICE and self.timeout_ms is not None: + result["timeout_ms"] = self.timeout_ms + + return result + +class InferConfigManager: + """Inference Configuration Manager""" + + @staticmethod + def load_config(config_file: str) -> InferConfig: + """Load configuration from config file""" + try: + with open(config_file, 'r', encoding='utf-8') as f: + config_data = json.load(f) + + # Support single config or config list + if isinstance(config_data, dict): + config_dict = 
config_data + elif isinstance(config_data, list) and len(config_data) > 0: + config_dict = config_data[0] + else: + raise ValueError(f"Invalid config format in {config_file}") + + # Create configuration object + config = InferConfig.from_dict(config_dict) + + # Validate configuration + errors = InferConfigManager.validate_config(config) + if errors: + error_msg = "Configuration validation errors:\n" + "\n".join(f" - {e}" for e in errors) + logger.error(error_msg) + raise ValueError(error_msg) + + return config + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse JSON config file: {e}") + raise + except Exception as e: + logger.error(f"Failed to load config from {config_file}: {e}") + raise + + @staticmethod + def validate_config(config: InferConfig) -> List[str]: + """Validate configuration effectiveness""" + errors = [] + + # Basic validation + if not config.run_id: + errors.append("run_id cannot be empty") + + if not config.testcase: + errors.append("testcase cannot be empty") + + if not config.model or config.model == "unknown": + errors.append("model name is required") + + # Model path validation + model_dir = Path(config.model_path) + if not model_dir.exists(): + errors.append(f"Model directory does not exist: {model_dir}") + else: + # Check config.json + config_file = model_dir / "config.json" + if not config_file.exists(): + errors.append(f"config.json not found in model directory: {model_dir}") + + # Mode-specific validation + if config.mode == InferMode.DIRECT: + if not isinstance(config.infer_args, DirectInferArgs): + errors.append("Direct mode requires DirectInferArgs") + else: + if config.infer_args.static_batch_size <= 0: + errors.append("Batch size must be positive") + if config.infer_args.prompt_token_num <= 0: + errors.append("Prompt token number must be positive") + + elif config.mode == InferMode.SERVICE: + if not isinstance(config.infer_args, ServiceInferArgs): + errors.append("Service mode requires ServiceInferArgs") + else: + if not config.infer_args.request_trace: + errors.append("Request trace is required for service mode") + if config.infer_args.concurrency <= 0: + errors.append("Concurrency must be positive") + + # Verify trace file exists + trace_path = Path(config.infer_args.request_trace) + if not trace_path.exists(): + errors.append(f"Trace file not found: {trace_path}") + + # Output directory validation + output_dir = Path(config.output_dir) + try: + output_dir.mkdir(parents=True, exist_ok=True) + test_file = output_dir / ".test_write" + test_file.write_text("test") + test_file.unlink() + except Exception as e: + errors.append(f"Output directory is not writable: {e}") + + return errors + + @staticmethod + def generate_auto_run_id(testcase: str) -> str: + """ + Auto-generate run_id (public method) + + Format: {testcase}.{timestamp}.{random8} + Example: infer.InfiniLM.Direct.20251210_143025.a1b2c3d4 + + Args: + testcase: testcase string + + Returns: + Generated run_id + """ + # Clean testcase + cleaned_testcase = testcase.strip().strip(".").replace("..", ".") + + # Timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # 8-character random code + random_suffix = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8)) + + # Combine + run_id = f"{cleaned_testcase}.{timestamp}.{random_suffix}" + + return run_id + + # Original private method calls public method + @staticmethod + def _generate_auto_run_id(testcase: str) -> str: + """Private method, calls public method (maintain backward compatibility)""" + return 
InferConfig.generate_auto_run_id(testcase) \ No newline at end of file diff --git a/infinimetrics/inference/infer_main.py b/infinimetrics/inference/infer_main.py new file mode 100644 index 0000000..2eab84b --- /dev/null +++ b/infinimetrics/inference/infer_main.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python3 +""" +Main entry point for inference evaluation +Usage: +cd ~/InfiniLM # or vLLM directory +python /path/to/infinimetrics/inference/infer_main.py --config config.json +""" + +import argparse +import os +import sys +import json +import logging +import traceback +from pathlib import Path + +# Add current directory to Python path +current_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, current_dir) + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(sys.stdout), + logging.FileHandler('infer_benchmark.log', mode='w') + ] +) + +logger = logging.getLogger(__name__) + +def setup_logging(verbose: bool): + """Set logging level""" + if verbose: + logging.getLogger().setLevel(logging.DEBUG) + logger.debug("Verbose logging enabled") + else: + logging.getLogger().setLevel(logging.INFO) + +def check_current_directory(): + """Check current directory and provide guidance""" + current_dir = os.getcwd() + logger.info(f"Current working directory: {current_dir}") + + # Check if we're in InfiniLM or vLLM directory + possible_frameworks = [] + + if os.path.exists("scripts/jiuge.py"): + possible_frameworks.append("InfiniLM") + + if os.path.exists("vllm") or "site-packages/vllm" in current_dir: + possible_frameworks.append("vLLM") + + if possible_frameworks: + logger.info(f"Detected framework(s): {', '.join(possible_frameworks)}") + else: + logger.warning("No known inference framework detected in current directory") + logger.warning("Please run this script from either:") + logger.warning(" 1. InfiniLM directory (contains scripts/jiuge.py)") + logger.warning(" 2. vLLM directory or vLLM installation directory") + +def load_config(config_file: str): + """Load configuration file""" + from infer_config import InferConfigManager + + if not os.path.exists(config_file): + logger.error(f"Config file not found: {config_file}") + sys.exit(1) + + try: + config = InferConfigManager.load_config(config_file) + logger.info(f"Configuration loaded successfully: {config.run_id}") + return config + except Exception as e: + logger.error(f"Failed to load configuration: {e}") + sys.exit(1) + +def check_dependencies(): + """Check for required dependencies""" + from infer_runner_factory import InferRunnerFactory + + dependencies = InferRunnerFactory.check_dependencies() + + logger.info("Dependency check:") + for dep, available in dependencies.items(): + status = "✓" if available else "✗" + logger.info(f" {status} {dep}") + + # Check for essential dependencies + if not dependencies["numpy"]: + logger.warning("NumPy is not installed. Some statistics may not be available.") + + return dependencies + +def main(): + """Main function""" + parser = argparse.ArgumentParser( + description="Unified Inference Benchmark Framework", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Usage examples: + 1. Run from InfiniLM directory: + cd ~/InfiniLM + python /path/to/infinimetrics/inference/infer_main.py --config config.json + + 2. Run from vLLM directory: + cd ~/vllm + python /path/to/infinimetrics/inference/infer_main.py --config config.json + + 3. 
Enable verbose logging: + python infer_main.py --config config.json --verbose + """ + ) + + parser.add_argument( + "--config", + type=str, + required=False, + help="Path to configuration file (JSON format)" + ) + + parser.add_argument( + "--validate-only", + action="store_true", + help="Only validate configuration, do not execute tests" + ) + + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose logging" + ) + + parser.add_argument( + "--check-deps", + action="store_true", + help="Check dependencies and exit" + ) + + parser.add_argument( + "--output-dir", + type=str, + help="Override output directory from configuration" + ) + + args = parser.parse_args() + + # Set logging level + setup_logging(args.verbose) + + logger.info("=" * 60) + logger.info("Unified Inference Benchmark Framework") + logger.info("=" * 60) + + # Check current directory + check_current_directory() + + # Check dependencies + if args.check_deps: + check_dependencies() + sys.exit(0) + + if not args.config: + parser.error("the following arguments are required: --config") + + # Load configuration + config = load_config(args.config) + + # Override output directory if specified + if args.output_dir: + config.output_dir = args.output_dir + logger.info(f"Output directory overridden: {config.output_dir}") + + # Validate configuration + from infer_config import InferConfigManager + errors = InferConfigManager.validate_config(config) + + if errors: + logger.warning("Configuration validation warnings:") + for error in errors: + logger.warning(f" - {error}") + + if args.validate_only: + logger.info("Configuration validation completed") + if not errors: + logger.info("✓ Configuration is valid") + else: + logger.warning("⚠ Configuration has warnings but may still work") + sys.exit(0) + + # Check dependencies + dependencies = check_dependencies() + + # Check if framework is available + if config.framework.value == "infinilm" and not dependencies["infinilm"]: + logger.error("InfiniLM not detected in current directory") + logger.error("Please run this script from InfiniLM directory") + sys.exit(1) + + if config.framework.value == "vllm" and not dependencies["vllm"]: + logger.error("vLLM not detected or not installed") + logger.error("Please install vLLM or run from vLLM directory") + sys.exit(1) + + try: + # Create Runner and Adapter + from infer_runner_factory import InferRunnerFactory + runner, adapter = InferRunnerFactory.create_runner_and_adapter(config) + + # Run benchmark + logger.info(f"Starting benchmark: {config.run_id}") + logger.info(f"Mode: {config.mode.value}, Framework: {config.framework.value}") + + result_file = runner.run() + + if not isinstance(result_file, str): + logger.error(f"Expected string result file path, got: {type(result_file)}") + if isinstance(result_file, dict): + # Emergency handling: print results directly + logger.info("Results (dict format):") + logger.info(json.dumps(result_file, indent=2)) + # Attempt to save to file + emergency_file = Path(config.output_dir) / "infer" / f"emergency_{config.run_id}_results.json" + with open(emergency_file, 'w') as f: + json.dump(result_file, f, indent=2) + result_file = str(emergency_file) + else: + raise TypeError(f"Result file must be string, got {type(result_file)}") + + logger.info("=" * 60) + logger.info("BENCHMARK COMPLETED SUCCESSFULLY") + logger.info("=" * 60) + logger.info(f"Results saved to: {result_file}") + + # Output result location + result_path = Path(result_file) + if result_path.exists(): + with open(result_path, 'r') as 
f: + result_data = json.load(f) + success = result_data.get("success", 0) + logger.info(f"Benchmark success status: {success}") + else: + logger.warning(f"Result file not found: {result_file}") + + return 0 + + except ImportError as e: + logger.error(f"Import error: {e}") + logger.error("This may be because:") + logger.error(" 1. You're not in the correct framework directory") + logger.error(" 2. The framework is not properly installed") + logger.error(" 3. The adapter implementation is missing") + return 1 + + except Exception as e: + logger.error(f"Benchmark failed with error: {e}") + logger.error(traceback.format_exc()) + return 1 + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/infinimetrics/inference/infer_runner_base.py b/infinimetrics/inference/infer_runner_base.py new file mode 100644 index 0000000..219d272 --- /dev/null +++ b/infinimetrics/inference/infer_runner_base.py @@ -0,0 +1,582 @@ +#!/usr/bin/env python3 +""" +Inference Runner Base Class +Defines common interfaces and template methods for all Runners +""" + +import abc +import csv +import json +import time +import logging +from pathlib import Path +from typing import Dict, Any, List, Optional, Tuple +from datetime import datetime +import re + +from infer_config import InferConfig + +logger = logging.getLogger(__name__) + +class Metric: + """Base metric class""" + + def __init__(self, name: str, metric_type: str, unit: Optional[str] = None): + self.name = name + self.type = metric_type + self.unit = unit + self.value = None + self.raw_data_url = None + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + result = { + "name": self.name, + "type": self.type, + "unit": self.unit + } + + if self.type == "scalar": + result["value"] = self.value + elif self.type == "timeseries": + result["raw_data_url"] = self.raw_data_url + + return result + +class ScalarMetric(Metric): + """Scalar metric""" + + def __init__(self, name: str, value: Any, unit: Optional[str] = None): + super().__init__(name, "scalar", unit) + self.value = value + + +class TimeseriesMetric(Metric): + """Time series metric""" + + def __init__(self, name: str, raw_data_url: str, unit: Optional[str] = None): + super().__init__(name, "timeseries", unit) + self.raw_data_url = raw_data_url + +class BenchmarkResult: + """Benchmark result container""" + + def __init__(self): + self.metrics: List[Metric] = [] + self.start_time: Optional[float] = None + self.end_time: Optional[float] = None + + # Performance data + self.latency_data: List[float] = [] # Latency data (milliseconds) + self.ttft_data: List[float] = [] # Time to first token data (milliseconds) + self.throughput_data: List[float] = [] # Throughput data + + # Additional data + self.peak_memory_usage: Optional[float] = None # Peak memory usage (GB) + self.total_tokens: int = 0 # Total token count + self.success_rate: float = 1.0 # Success rate + + def add_metric(self, metric: Metric): + """Add a metric""" + self.metrics.append(metric) + + def add_latency(self, latency_ms: float): + """Add latency data""" + self.latency_data.append(latency_ms) + + def add_ttft(self, ttft_ms: float): + """Add TTFT data""" + self.ttft_data.append(ttft_ms) + + def add_throughput(self, throughput: float): + """Add throughput data""" + self.throughput_data.append(throughput) + +class InferRunnerBase(abc.ABC): + """Inference Runner base class (Template Method Pattern)""" + + def __init__(self, config: InferConfig, adapter): + self.config = config + self.adapter = adapter + 
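+        # self.result below accumulates the raw per-request samples (latency, TTFT,
+        # throughput); run() drives setup()/execute()/collect_metrics() in order and
+        # then persists the CSV timeseries and the summary JSON.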
self.result = BenchmarkResult() + self.infer_dir: Optional[Path] = None + + def prepare_output_dir(self) -> Path: + """Prepare output directory""" + output_dir = Path(self.config.output_dir) + self.infer_dir = output_dir / "infer" + + # Create directory + self.infer_dir.mkdir(parents=True, exist_ok=True) + + logger.info(f"Output directory prepared: {self.infer_dir}") + return self.infer_dir + + def save_timeseries_data(self): + """Save time series data to CSV files""" + if not self.infer_dir: + raise ValueError("Output directory not prepared") + + # Save latency data + if self.result.latency_data: + # ✅ Clean filename + safe_run_id = self._sanitize_filename(self.config.run_id) + latency_file = self.infer_dir / f"{safe_run_id}_infer_latency.csv" + + with open(latency_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['timestamp', 'latency_ms']) + for i, latency in enumerate(self.result.latency_data): + writer.writerow([i, latency]) + + # Add different metrics based on mode + if self.config.mode.value == "direct": + metric_name = "infer.compute_latency" + else: + metric_name = "infer.e2e_latency" + + self.result.add_metric(TimeseriesMetric( + name=metric_name, + raw_data_url=f"./infer/{latency_file.name}", + unit="ms" + )) + + # Save TTFT data + if self.result.ttft_data: + safe_run_id = self._sanitize_filename(self.config.run_id) + ttft_file = self.infer_dir / f"{safe_run_id}_infer_ttft.csv" + + with open(ttft_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['timestamp', 'ttft_ms']) + for i, ttft in enumerate(self.result.ttft_data): + writer.writerow([i, ttft]) + + self.result.add_metric(TimeseriesMetric( + name="infer.ttft", + raw_data_url=f"./infer/{ttft_file.name}", + unit="ms" + )) + + # Save throughput data + if self.result.throughput_data: + safe_run_id = self._sanitize_filename(self.config.run_id) + throughput_file = self.infer_dir / f"{safe_run_id}_infer_throughput.csv" + + with open(throughput_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['timestamp', 'throughput']) + for i, throughput in enumerate(self.result.throughput_data): + writer.writerow([i, throughput]) + + if self.config.mode.value == "direct": + metric_name = "infer.direct_throughput_tps" + unit = "tokens/s/gpu" + else: + metric_name = "infer.response_per_second" + unit = "requests/s" + + self.result.add_metric(TimeseriesMetric( + name=metric_name, + raw_data_url=f"./infer/{throughput_file.name}", + unit=unit + )) + + def calculate_statistics(self) -> Dict[str, float]: + """Calculate statistical information""" + stats = {} + + try: + import numpy as np + + # Calculate latency statistics + if self.result.latency_data: + stats['avg_latency'] = np.mean(self.result.latency_data) + stats['p50_latency'] = np.percentile(self.result.latency_data, 50) + stats['p95_latency'] = np.percentile(self.result.latency_data, 95) + stats['p99_latency'] = np.percentile(self.result.latency_data, 99) + stats['min_latency'] = np.min(self.result.latency_data) + stats['max_latency'] = np.max(self.result.latency_data) + stats['std_latency'] = np.std(self.result.latency_data) + + # Calculate TTFT statistics + if self.result.ttft_data: + stats['avg_ttft'] = np.mean(self.result.ttft_data) + stats['p50_ttft'] = np.percentile(self.result.ttft_data, 50) + stats['p95_ttft'] = np.percentile(self.result.ttft_data, 95) + stats['p99_ttft'] = np.percentile(self.result.ttft_data, 99) + stats['min_ttft'] = np.min(self.result.ttft_data) + stats['max_ttft'] = np.max(self.result.ttft_data) + + 
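+                # Note: np.percentile above interpolates linearly between samples by
+                # default, e.g. np.percentile([10, 20, 30, 40], 95) evaluates to 38.5
+                # rather than 40.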
# Calculate throughput statistics + if self.result.throughput_data: + stats['avg_throughput'] = np.mean(self.result.throughput_data) + stats['max_throughput'] = np.max(self.result.throughput_data) + stats['min_throughput'] = np.min(self.result.throughput_data) + + # Calculate total time and throughput rate + if self.result.start_time and self.result.end_time: + total_duration = self.result.end_time - self.result.start_time + stats['total_duration'] = total_duration + + if self.config.mode.value == "direct": + if self.result.total_tokens > 0 and total_duration > 0: + stats['overall_throughput'] = self.result.total_tokens / total_duration + + elif self.config.mode.value == "service": + if len(self.result.latency_data) > 0 and total_duration > 0: + stats['requests_per_second'] = len(self.result.latency_data) / total_duration + + # Success rate + stats['success_rate'] = self.result.success_rate + + except ImportError: + logger.warning("NumPy not available, skipping statistics calculation") + + return stats + + def dump_json(self) -> str: + """Output JSON results - ensure string path is returned""" + if not self.infer_dir: + raise ValueError("Output directory not prepared") + + safe_run_id = self._sanitize_filename(self.config.run_id) + json_filename = f"{safe_run_id}_results.json" + + config_dict = { + "run_id": self.config.run_id, + "testcase": self.config.testcase, + "success": 1 if self.result.success_rate >= 0.95 else 0, + "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "config": { + "command": self._build_command_string(), + "framework": self.config.framework.value, + "model": self.config.model, + "model_config": self.config.model_config, + "train_dataset": self.config.train_dataset, + "validation_dataset": self.config.validation_dataset, + "test_dataset": self.config.test_dataset, + "infer_args": self.config.infer_args.to_dict() if hasattr(self.config.infer_args, 'to_dict') else {}, + "warmup_iterations": self.config.warmup_iterations, + "measured_iterations": self.config.measured_iterations + }, + "metrics": [] + } + + # ✅ 1. First add all existing metrics + for metric in self.result.metrics: + config_dict["metrics"].append(metric.to_dict()) + + # ✅ 2. 
Check and add missing metrics + required_metrics = { + "direct": [ + ("infer.peak_memory_usage", "scalar", "GB"), + ("infer.compute_latency", "timeseries", "ms"), + ("infer.ttft", "timeseries", "ms"), + ("infer.direct_throughput_tps", "timeseries", "tokens/s/gpu") + ], + "service": [ + ("infer.accuracy_mmlu", "scalar", None), + ("infer.peak_memory_usage", "scalar", "GB"), + ("infer.e2e_latency", "timeseries", "ms"), + ("infer.ttft", "timeseries", "ms"), + ("infer.response_per_second", "timeseries", None), + ("infer.compute_latency", "timeseries", "ms"), + ("infer.max_throughput_tps", "timeseries", "tokens/s/gpu"), + ("infer.success_rate", "scalar", "%") + ] + } + + mode = self.config.mode.value + if mode in required_metrics: + for metric_name, metric_type, unit in required_metrics[mode]: + # Check if already exists + if not any(m.get('name') == metric_name for m in config_dict['metrics']): + logger.debug(f"Adding missing metric: {metric_name}") + + if metric_name == "infer.peak_memory_usage": + # Special handling: try to get peak memory + peak_memory = self.result.peak_memory_usage + if peak_memory is None: + try: + peak_memory = self.adapter.get_peak_memory_usage() + except Exception as e: + logger.warning(f"Failed to get peak memory usage: {e}") + peak_memory = 0.0 + + # ✅ Ensure .to_dict() is called + config_dict["metrics"].append(ScalarMetric( + name=metric_name, + value=peak_memory, + unit=unit + ).to_dict()) + + elif metric_name == "infer.success_rate": + # Special handling: success rate + success_rate = self.result.success_rate + config_dict["metrics"].append(ScalarMetric( + name=metric_name, + value=success_rate * 100, # Convert to percentage + unit=unit + ).to_dict()) + + elif metric_type == "scalar": + # Other scalar metrics (placeholder) + config_dict["metrics"].append(ScalarMetric( + name=metric_name, + value=0.0, # Placeholder value + unit=unit + ).to_dict()) # ✅ Ensure .to_dict() is called + + elif metric_type == "timeseries": + # Time series metrics (create placeholder file) + file_created = self._ensure_metric_file(metric_name, safe_run_id) + if file_created: + raw_data_url = f"./infer/{safe_run_id}_{metric_name.replace('.', '_')}.csv" + config_dict["metrics"].append({ + "name": metric_name, + "type": "timeseries", + "raw_data_url": raw_data_url, + "unit": unit + }) + + # ✅ 3. Ensure no duplicate metrics + seen_names = set() + unique_metrics = [] + for metric in config_dict['metrics']: + name = metric.get('name') + if name and name not in seen_names: + seen_names.add(name) + unique_metrics.append(metric) + elif name: + logger.warning(f"Duplicate metric found: {name}") + + config_dict['metrics'] = unique_metrics + + # ✅ 4. Print debug information + logger.info(f"Total metrics in JSON: {len(config_dict['metrics'])}") + for metric in config_dict['metrics']: + metric_name = metric.get('name', 'unknown') + metric_type = metric.get('type', 'unknown') + logger.debug(f" - {metric_name} ({metric_type})") + + # ✅ 5. 
Save file + json_file = self.infer_dir / json_filename + with open(json_file, 'w', encoding='utf-8') as f: + json.dump(config_dict, f, indent=2, ensure_ascii=False) + + logger.info(f"Results saved to: {json_file}") + return str(json_file) + + def _ensure_metric_file(self, metric_name: str, safe_run_id: str) -> bool: + """Ensure metric file exists, create placeholder if not""" + if not self.infer_dir: + return False + + # Map metric name to filename + file_mapping = { + "infer.e2e_latency": f"{safe_run_id}_infer_latency.csv", + "infer.compute_latency": f"{safe_run_id}_infer_compute_latency.csv", + "infer.ttft": f"{safe_run_id}_infer_ttft.csv", + "infer.response_per_second": f"{safe_run_id}_infer_throughput.csv", + "infer.max_throughput_tps": f"{safe_run_id}_infer_max_throughput.csv", + "infer.direct_throughput_tps": f"{safe_run_id}_infer_direct_throughput.csv" + } + + if metric_name not in file_mapping: + return False + + filename = file_mapping[metric_name] + file_path = self.infer_dir / filename + + if not file_path.exists(): + try: + with open(file_path, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['timestamp', 'value']) + writer.writerow([0, 0.0]) # Placeholder data + logger.debug(f"Created placeholder file: {filename}") + return True + except Exception as e: + logger.warning(f"Failed to create placeholder file {filename}: {e}") + return False + + return True + + def _create_max_throughput_placeholder(self): + if not self.infer_dir: + return + + safe_run_id = self._sanitize_filename(self.config.run_id) + max_throughput_file = self.infer_dir / f"{safe_run_id}_infer_max_throughput.csv" + + with open(max_throughput_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['timestamp', 'throughput']) + writer.writerow([0, 0.0]) + + def _create_compute_latency_placeholder(self): + if not self.infer_dir: + return + + safe_run_id = self._sanitize_filename(self.config.run_id) + compute_latency_file = self.infer_dir / f"{safe_run_id}_infer_compute_latency.csv" + + with open(compute_latency_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['timestamp', 'latency_ms']) + writer.writerow([0, 0.0]) + + def _build_command_string(self) -> str: + """Build command string""" + cmd_parts = [] + + if self.config.framework.value == "infinilm": + cmd_parts.append("python scripts/jiuge.py --nvidia") + cmd_parts.append(self.config.model_path) + cmd_parts.append(str(self.config.infer_args.parallel.tp)) + + # Add batch size parameter (if jiuge.py supports it) + if self.config.mode.value == "direct": + if hasattr(self.config.infer_args, 'static_batch_size'): + cmd_parts.append(f"--batch-size {self.config.infer_args.static_batch_size}") + + else: # vllm + if self.config.mode.value == "direct": + cmd_parts.append("python -m vllm.benchmarks.benchmark_throughput") + cmd_parts.append(f"--model {self.config.model_path}") + if hasattr(self.config.infer_args, 'static_batch_size'): + cmd_parts.append(f"--batch-size {self.config.infer_args.static_batch_size}") + else: # service + cmd_parts.append("python -m vllm.entrypoints.api_server") + cmd_parts.append(f"--model {self.config.model_path}") + cmd_parts.append(f"--port 8000") + + return " ".join(cmd_parts) + + def _sanitize_filename(self, filename: str) -> str: + """ + Clean filename by removing special characters + + Args: + filename: Original filename + + Returns: + Cleaned filename + """ + # Replace special characters with underscore + sanitized = re.sub(r'[^\w\-_.]', '_', filename) + # Remove consecutive 
underscores + sanitized = re.sub(r'_+', '_', sanitized) + # Remove leading/trailing underscores + sanitized = sanitized.strip('_') + + # Ensure reasonable filename length + max_length = 255 + if len(sanitized) > max_length: + # Keep first 100 chars and last 155 chars (add ... in between) + prefix = sanitized[:100] + suffix = sanitized[-155:] if len(sanitized) > 255 else "" + sanitized = f"{prefix}...{suffix}" + + return sanitized + + def run(self) -> str: + """Run template method - ensure string path is returned""" + logger.info(f"Starting inference benchmark: {self.config.run_id}") + logger.info(f"Testcase: {self.config.testcase}") + logger.info(f"Framework: {self.config.framework.value}") + logger.info(f"Mode: {self.config.mode.value}") + + try: + # Prepare output directory + self.prepare_output_dir() + + # Record start time + self.result.start_time = time.time() + + # Template method: to be implemented by subclasses + self.setup() + self.execute() + self.collect_metrics() + + # Record end time + self.result.end_time = time.time() + + # Save data + self.save_timeseries_data() + + # Output JSON + result_file = self.dump_json() + + # ✅ Double check: ensure string is returned + if not isinstance(result_file, str): + logger.error(f"dump_json() returned {type(result_file)} instead of str") + # Try conversion + try: + result_file = str(result_file) + except: + # If conversion fails, create default file + safe_run_id = self._sanitize_filename(self.config.run_id) + default_file = self.infer_dir / f"{safe_run_id}_emergency_results.json" + result_file = str(default_file) + logger.warning(f"Created emergency result file: {result_file}") + + # Print statistics + self._print_statistics() + + logger.info(f"Benchmark completed successfully: {self.config.run_id}") + + # ✅ Final ensure string return + return str(result_file) + + except Exception as e: + logger.error(f"Benchmark failed: {e}", exc_info=True) + raise + + def _print_statistics(self): + """Print statistical information""" + stats = self.calculate_statistics() + + logger.info("=" * 60) + logger.info("BENCHMARK STATISTICS") + logger.info("=" * 60) + + if 'avg_latency' in stats: + logger.info(f"Average latency: {stats['avg_latency']:.2f} ms") + if 'p95_latency' in stats: + logger.info(f"P95 latency: {stats['p95_latency']:.2f} ms") + + if 'avg_ttft' in stats: + logger.info(f"Average TTFT: {stats['avg_ttft']:.2f} ms") + if 'p95_ttft' in stats: + logger.info(f"P95 TTFT: {stats['p95_ttft']:.2f} ms") + + if 'avg_throughput' in stats: + if self.config.mode.value == "direct": + logger.info(f"Average throughput: {stats['avg_throughput']:.2f} tokens/s/gpu") + else: + logger.info(f"Average throughput: {stats['avg_throughput']:.2f} requests/s") + + if 'success_rate' in stats: + logger.info(f"Success rate: {stats['success_rate']:.2%}") + + if 'total_duration' in stats: + logger.info(f"Total duration: {stats['total_duration']:.2f} s") + + logger.info("=" * 60) + + @abc.abstractmethod + def setup(self) -> None: + """Set up execution environment (to be implemented by subclasses)""" + pass + + @abc.abstractmethod + def execute(self) -> None: + """Execute inference test (to be implemented by subclasses)""" + pass + + @abc.abstractmethod + def collect_metrics(self) -> None: + """Collect performance metrics (to be implemented by subclasses)""" + pass \ No newline at end of file diff --git a/infinimetrics/inference/infer_runner_factory.py b/infinimetrics/inference/infer_runner_factory.py new file mode 100644 index 0000000..fb44743 --- /dev/null +++ 
b/infinimetrics/inference/infer_runner_factory.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +""" +Inference Runner Factory Class +Creates appropriate Runner and Adapter based on configuration +""" + +import logging +import sys +import os +from typing import Dict, Tuple + +from infer_config import InferConfig, InferMode, FrameworkType +from infer_runner_base import InferRunnerBase +from adapter_base import InferAdapter + +logger = logging.getLogger(__name__) + +class InferRunnerFactory: + """Inference Runner Factory""" + + @staticmethod + def create_runner_and_adapter(config: InferConfig) -> Tuple[InferRunnerBase, InferAdapter]: + """ + Create Runner and Adapter + + Returns: (runner, adapter) + """ + # Create adapter based on framework + adapter = InferRunnerFactory.create_adapter(config) + + # Create runner based on mode + runner = InferRunnerFactory.create_runner(config, adapter) + + return runner, adapter + + @staticmethod + def create_adapter(config: InferConfig) -> InferAdapter: + """Create adapter""" + logger.info(f"Creating adapter for framework: {config.framework.value}") + + if config.framework == FrameworkType.INFINILM: + try: + from adapters.infinilm_adapter import InfiniLMAdapter + adapter = InfiniLMAdapter(config) + logger.info("InfiniLMAdapter created successfully") + except ImportError as e: + logger.error(f"Failed to import InfiniLMAdapter: {e}") + raise ImportError("InfiniLMAdapter is not available. Please check if InfiniLM is installed.") + + elif config.framework == FrameworkType.VLLM: + try: + from adapters.vllm_adapter import VLLMAdapter + adapter = VLLMAdapter(config) + logger.info("VLLMAdapter created successfully") + except ImportError as e: + logger.error(f"Failed to import VLLMAdapter: {e}") + raise ImportError("VLLMAdapter is not available. 
Please check if vLLM is installed.") + + else: + raise ValueError(f"Unsupported framework: {config.framework}") + + # Validate adapter configuration + errors = adapter.validate_config() + if errors: + error_msg = "Adapter configuration errors:\n" + "\n".join(f" - {e}" for e in errors) + logger.warning(error_msg) + + return adapter + + @staticmethod + def create_runner(config: InferConfig, adapter: InferAdapter) -> InferRunnerBase: + """Create runner""" + logger.info(f"Creating runner for mode: {config.mode.value}") + + if config.mode == InferMode.DIRECT: + try: + from direct_infer_runner import DirectInferRunner + runner = DirectInferRunner(config, adapter) + logger.info("DirectInferRunner created successfully") + except ImportError as e: + logger.error(f"Failed to import DirectInferRunner: {e}") + raise + + elif config.mode == InferMode.SERVICE: + try: + from service_infer_runner import ServiceInferRunner + runner = ServiceInferRunner(config, adapter) + logger.info("ServiceInferRunner created successfully") + except ImportError as e: + logger.error(f"Failed to import ServiceInferRunner: {e}") + raise + + else: + raise ValueError(f"Unsupported inference mode: {config.mode}") + + return runner + + @staticmethod + def check_dependencies() -> Dict[str, bool]: + """Check dependencies""" + dependencies = { + "infinilm": False, + "vllm": False, + "numpy": False, + "torch": False + } + + # Check InfiniLM + try: + # Try importing InfiniLM related modules + import sys + import os + + # Check if in InfiniLM directory + if os.path.exists("scripts/jiuge.py"): + dependencies["infinilm"] = True + else: + # Try to determine via environment variable + infinilm_path = os.environ.get("INFINILM_PATH", "") + if infinilm_path and os.path.exists(os.path.join(infinilm_path, "scripts/jiuge.py")): + dependencies["infinilm"] = True + except: + pass + + # Check vLLM + try: + import vllm + dependencies["vllm"] = True + except ImportError: + pass + + # Check numpy + try: + import numpy + dependencies["numpy"] = True + except ImportError: + pass + + # Check torch + try: + import torch + dependencies["torch"] = True + except ImportError: + pass + + return dependencies diff --git a/infinimetrics/inference/service_infer_runner.py b/infinimetrics/inference/service_infer_runner.py new file mode 100644 index 0000000..20f37ad --- /dev/null +++ b/infinimetrics/inference/service_infer_runner.py @@ -0,0 +1,438 @@ +#!/usr/bin/env python3 +""" +Service Inference Runner Implementation +Start service and run trace testing +""" + +import asyncio +import logging +import time +import json +from pathlib import Path +from typing import Dict, Any, List, Optional +from utils.gpu_monitor import create_gpu_monitor + +from infer_runner_base import InferRunnerBase, TimeseriesMetric, ScalarMetric +from infer_config import InferConfig, ServiceInferArgs +from utils.trace_client import TraceClient, TraceClientConfig, RequestTrace +from utils.prompt_generator import create_prompt_generator + +logger = logging.getLogger(__name__) + +class ServiceInferRunner(InferRunnerBase): + """Service Inference Runner""" + + def __init__(self, config: InferConfig, adapter): + super().__init__(config, adapter) + self.infer_args: ServiceInferArgs = config.infer_args + + # Trace related + self.traces: List[RequestTrace] = [] + self.trace_stats: Dict[str, Any] = {} + + # Add GPU monitor + self.gpu_monitor = None + + logger.info(f"ServiceInferRunner created for trace: {self.infer_args.request_trace}") + logger.info(f"Concurrency: {self.infer_args.concurrency}") + 
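+        # concurrency caps the number of in-flight requests the trace client keeps
+        # open; max_seq_len is used below in _validate_trace_data() to sanity-check
+        # trace token counts against the model's limit.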
logger.info(f"Max sequence length: {self.infer_args.max_seq_len}") + + def setup(self) -> None: + """Set up service inference environment""" + logger.info("Setting up service inference environment") + + # 1. Create GPU monitor + device_ids = self.config.device.device_ids + if self.config.device.cpu_only: + logger.info("CPU-only mode, GPU monitoring disabled") + self.gpu_monitor = None + else: + self.gpu_monitor = create_gpu_monitor( + gpu_platform=self.config.device.gpu_platform, + device_ids=device_ids + ) + + # 2. Start GPU monitoring + if self.gpu_monitor: + self.gpu_monitor.start_monitoring() + logger.info(f"GPU monitoring started for devices: {device_ids}") + + # 3. Load trace file + self._load_trace_data() + + # 4. Launch inference service + logger.info(f"Launching inference service on port 8000") + + try: + # Use adapter to launch service + self.adapter.launch_service(port=8000) + + # Wait for service readiness + max_wait_time = 120 # Maximum 120 seconds + wait_interval = 3 # Check every 3 seconds + + logger.info("Waiting for service to be ready...") + for i in range(max_wait_time // wait_interval): + if self.adapter.is_service_ready(port=8000): + logger.info("Inference service is ready") + return + + logger.info(f" Waiting... ({i * wait_interval}s elapsed)") + time.sleep(wait_interval) + + raise TimeoutError("Inference service failed to start within timeout") + + except Exception as e: + logger.error(f"Failed to setup service: {e}") + + # Stop GPU monitoring + if self.gpu_monitor: + self.gpu_monitor.stop_monitoring() + + # Ensure service is stopped + try: + self.adapter.stop_service() + except: + pass + + raise + + def execute(self) -> None: + """Execute service inference test""" + logger.info("Executing service inference benchmark") + + # Run asynchronous trace test + asyncio.run(self._run_trace_async()) + + async def _run_trace_async(self): + """Asynchronously run trace test""" + try: + # 1. Create trace client configuration + client_config = TraceClientConfig( + api_url="http://localhost:8000", + model_name=self.config.model, + timeout_ms=self.infer_args.timeout_ms, + warmup_requests=min(10, len(self.traces) // 10) # 10% of requests for warmup + ) + + # 2. Create prompt generator + # First get tokenizer (if adapter has loaded model) + tokenizer = None + if hasattr(self.adapter, 'tokenizer') and self.adapter.tokenizer: + tokenizer = self.adapter.tokenizer + + prompt_generator = create_prompt_generator( + tokenizer=tokenizer, + method="random" # Use random tokens + ) + + # 3. Load trace data + self.traces = TraceClient.load_trace_file( + self.infer_args.request_trace, + prompt_generator + ) + + # 4. 
Use trace client to run test + async with TraceClient(client_config) as client: + # Run trace + processed_traces, stats = await client.run_trace( + traces=self.traces, + concurrency=self.infer_args.concurrency, + warmup_requests=client_config.warmup_requests + ) + + # Save results + self.traces = processed_traces + self.trace_stats = stats + + # Save trace results to CSV + if self.infer_dir: + client.save_results_to_csv( + processed_traces, + self.infer_dir, + self.config.run_id + ) + + except Exception as e: + logger.error(f"Trace test failed: {e}", exc_info=True) + raise + + finally: + # ✅ Stop GPU monitoring + if self.gpu_monitor: + self.gpu_monitor.stop_monitoring() + peak_memory_gb = self.gpu_monitor.get_peak_memory_gb() + logger.info(f"Peak GPU memory usage during test: {peak_memory_gb} GB") + self.result.peak_memory_usage = peak_memory_gb + + def collect_metrics(self) -> None: + """Collect service inference metrics""" + logger.info("Collecting service inference metrics") + + # Extract data from trace statistics + if self.trace_stats: + # TTFT data + ttfts = [] + for trace in self.traces: + if trace.success and trace.ttft is not None: + ttfts.append(trace.ttft) + self.result.add_ttft(trace.ttft) + + # E2E latency data + e2e_latencies = [] + for trace in self.traces: + if trace.success and trace.e2e_latency is not None: + e2e_latencies.append(trace.e2e_latency) + self.result.add_latency(trace.e2e_latency) + + # Throughput data (requests/s) + if self.trace_stats.get('total_duration', 0) > 0: + rps = self.trace_stats.get('requests_per_second', 0) + self.result.add_throughput(rps) + + # Success rate + success_rate = self.trace_stats.get('success_rate', 0) + self.result.success_rate = success_rate + + # Total tokens + total_tokens = self.trace_stats.get('total_tokens', 0) + self.result.total_tokens = total_tokens + + # Add scalar metrics + if 'avg_ttft' in self.trace_stats: + self.result.add_metric(ScalarMetric( + name="infer.avg_ttft", + value=self.trace_stats['avg_ttft'], + unit="ms" + )) + + if 'avg_e2e_latency' in self.trace_stats: + self.result.add_metric(ScalarMetric( + name="infer.avg_e2e_latency", + value=self.trace_stats['avg_e2e_latency'], + unit="ms" + )) + + if 'throughput_tps' in self.trace_stats: + self.result.add_metric(ScalarMetric( + name="infer.avg_throughput_tps", + value=self.trace_stats['throughput_tps'], + unit="tokens/s" + )) + + # Success rate metric + self.result.add_metric(ScalarMetric( + name="infer.success_rate", + value=success_rate * 100, # Convert to percentage + unit="%" + )) + + # Total requests + self.result.add_metric(ScalarMetric( + name="infer.total_requests", + value=self.trace_stats.get('total_requests', 0), + unit="requests" + )) + + # Record peak memory usage (if available) + peak_memory = self.adapter.get_peak_memory_usage() + if peak_memory: + self.result.peak_memory_usage = peak_memory + logger.info(f"Peak GPU memory usage: {peak_memory:.2f} GB") + + else: + logger.warning("No trace statistics available") + + def _load_trace_data(self): + """Load trace data""" + trace_file = self.infer_args.request_trace + + if not Path(trace_file).exists(): + raise FileNotFoundError(f"Trace file not found: {trace_file}") + + logger.info(f"Loading trace data from: {trace_file}") + + # Create temporary prompt generator + from utils.prompt_generator import create_prompt_generator + temp_prompt_generator = create_prompt_generator(method="random") + + # Use trace client's method to load trace file + self.traces = TraceClient.load_trace_file( + trace_file, + 
temp_prompt_generator + ) + + logger.info(f"Loaded {len(self.traces)} requests from trace file") + + # Validate trace data + self._validate_trace_data() + + def _validate_trace_data(self): + """Validate trace data""" + if not self.traces: + raise ValueError("No trace data loaded") + + # Check if maximum tokens exceed model limits + max_input_tokens = max(t.input_token_num for t in self.traces) + max_output_tokens = max(t.output_token_num for t in self.traces) + + if max_input_tokens > self.infer_args.max_seq_len: + logger.warning(f"Max input tokens ({max_input_tokens}) exceeds max_seq_len " + f"({self.infer_args.max_seq_len})") + + if max_output_tokens > self.infer_args.max_seq_len: + logger.warning(f"Max output tokens ({max_output_tokens}) exceeds max_seq_len " + f"({self.infer_args.max_seq_len})") + + # Check timestamp order + timestamps = [t.arrival_timestamp_ms for t in self.traces] + if not all(timestamps[i] <= timestamps[i+1] for i in range(len(timestamps)-1)): + logger.warning("Trace timestamps are not sorted. Sorting now...") + self.traces.sort(key=lambda x: x.arrival_timestamp_ms) + + def dump_json(self) -> str: + """Override dump_json to generate standard format metrics - fixed version""" + if not self.infer_dir: + raise ValueError("Output directory not prepared") + + # Use base class method to generate basic JSON + json_file = super().dump_json() + + # Read base class generated JSON + with open(json_file, 'r', encoding='utf-8') as f: + data = json.load(f) + + # Build standard metrics array + standard_metrics = [] + + # 1. accuracy_mmlu (placeholder) + standard_metrics.append({ + "name": "infer.accuracy_mmlu", + "type": "scalar", + "value": None, # Placeholder, needs actual calculation + "unit": None + }) + + # 2. e2e_latency (timeseries) + latency_file = self.infer_dir / f"{self.config.run_id}_infer_latency.csv" + if latency_file.exists(): + standard_metrics.append({ + "name": "infer.e2e_latency", + "type": "timeseries", + "raw_data_url": f"./infer/{latency_file.name}", + "unit": "ms" + }) + + # 3. ttft (timeseries) + ttft_file = self.infer_dir / f"{self.config.run_id}_infer_ttft.csv" + if ttft_file.exists(): + standard_metrics.append({ + "name": "infer.ttft", + "type": "timeseries", + "raw_data_url": f"./infer/{ttft_file.name}", + "unit": "ms" + }) + + # 4. peak_memory_usage (scalar) - Use GPU monitor to get real data + # Get peak memory usage + peak_memory = None + + if self.gpu_monitor: + try: + peak_memory = self.gpu_monitor.get_peak_memory_gb() + logger.info(f"Real peak GPU memory usage: {peak_memory} GB") + except Exception as e: + logger.warning(f"Failed to get peak memory from GPU monitor: {e}") + peak_memory = 0.0 + else: + logger.warning("GPU monitor not available, using 0.0 GB") + + standard_metrics.append({ + "name": "infer.peak_memory_usage", + "type": "scalar", + "value": peak_memory, + "unit": "GB" + }) + + # 5. response_per_second (timeseries) + response_file = self.infer_dir / f"{self.config.run_id}_infer_throughput.csv" + if response_file.exists(): + standard_metrics.append({ + "name": "infer.response_per_second", + "type": "timeseries", + "raw_data_url": f"./infer/{response_file.name}", + "unit": None + }) + else: + # Add placeholder if no file + standard_metrics.append({ + "name": "infer.response_per_second", + "type": "timeseries", + "raw_data_url": None, + "unit": None + }) + + # 6. 
compute_latency (timeseries - placeholder) + # Service mode may not have compute_latency file, add placeholder + compute_latency_file = self.infer_dir / f"{self.config.run_id}_infer_compute_latency.csv" + if compute_latency_file.exists(): + standard_metrics.append({ + "name": "infer.compute_latency", + "type": "timeseries", + "raw_data_url": f"./infer/{compute_latency_file.name}", + "unit": "ms" + }) + else: + standard_metrics.append({ + "name": "infer.compute_latency", + "type": "timeseries", + "raw_data_url": None, + "unit": "ms" + }) + + # 7. max_throughput_tps (timeseries) + max_throughput_file = self.infer_dir / f"{self.config.run_id}_infer_max_throughput.csv" + if max_throughput_file.exists(): + standard_metrics.append({ + "name": "infer.max_throughput_tps", + "type": "timeseries", + "raw_data_url": f"./infer/{max_throughput_file.name}", + "unit": "tokens/s/gpu" + }) + else: + # Calculate max throughput from throughput data + if hasattr(self.result, 'throughput_data') and self.result.throughput_data: + max_throughput = max(self.result.throughput_data) + standard_metrics.append({ + "name": "infer.max_throughput_tps", + "type": "scalar", + "value": max_throughput, + "unit": "tokens/s/gpu" + }) + else: + standard_metrics.append({ + "name": "infer.max_throughput_tps", + "type": "scalar", + "value": 0.0, + "unit": "tokens/s/gpu" + }) + + # Update metrics in data + data["metrics"] = standard_metrics + + # Save back to file + with open(json_file, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False) + + logger.info(f"Service metrics saved to: {json_file}") + return str(json_file) + + def cleanup(self): + """Clean up resources""" + logger.info("Cleaning up service inference resources") + + # Stop inference service + try: + self.adapter.stop_service() + except Exception as e: + logger.warning(f"Error stopping service: {e}") \ No newline at end of file diff --git a/infinimetrics/inference/utils/gpu_monitor.py b/infinimetrics/inference/utils/gpu_monitor.py new file mode 100644 index 0000000..a7286ca --- /dev/null +++ b/infinimetrics/inference/utils/gpu_monitor.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +""" +GPU Monitoring Module - Revised Version +""" +import subprocess +import threading +import time +from abc import ABC, abstractmethod +import logging + +logger = logging.getLogger(__name__) + +class GPUMonitor(ABC): + """Abstract base class for GPU monitoring""" + + def __init__(self, device_ids=None): + """ + Initialize GPU monitor + + Args: + device_ids: List of GPU device IDs to monitor. If None, monitor all GPUs. 
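+
+        Example (illustrative usage, assuming an NVIDIA platform)::
+
+            monitor = create_gpu_monitor("nvidia", device_ids=[0, 1])
+            monitor.start_monitoring()
+            # ... run the benchmark workload ...
+            monitor.stop_monitoring()
+            peak_gb = monitor.get_peak_memory_gb()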
+ """ + self.device_ids = device_ids + self.peak_memory_mib = 0 # Peak memory usage (MiB) + self.monitor_thread = None + self._stop_monitoring_flag = False # Renamed variable for fix + self.poll_interval = 0.5 # Polling interval (seconds) + + logger.info(f"GPUMonitor initialized for devices: {device_ids}") + + @abstractmethod + def get_current_memory_usage(self): + """Get current GPU memory usage for all devices""" + pass + + def start_monitoring(self): + """Start monitoring GPU memory usage""" + if self.monitor_thread and self.monitor_thread.is_alive(): + logger.warning("Monitoring already started") + return + + self._stop_monitoring_flag = False + self.peak_memory_mib = 0 + + def monitor_loop(): + logger.info("GPU monitoring started") + while not self._stop_monitoring_flag: + try: + current_mem = self.get_current_memory_usage() + if current_mem: + current_peak = max(current_mem) + if current_peak > self.peak_memory_mib: + self.peak_memory_mib = current_peak + logger.debug(f"New peak memory: {self.peak_memory_mib} MiB") + except Exception as e: + logger.debug(f"Error getting GPU memory: {e}") + + time.sleep(self.poll_interval) + + logger.info("GPU monitoring stopped") + + self.monitor_thread = threading.Thread(target=monitor_loop, daemon=True) + self.monitor_thread.start() + + def stop_monitoring(self): + """Stop monitoring GPU memory usage""" + self._stop_monitoring_flag = True + if self.monitor_thread: + self.monitor_thread.join(timeout=5.0) + if self.monitor_thread.is_alive(): + logger.warning("Monitor thread did not stop gracefully") + + def get_peak_memory_gb(self): + """Get peak memory usage in GB""" + return round(self.peak_memory_mib / 1024.0, 6) + + def get_peak_memory_mib(self): + """Get peak memory usage in MiB""" + return self.peak_memory_mib + + def get_peak_memory_per_device(self): + """Get peak memory usage per device""" + total_gb = self.get_peak_memory_gb() + if self.device_ids: + per_device = total_gb / len(self.device_ids) + return {device_id: per_device for device_id in self.device_ids} + else: + return {"all": total_gb} + +class NVIDIAGPUMonitor(GPUMonitor): + """NVIDIA GPU monitor implementation""" + + def get_current_memory_usage(self): + try: + cmd = ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"] + + if self.device_ids: + device_str = ",".join(str(d) for d in self.device_ids) + cmd.extend(["--id", device_str]) + + out = subprocess.check_output( + cmd, + text=True, + stderr=subprocess.DEVNULL + ) + + lines = [l.strip() for l in out.splitlines() if l.strip()] + memory_values = [int(x) for x in lines if x.isdigit()] + + if memory_values: + logger.debug(f"Current GPU memory usage: {memory_values} MiB") + return memory_values + else: + return [] + + except subprocess.CalledProcessError as e: + logger.error(f"nvidia-smi command failed: {e}") + return [] + except FileNotFoundError: + logger.error("nvidia-smi not found. 
Is NVIDIA driver installed?") + return [] + except Exception as e: + logger.error(f"Error getting GPU memory: {e}") + return [] + +class MockGPUMonitor(GPUMonitor): + """Mock GPU monitor (for testing or CPU mode)""" + + def get_current_memory_usage(self): + """Return mock memory usage data""" + import random + if self.device_ids: + return [random.randint(100, 1000) for _ in self.device_ids] + else: + return [random.randint(100, 1000)] + +def create_gpu_monitor(gpu_platform="nvidia", device_ids=None): + """ + Factory function to create GPU monitor + + Args: + gpu_platform: GPU platform name, "nvidia" or others + device_ids: List of device IDs to monitor + + Returns: + Instance of GPUMonitor + """ + platform_lower = gpu_platform.lower() + + if platform_lower == "nvidia": + logger.info(f"Creating NVIDIA GPU monitor for devices: {device_ids}") + return NVIDIAGPUMonitor(device_ids) + elif platform_lower == "mock" or platform_lower == "test": + logger.info(f"Creating Mock GPU monitor for devices: {device_ids}") + return MockGPUMonitor(device_ids) + else: + logger.warning(f"Unsupported GPU platform: {gpu_platform}, using mock monitor") + return MockGPUMonitor(device_ids) diff --git a/infinimetrics/inference/utils/prompt_generator.py b/infinimetrics/inference/utils/prompt_generator.py new file mode 100644 index 0000000..7589bad --- /dev/null +++ b/infinimetrics/inference/utils/prompt_generator.py @@ -0,0 +1,543 @@ +# utils/prompt_generator.py (Extended Version) +#!/usr/bin/env python3 +""" +Prompt Generator +Generate test prompts based on input_token_num +Supports multiple generation methods and configurations +""" + +import random +import string +import json +from pathlib import Path +from typing import Callable, Optional, Dict, Any, List, Union +import logging + +logger = logging.getLogger(__name__) + +# Preset prompt template library +PRESET_TEMPLATES = { + "ai_qa": [ + "Explain the concept of {topic} in simple terms.", + "What are the main applications of {topic} in today's world?", + "Describe the history and development of {topic}.", + "Compare and contrast {topic} with similar technologies.", + "What are the ethical considerations surrounding {topic}?", + "How does {topic} impact our daily lives?", + "What are the future trends in {topic}?", + "What are the key challenges in {topic} research?" + ], + + "general_qa": [ + "Tell me about {topic}.", + "What is {topic}?", + "Can you explain {topic}?", + "I need information about {topic}.", + "Please provide details about {topic}.", + "Help me understand {topic}." + ], + + "technical": [ + "Discuss the technical implementation of {topic}.", + "What are the algorithms used in {topic}?", + "Explain the architecture of {topic} systems.", + "What are the performance considerations for {topic}?", + "Describe the scalability challenges in {topic}." 
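+        # Every template exposes a single {topic} placeholder that generate_prompt()
+        # fills via str.format(topic=...).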
+ ] +} + +# Preset topic library +PRESET_TOPICS = { + "ai_ml": [ + "artificial intelligence", + "machine learning", + "deep learning", + "natural language processing", + "computer vision", + "reinforcement learning", + "neural networks", + "transformers", + "large language models", + "generative AI" + ], + + "tech": [ + "cloud computing", + "blockchain technology", + "quantum computing", + "Internet of Things", + "edge computing", + "distributed systems", + "cybersecurity", + "databases", + "software engineering" + ], + + "science": [ + "climate change", + "genetic engineering", + "space exploration", + "renewable energy", + "quantum physics", + "biotechnology", + "nanotechnology" + ] +} + + +class PromptGenerator: + """Prompt Generator Class (New)""" + + def __init__( + self, + method: str = "template", # template, random, file, fixed + template_name: str = "ai_qa", + topic_name: str = "ai_ml", + fixed_prompt: Optional[str] = None, + prompt_file: Optional[str] = None, + tokenizer = None, + chars_per_token: int = 4 + ): + """ + Initialize prompt generator + """ + self.method = method + self.template_name = template_name + self.topic_name = topic_name + self.fixed_prompt = fixed_prompt + self.prompt_file = prompt_file + self.tokenizer = tokenizer + self.chars_per_token = chars_per_token + + # Load templates and topics + self.templates = PRESET_TEMPLATES.get(template_name, PRESET_TEMPLATES["ai_qa"]) + self.topics = PRESET_TOPICS.get(topic_name, PRESET_TOPICS["ai_ml"]) + + # Load prompts from file (if needed) + self.file_prompts = [] + if method == "file" and prompt_file and Path(prompt_file).exists(): + self._load_prompts_from_file() + + def _load_prompts_from_file(self): + """Load prompts from file""" + try: + with open(self.prompt_file, 'r', encoding='utf-8') as f: + content = f.read().strip() + + if self.prompt_file.endswith('.json'): + data = json.loads(content) + if isinstance(data, list): + self.file_prompts = [str(item) for item in data] + elif isinstance(data, dict): + # Try to extract all string values + for value in data.values(): + if isinstance(value, str): + self.file_prompts.append(value) + elif isinstance(value, list): + self.file_prompts.extend([str(v) for v in value if isinstance(v, str)]) + else: + # Text file, one prompt per line + self.file_prompts = [line.strip() for line in content.split('\n') if line.strip()] + + logger.info(f"Loaded {len(self.file_prompts)} prompts from {self.prompt_file}") + + except Exception as e: + logger.error(f"Failed to load prompts from file: {e}") + self.file_prompts = [] + + def generate(self, token_num: int, prompt_id: int = 0) -> str: + """Generate single prompt (compatible with original interface)""" + return self.generate_prompt(token_num, prompt_id) + + def generate_prompt(self, token_num: int, prompt_id: int = 0) -> str: + """ + Generate single prompt + + Args: + token_num: Required number of tokens + prompt_id: Prompt ID (for generating different prompts) + + Returns: + Prompt text + """ + if self.method == "fixed" and self.fixed_prompt: + # Use fixed prompt + return self._adjust_length(self.fixed_prompt, token_num) + + elif self.method == "file" and self.file_prompts: + # Select prompt from file + prompt_idx = prompt_id % len(self.file_prompts) + base_prompt = self.file_prompts[prompt_idx] + return self._adjust_length(base_prompt, token_num) + + elif self.method == "random": + # Random generation + return self._generate_random_prompt(token_num) + + else: # Default to template method + # Select template and topic + template_idx = 
prompt_id % len(self.templates) + topic_idx = prompt_id % len(self.topics) + + template = self.templates[template_idx] + topic = self.topics[topic_idx] + + # Generate base prompt + base_prompt = template.format(topic=topic) + + # Adjust length + return self._adjust_length(base_prompt, token_num) + + def _adjust_length(self, prompt: str, token_num: int) -> str: + """Adjust prompt length to specified token count""" + if self.tokenizer: + # Use tokenizer for precise control + try: + tokens = self.tokenizer.encode(prompt) + + if len(tokens) >= token_num: + # Truncate + tokens = tokens[:token_num] + return self.tokenizer.decode(tokens, skip_special_tokens=True) + else: + # Repeat until desired length is reached + repeat_count = (token_num + len(tokens) - 1) // len(tokens) + repeated_tokens = tokens * repeat_count + repeated_tokens = repeated_tokens[:token_num] + return self.tokenizer.decode(repeated_tokens, skip_special_tokens=True) + except Exception as e: + logger.warning(f"Tokenizer length adjustment failed: {e}, using fallback") + # Fallback to character-level adjustment + return self._adjust_length_fallback(prompt, token_num) + else: + # Fallback method + return self._adjust_length_fallback(prompt, token_num) + + def _adjust_length_fallback(self, prompt: str, token_num: int) -> str: + """Fallback method: character-level length adjustment""" + estimated_chars = token_num * self.chars_per_token + + if len(prompt) >= estimated_chars: + # Truncate + return prompt[:estimated_chars] + else: + # Repeat until desired length is reached + repeat_count = (estimated_chars + len(prompt) - 1) // len(prompt) + repeated_prompt = prompt * repeat_count + return repeated_prompt[:estimated_chars] + + def _generate_random_prompt(self, token_num: int) -> str: + """Generate random prompt""" + if self.tokenizer: + # Randomly select token IDs + try: + vocab_size = self.tokenizer.vocab_size if hasattr(self.tokenizer, 'vocab_size') else 32000 + token_ids = [random.randint(0, vocab_size-1) for _ in range(token_num)] + return self.tokenizer.decode(token_ids, skip_special_tokens=True) + except Exception as e: + logger.warning(f"Tokenizer random generation failed: {e}, using fallback") + # Fallback to character-level generation + return self._generate_random_text(token_num) + else: + # Fallback method + return self._generate_random_text(token_num) + + def _generate_random_text(self, token_num: int) -> str: + """Generate random text (fallback method)""" + total_chars = token_num * self.chars_per_token + + # Use letters, digits, and common punctuation + chars = string.ascii_letters + string.digits + ' .,!?;:\n-' + random_text = ''.join(random.choices(chars, k=total_chars)) + + return random_text + + def generate_prompts(self, num_prompts: int, token_num: int) -> List[str]: + """ + Generate multiple prompts + + Args: + num_prompts: Number of prompts to generate + token_num: Token count for each prompt + + Returns: + List of prompts + """ + prompts = [] + for i in range(num_prompts): + prompt = self.generate_prompt(token_num, i) + + # Add unique identifier + unique_suffix = f" [Request {i+1}:{self._generate_unique_suffix()}]" + + # Ensure total length doesn't exceed token limit after adding suffix + if self.tokenizer: + try: + base_tokens = self.tokenizer.encode(prompt) + suffix_tokens = self.tokenizer.encode(unique_suffix) + + if len(base_tokens) + len(suffix_tokens) <= token_num: + prompt += unique_suffix + else: + # Adjust base prompt length to accommodate suffix + adjusted_token_num = token_num - len(suffix_tokens) + if 
adjusted_token_num > 0: + prompt = self.generate_prompt(adjusted_token_num, i) + unique_suffix + except Exception as e: + logger.warning(f"Tokenizer suffix adjustment failed: {e}") + # Simple append without precise control + prompt += unique_suffix + else: + # Simple estimation + suffix_chars = len(unique_suffix) + base_chars = token_num * self.chars_per_token - suffix_chars + if base_chars > 0: + prompt = prompt[:base_chars] + unique_suffix + + prompts.append(prompt) + + logger.info(f"Generated {len(prompts)} prompts using method: {self.method}") + if prompts: + logger.debug(f"First prompt preview: {prompts[0][:100]}...") + + return prompts + + def _generate_unique_suffix(self) -> str: + """Generate unique identifier suffix""" + return ''.join(random.choices(string.ascii_letters + string.digits, k=6)) + + +# ==================== Original Functions (Maintaining Compatibility) ==================== + +def create_prompt_generator( + tokenizer=None, + method: str = "random", + **kwargs +) -> Callable[[int], str]: + """ + Create prompt generator function (compatible with original interface) + + New recommended usage: Create PromptGenerator class instance directly + + Args: + tokenizer: Optional tokenizer for precise token counting + method: Generation method, "random" or "template" + **kwargs: Additional parameters passed to specific generator + + Returns: + Function: (token_num) -> prompt_text + """ + logger.warning("Using deprecated create_prompt_generator function. " + "Consider using PromptGenerator class directly.") + + if method == "random": + return _create_random_prompt_generator(tokenizer, **kwargs) + elif method == "template": + return _create_template_prompt_generator(**kwargs) + else: + raise ValueError(f"Unknown prompt generation method: {method}") + + +def _create_random_prompt_generator( + tokenizer=None, + vocab_size: int = 32000, + exclude_special_tokens: bool = True, + **kwargs +) -> Callable[[int], str]: + """Create random token generator (compatible with original interface)""" + + # Extract parameters from kwargs + chars_per_token = kwargs.get('chars_per_token', 4) + + def generate_random_prompt(token_num: int) -> str: + """Generate random prompt with specified token count""" + nonlocal tokenizer, vocab_size, exclude_special_tokens, chars_per_token + + if tokenizer is not None: + # Use tokenizer for precise token control + # Generate random token ID sequence + try: + if exclude_special_tokens and hasattr(tokenizer, 'special_tokens_map'): + # Get special tokens + special_tokens = tokenizer.special_tokens_map.values() + special_token_ids = set(tokenizer.convert_tokens_to_ids(special_tokens)) + + # Generate non-special tokens + valid_token_ids = [] + for token_id in range(vocab_size): + if token_id not in special_token_ids: + valid_token_ids.append(token_id) + + if valid_token_ids: + token_ids = random.choices(valid_token_ids, k=token_num) + else: + token_ids = random.choices(list(range(vocab_size)), k=token_num) + else: + # Simple random token ID selection + token_ids = random.choices(list(range(vocab_size)), k=token_num) + + # Decode to text + prompt = tokenizer.decode(token_ids, skip_special_tokens=True) + return prompt + + except Exception as e: + # If tokenizer method fails, fall back to random text generation + logger.warning(f"Tokenizer-based prompt generation failed: {e}. 
" + f"Falling back to random text generation.") + tokenizer = None # Mark tokenizer as unavailable + + # Method 1: No tokenizer, generate random text (estimate token count) + total_chars = token_num * chars_per_token + + # Generate random text + chars = string.ascii_letters + string.digits + ' .,!?;:\n' + random_text = ''.join(random.choices(chars, k=total_chars)) + + return random_text + + return generate_random_prompt + + +def _create_template_prompt_generator(**kwargs) -> Callable[[int], str]: + """Create template prompt generator (compatible with original interface)""" + templates = kwargs.get('templates', [ + "Explain the concept of artificial intelligence in simple terms. ", + "What are the main applications of machine learning in today's world? ", + "Describe the history and development of deep learning. ", + "Compare and contrast natural language processing with computer vision. ", + ]) + + def generate_template_prompt(token_num: int) -> str: + """Generate prompt using template""" + # Select a template + template = random.choice(templates) + + # Repeat template until desired length is reached + estimated_template_tokens = 15 + repeat_count = max(1, token_num // estimated_template_tokens) + + prompt = template * repeat_count + + # Truncate to approximate length (simple handling) + estimated_chars = token_num * 4 # Assume average 4 characters per token + if len(prompt) > estimated_chars: + prompt = prompt[:estimated_chars] + + return prompt + + return generate_template_prompt + + +# ==================== New Helper Functions ==================== + +def create_prompt_generator_from_config( + config: Dict[str, Any], + tokenizer=None +) -> PromptGenerator: + """ + Create PromptGenerator instance from configuration (recommended new way) + + Args: + config: Configuration dictionary containing prompt-related settings + tokenizer: Optional tokenizer + + Returns: + PromptGenerator instance + """ + # Extract configuration parameters + prompt_config = config.get("prompt_config", {}) + + return PromptGenerator( + method=prompt_config.get("method", "template"), + template_name=prompt_config.get("template_name", "ai_qa"), + topic_name=prompt_config.get("topic_name", "ai_ml"), + fixed_prompt=prompt_config.get("fixed_prompt"), + prompt_file=prompt_config.get("prompt_file"), + tokenizer=tokenizer, + chars_per_token=prompt_config.get("chars_per_token", 4) + ) + + +def load_prompts_from_file(file_path: str) -> List[str]: + """Load prompt list from file (general function)""" + if not Path(file_path).exists(): + logger.error(f"Prompt file not found: {file_path}") + return [] + + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read().strip() + + if file_path.endswith('.json'): + data = json.loads(content) + if isinstance(data, list): + return [str(item) for item in data] + elif isinstance(data, dict): + # Try to extract all string values + prompts = [] + for value in data.values(): + if isinstance(value, str): + prompts.append(value) + elif isinstance(value, list): + prompts.extend([str(v) for v in value if isinstance(v, str)]) + return prompts + else: + # Text file, one prompt per line + return [line.strip() for line in content.split('\n') if line.strip()] + + except Exception as e: + logger.error(f"Failed to load prompts from file {file_path}: {e}") + + return [] + + +# ==================== Test Functions ==================== + +def test_prompt_generator(): + """Test prompt generator""" + print("Testing prompt generators...") + + # Test original functions (maintain compatibility) 
+ print("\n1. Testing legacy functions:") + + # Test random generator (no tokenizer) + print("\n Random generator (no tokenizer):") + random_gen = create_prompt_generator(method="random", chars_per_token=3) + prompt1 = random_gen(10) # Generate prompt with 10 tokens + print(f" Generated prompt (10 tokens): {prompt1[:50]}...") + + # Test template generator + print("\n Template generator:") + template_gen = create_prompt_generator(method="template") + prompt2 = template_gen(20) # Generate prompt with 20 tokens + print(f" Generated prompt (20 tokens): {prompt2[:50]}...") + + # Test new PromptGenerator class + print("\n2. Testing new PromptGenerator class:") + + config = { + "prompt_config": { + "method": "template", + "template_name": "ai_qa", + "topic_name": "ai_ml", + "chars_per_token": 4 + } + } + + generator = create_prompt_generator_from_config(config) + + # Generate single prompt + single_prompt = generator.generate(15) + print(f" Single prompt (15 tokens): {single_prompt[:50]}...") + + # Generate multiple prompts + prompts = generator.generate_prompts(3, 10) + print(f" Generated {len(prompts)} prompts:") + for i, prompt in enumerate(prompts): + print(f" Prompt {i+1}: {prompt[:50]}...") + + print("\nPrompt generators test completed.") + + +if __name__ == "__main__": + test_prompt_generator() \ No newline at end of file diff --git a/infinimetrics/inference/utils/token_generator.py b/infinimetrics/inference/utils/token_generator.py new file mode 100644 index 0000000..13c443a --- /dev/null +++ b/infinimetrics/inference/utils/token_generator.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 +""" +Random Token Generation Utility +Used for performance testing to generate random token sequences +""" +import random +import logging +from typing import Dict, Any, List, Set, Optional +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + +@dataclass +class TokenGeneratorConfig: + """Configuration for token generator""" + exclude_special_tokens: bool = True + min_token_id: int = 0 + max_token_id: Optional[int] = None + custom_excluded_ids: Set[int] = None + + def __post_init__(self): + if self.custom_excluded_ids is None: + self.custom_excluded_ids = set() + +class TokenGenerator: + """Random token generator""" + + def __init__(self, tokenizer, config: Optional[TokenGeneratorConfig] = None): + self.tokenizer = tokenizer + self.config = config or TokenGeneratorConfig() + + # Initialize token information + self._init_token_info() + + def _init_token_info(self): + """Initialize token information""" + # Get vocabulary size + if hasattr(self.tokenizer, 'vocab_size'): + self.vocab_size = self.tokenizer.vocab_size + elif hasattr(self.tokenizer, 'get_vocab_size'): + self.vocab_size = self.tokenizer.get_vocab_size() + else: + # Try to get via len + try: + self.vocab_size = len(self.tokenizer) + except: + self.vocab_size = 32000 # Default value + logger.warning(f"Cannot determine vocab_size, using default: {self.vocab_size}") + + # Set maximum token ID + if self.config.max_token_id is None: + self.config.max_token_id = self.vocab_size - 1 + + # Get special token IDs + self.special_token_ids = self._get_special_token_ids() + + # Calculate valid token range + self.valid_token_ids = self._get_valid_token_ids() + + logger.info(f"TokenGenerator initialized: vocab_size={self.vocab_size}, " + f"valid_tokens={len(self.valid_token_ids)}, " + f"special_tokens={len(self.special_token_ids)}") + + def _get_special_token_ids(self) -> Set[int]: + """Get set of special token IDs""" + special_ids = set() + + 
# Get special tokens from tokenizer
+        special_tokens_map = getattr(self.tokenizer, 'special_tokens_map', {})
+
+        # Process special token mappings
+        for token in special_tokens_map.values():
+            token_id = self._convert_to_token_id(token)
+            if token_id is not None:
+                special_ids.add(token_id)
+
+        # Check common special token attributes
+        common_special_attrs = [
+            'bos_token', 'eos_token', 'pad_token', 'unk_token',
+            'sep_token', 'cls_token', 'mask_token'
+        ]
+
+        for attr_name in common_special_attrs:
+            token = getattr(self.tokenizer, attr_name, None)
+            if token is not None:
+                token_id = self._convert_to_token_id(token)
+                if token_id is not None:
+                    special_ids.add(token_id)
+
+        # Add custom excluded tokens
+        special_ids.update(self.config.custom_excluded_ids)
+
+        return special_ids
+
+    def _convert_to_token_id(self, token) -> Optional[int]:
+        """Convert token to ID"""
+        if isinstance(token, int):
+            return token
+        elif isinstance(token, str):
+            try:
+                return self.tokenizer.convert_tokens_to_ids(token)
+            except Exception:
+                return None
+        elif hasattr(token, 'content'):  # Handle special token objects
+            try:
+                return self.tokenizer.convert_tokens_to_ids(token.content)
+            except Exception:
+                return None
+        return None
+
+    def _get_valid_token_ids(self) -> List[int]:
+        """Get list of valid token IDs (excluding special tokens)"""
+        if not self.config.exclude_special_tokens:
+            # If not excluding special tokens, return all tokens
+            return list(range(self.config.min_token_id, self.config.max_token_id + 1))
+
+        # Exclude special tokens
+        all_ids = set(range(self.config.min_token_id, self.config.max_token_id + 1))
+        valid_ids = sorted(list(all_ids - self.special_token_ids))
+
+        if not valid_ids:
+            logger.warning("No valid tokens after exclusion, using all tokens")
+            valid_ids = list(range(self.config.min_token_id, self.config.max_token_id + 1))
+
+        return valid_ids
+
+    def generate_tokens(self, num_tokens: int) -> List[int]:
+        """
+        Generate random token sequence
+
+        Args:
+            num_tokens: Number of tokens to generate
+
+        Returns:
+            List of token IDs
+        """
+        if not self.valid_token_ids:
+            raise ValueError("No valid tokens available for generation")
+
+        tokens = random.choices(self.valid_token_ids, k=num_tokens)
+
+        logger.debug(f"Generated {num_tokens} random tokens from {len(self.valid_token_ids)} valid tokens")
+        return tokens
+
+    def generate_token_batch(self, batch_size: int, tokens_per_sample: int) -> List[List[int]]:
+        """
+        Generate batch of token sequences
+
+        Args:
+            batch_size: Batch size
+            tokens_per_sample: Number of tokens per sample
+
+        Returns:
+            List of token ID batches
+        """
+        batch = []
+        for _ in range(batch_size):
+            tokens = self.generate_tokens(tokens_per_sample)
+            batch.append(tokens)
+
+        logger.info(f"Generated {batch_size} samples, {tokens_per_sample} tokens each")
+        return batch
+
+    def tokens_to_text(self, tokens: List[int]) -> str:
+        """Convert token IDs to text"""
+        return self.tokenizer.decode(tokens, skip_special_tokens=True)
+
+    def batch_to_text(self, batch_tokens: List[List[int]]) -> List[str]:
+        """Batch convert token IDs to text"""
+        texts = []
+        for tokens in batch_tokens:
+            text = self.tokens_to_text(tokens)
+            texts.append(text)
+        return texts
+
+    def get_token_info(self) -> Dict[str, Any]:
+        """Get token information statistics"""
+        return {
+            "vocab_size": self.vocab_size,
+            "valid_token_count": len(self.valid_token_ids),
+            "special_token_count": len(self.special_token_ids),
+            "min_token_id": min(self.valid_token_ids) if self.valid_token_ids else 0,
+            "max_token_id": 
max(self.valid_token_ids) if self.valid_token_ids else 0, + "exclude_special": self.config.exclude_special_tokens + } + +def create_token_generator(tokenizer, **kwargs): + """ + Convenience function to create token generator + + Args: + tokenizer: Tokenizer object + **kwargs: Parameters passed to TokenGeneratorConfig + + Returns: + TokenGenerator instance + """ + config = TokenGeneratorConfig(**kwargs) + return TokenGenerator(tokenizer, config) + +# Unit tests +if __name__ == "__main__": + # Mock a simple tokenizer for testing + class MockTokenizer: + vocab_size = 50000 + special_tokens_map = { + "bos_token": 0, + "eos_token": 1, + "pad_token": 2, + "unk_token": 3 + } + + def convert_tokens_to_ids(self, token): + if isinstance(token, str): + # Simple mapping + return hash(token) % 100 + return token + + def decode(self, tokens, skip_special_tokens=True): + return f"Decoded text with {len(tokens)} tokens" + + # Test token generator + tokenizer = MockTokenizer() + generator = TokenGenerator(tokenizer) + + print("Token Info:", generator.get_token_info()) + + # Generate some tokens + tokens = generator.generate_tokens(10) + print(f"Generated tokens: {tokens}") + print(f"Decoded: {generator.tokens_to_text(tokens)}") + + # Generate batch + batch = generator.generate_token_batch(3, 5) + print(f"Generated batch: {batch}") \ No newline at end of file diff --git a/infinimetrics/inference/utils/trace_client.py b/infinimetrics/inference/utils/trace_client.py new file mode 100644 index 0000000..be4a181 --- /dev/null +++ b/infinimetrics/inference/utils/trace_client.py @@ -0,0 +1,527 @@ +# utils/trace_client.py +#!/usr/bin/env python3 +""" +Trace client implementation +Reads requests from a trace file and sends them to the inference service according to time intervals. 
+"""
+
+import asyncio
+import aiohttp
+import csv
+import json
+import time
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Any
+from dataclasses import dataclass
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class RequestTrace:
+    """Single request entry in the trace file"""
+    request_id: str
+    arrival_timestamp_ms: float # timestamp in milliseconds (relative or absolute)
+    input_token_num: int
+    output_token_num: int
+
+    # runtime computed fields
+    actual_prompt: Optional[str] = None
+    start_time: Optional[float] = None
+    ttft: Optional[float] = None # Time To First Token (ms)
+    e2e_latency: Optional[float] = None # End-to-end latency (ms)
+    total_tokens: int = 0
+    success: bool = False
+    error: Optional[str] = None
+
+@dataclass
+class TraceClientConfig:
+    """Trace client configuration"""
+    api_url: str
+    model_name: str
+    timeout_ms: int = 30000
+    max_retries: int = 3
+    warmup_requests: int = 10
+
+class TraceClient:
+    """Trace client"""
+
+    def __init__(self, config: TraceClientConfig):
+        self.config = config
+        self.session: Optional[aiohttp.ClientSession] = None
+        self.request_counter = 0
+        self.semaphore: Optional[asyncio.Semaphore] = None
+
+        # performance statistics
+        self.total_requests = 0
+        self.successful_requests = 0
+        self.failed_requests = 0
+
+    async def __aenter__(self):
+        """Async context manager entry"""
+        timeout = aiohttp.ClientTimeout(total=self.config.timeout_ms/1000 + 10)
+        self.session = aiohttp.ClientSession(timeout=timeout)
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit"""
+        if self.session:
+            await self.session.close()
+
+    @staticmethod
+    def load_trace_file(trace_file: str, prompt_generator) -> List[RequestTrace]:
+        """
+        Load trace file
+
+        Args:
+            trace_file: Path to the trace file
+            prompt_generator: Function to generate actual prompts
+
+        Returns:
+            List of request traces
+        """
+        traces = []
+
+        try:
+            with open(trace_file, 'r', encoding='utf-8') as f:
+                # Auto-detect format: explicit extension first, then content sniffing
+                first_line = f.readline().strip()
+                f.seek(0)
+
+                if trace_file.endswith('.csv') or (not first_line.startswith(('[', '{')) and ',' in first_line):
+                    # CSV format
+                    reader = csv.DictReader(f)
+                    for row in reader:
+                        # support variations of column names
+                        request_id = row.get('request_id') or row.get('RequestID') or f"req-{len(traces):04d}"
+
+                        # parse timestamp
+                        timestamp_str = row.get('arrival_timestamp_ms') or row.get('timestamp') or '0'
+                        try:
+                            arrival_timestamp_ms = float(timestamp_str)
+                        except ValueError:
+                            logger.warning(f"Invalid timestamp for {request_id}: {timestamp_str}, using 0")
+                            arrival_timestamp_ms = 0
+
+                        # parse token counts
+                        try:
+                            input_token_num = int(row.get('input_token_num') or row.get('input_tokens') or 128)
+                            output_token_num = int(row.get('output_token_num') or row.get('output_tokens') or 128)
+                        except ValueError:
+                            logger.warning(f"Invalid token numbers for {request_id}, using defaults")
+                            input_token_num = 128
+                            output_token_num = 128
+
+                        # generate prompt content
+                        actual_prompt = prompt_generator(input_token_num)
+
+                        trace = RequestTrace(
+                            request_id=request_id,
+                            arrival_timestamp_ms=arrival_timestamp_ms,
+                            input_token_num=input_token_num,
+                            output_token_num=output_token_num,
+                            actual_prompt=actual_prompt
+                        )
+                        traces.append(trace)
+
+                elif trace_file.endswith('.json') or first_line.startswith('[') or first_line.startswith('{'):
+                    # JSON format
+                    data = json.load(f)
+
+                    if isinstance(data, dict):
+                        data = [data]
+
+                    for i, item in enumerate(data):
+                        request_id = 
item.get('request_id') or f"req-{i:04d}" + + arrival_timestamp_ms = float(item.get('arrival_timestamp_ms', 0)) + input_token_num = int(item.get('input_token_num', 128)) + output_token_num = int(item.get('output_token_num', 128)) + + actual_prompt = prompt_generator(input_token_num) + + trace = RequestTrace( + request_id=request_id, + arrival_timestamp_ms=arrival_timestamp_ms, + input_token_num=input_token_num, + output_token_num=output_token_num, + actual_prompt=actual_prompt + ) + traces.append(trace) + + else: + raise ValueError(f"Unsupported trace file format: {trace_file}") + + logger.info(f"Loaded {len(traces)} requests from trace file: {trace_file}") + + # sort by timestamp + traces.sort(key=lambda x: x.arrival_timestamp_ms) + + # print debug info for first few entries + if traces: + logger.info(f"First request: ID={traces[0].request_id}, " + f"time={traces[0].arrival_timestamp_ms}ms, " + f"input={traces[0].input_token_num}, " + f"output={traces[0].output_token_num}") + if len(traces) > 1: + logger.info(f"Last request: ID={traces[-1].request_id}, " + f"time={traces[-1].arrival_timestamp_ms}ms") + + return traces + + except Exception as e: + logger.error(f"Failed to load trace file {trace_file}: {e}") + raise + + async def send_request( + self, + trace: RequestTrace, + semaphore: asyncio.Semaphore + ) -> RequestTrace: + """ + Send a single request and record metrics + + Args: + trace: Request trace + semaphore: Concurrency control semaphore + + Returns: + Updated request trace with performance metrics + """ + async with semaphore: + self.total_requests += 1 + request_start = time.perf_counter() + ttft = None + total_tokens = 0 + + try: + # Simulate request arrival timing (assuming timestamps are relative) + if trace.arrival_timestamp_ms > 0: + await asyncio.sleep(trace.arrival_timestamp_ms / 1000) + + # record start time + trace.start_time = time.perf_counter() + + # construct request payload + payload = { + "model": self.config.model_name, + "messages": [ + { + "role": "user", + "content": trace.actual_prompt + } + ], + "max_tokens": trace.output_token_num, + "temperature": 0.7, + "top_p": 0.9, + "stream": True # use streaming to measure TTFT + } + + logger.debug(f"Sending request {trace.request_id}: " + f"input={trace.input_token_num}, " + f"output={trace.output_token_num}") + + # send request + async with self.session.post( + f"{self.config.api_url}/chat/completions", + json=payload, + headers={"Content-Type": "application/json"} + ) as response: + + if response.status != 200: + error_msg = f"HTTP {response.status}: {await response.text()}" + trace.error = error_msg + trace.success = False + self.failed_requests += 1 + + logger.error(f"Request {trace.request_id} failed: {error_msg}") + return trace + + # process streaming response + first_token_received = False + first_token_time = None + + async for line in response.content: + if line: + line = line.decode('utf-8').strip() + if line.startswith('data: '): + data_str = line[6:] + + if data_str == '[DONE]': + break + + try: + data = json.loads(data_str) + if 'choices' in data and len(data['choices']) > 0: + choice = data['choices'][0] + + # check for content + if 'delta' in choice and 'content' in choice['delta']: + content = choice['delta']['content'] + + if not first_token_received: + first_token_time = time.perf_counter() + ttft = (first_token_time - trace.start_time) * 1000 + first_token_received = True + logger.debug(f"First token received for {trace.request_id}: " + f"TTFT={ttft:.2f}ms") + + # count tokens (rough estimate) + if 
content.strip(): + total_tokens += 1 + + # check finish condition + if choice.get('finish_reason'): + break + + except json.JSONDecodeError: + logger.warning(f"Invalid JSON in stream: {line}") + continue + + # compute e2e latency + e2e_latency = (time.perf_counter() - trace.start_time) * 1000 + + # update trace + trace.ttft = ttft if ttft else e2e_latency + trace.e2e_latency = e2e_latency + trace.total_tokens = total_tokens + trace.success = True + + self.successful_requests += 1 + + logger.debug(f"Request {trace.request_id} completed: " + f"TTFT={trace.ttft:.2f}ms, " + f"E2E={trace.e2e_latency:.2f}ms, " + f"tokens={total_tokens}") + + except asyncio.TimeoutError: + error_msg = f"Timeout after {self.config.timeout_ms}ms" + trace.error = error_msg + trace.success = False + self.failed_requests += 1 + logger.error(f"Request {trace.request_id} timeout: {error_msg}") + + except Exception as e: + error_msg = str(e) + trace.error = error_msg + trace.success = False + self.failed_requests += 1 + logger.error(f"Request {trace.request_id} failed: {error_msg}") + + return trace + + async def run_trace( + self, + traces: List[RequestTrace], + concurrency: int = 32, + warmup_requests: int = 10 + ) -> Tuple[List[RequestTrace], Dict[str, Any]]: + """ + Run all requests in the trace + + Args: + traces: List of request traces + concurrency: Max concurrency + warmup_requests: Number of warmup requests + + Returns: + (updated traces, statistics dictionary) + """ + # reset counters + self.total_requests = 0 + self.successful_requests = 0 + self.failed_requests = 0 + + logger.info(f"Starting trace run: {len(traces)} requests, concurrency={concurrency}") + + # warmup phase + if warmup_requests > 0: + logger.info(f"Warmup phase: {warmup_requests} requests") + + warmup_traces = traces[:min(warmup_requests, len(traces))] + semaphore = asyncio.Semaphore(concurrency) + warmup_tasks = [] + + for trace in warmup_traces: + # Warm-up requests do not calculate arrival interval + warmup_trace = RequestTrace( + request_id=f"warmup-{trace.request_id}", + arrival_timestamp_ms=0, + input_token_num=trace.input_token_num, + output_token_num=min(10, trace.output_token_num), # Warm-up generates fewer tokens + actual_prompt=trace.actual_prompt + ) + + task = self.send_request(warmup_trace, semaphore) + warmup_tasks.append(task) + + warmup_results = await asyncio.gather(*warmup_tasks, return_exceptions=True) + + # Handle exceptions + for i, result in enumerate(warmup_results): + if isinstance(result, Exception): + logger.warning(f"Warmup request {i} failed: {result}") + + logger.info("Warmup completed") + + # Measurement phase + logger.info(f"Measurement phase: {len(traces)} requests") + + # Create semaphore to control concurrency + self.semaphore = asyncio.Semaphore(concurrency) + + # Record start time + test_start_time = time.perf_counter() + + # Send all requests + tasks = [] + for trace in traces: + task = self.send_request(trace, self.semaphore) + tasks.append(task) + + # Wait for all requests to complete + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Record end time + test_end_time = time.perf_counter() + total_test_duration = test_end_time - test_start_time + + # Process results + processed_traces = [] + for i, result in enumerate(results): + if isinstance(result, Exception): + logger.error(f"Request {i} raised exception: {result}") + if i < len(traces): + failed_trace = traces[i] + failed_trace.error = str(result) + failed_trace.success = False + processed_traces.append(failed_trace) + else: + 
processed_traces.append(result) + + # Calculate statistics + stats = self._calculate_statistics(processed_traces, total_test_duration) + + # Log summary + logger.info(f"Trace run completed: " + f"{stats['success_rate']:.2%} success rate, " + f"{stats['avg_ttft']:.2f}ms avg TTFT, " + f"{stats['avg_e2e_latency']:.2f}ms avg E2E latency") + + return processed_traces, stats + + def _calculate_statistics( + self, + traces: List[RequestTrace], + total_duration: float + ) -> Dict[str, Any]: + stats = {} + + # Filter successful requests + successful_traces = [t for t in traces if t.success] + + if not successful_traces: + logger.warning("No successful requests in trace run") + return stats + + # Basic statistics + stats['total_requests'] = len(traces) + stats['successful_requests'] = len(successful_traces) + stats['failed_requests'] = len(traces) - len(successful_traces) + stats['success_rate'] = len(successful_traces) / len(traces) if traces else 0 + + # Time statistics + stats['total_duration'] = total_duration + stats['requests_per_second'] = len(traces) / total_duration if total_duration > 0 else 0 + + # TTFT statistics + ttfts = [t.ttft for t in successful_traces if t.ttft is not None] + if ttfts: + stats['avg_ttft'] = np.mean(ttfts) + stats['p50_ttft'] = np.percentile(ttfts, 50) + stats['p95_ttft'] = np.percentile(ttfts, 95) + stats['p99_ttft'] = np.percentile(ttfts, 99) + stats['min_ttft'] = np.min(ttfts) + stats['max_ttft'] = np.max(ttfts) + stats['std_ttft'] = np.std(ttfts) + + # E2E latency statistics + e2e_latencies = [t.e2e_latency for t in successful_traces if t.e2e_latency is not None] + if e2e_latencies: + stats['avg_e2e_latency'] = np.mean(e2e_latencies) + stats['p50_e2e_latency'] = np.percentile(e2e_latencies, 50) + stats['p95_e2e_latency'] = np.percentile(e2e_latencies, 95) + stats['p99_e2e_latency'] = np.percentile(e2e_latencies, 99) + stats['min_e2e_latency'] = np.min(e2e_latencies) + stats['max_e2e_latency'] = np.max(e2e_latencies) + stats['std_e2e_latency'] = np.std(e2e_latencies) + + # Token statistics + total_tokens = sum(t.total_tokens for t in successful_traces) + stats['total_tokens'] = total_tokens + stats['avg_tokens_per_request'] = total_tokens / len(successful_traces) if successful_traces else 0 + + # Throughput + if total_duration > 0: + stats['throughput_tps'] = total_tokens / total_duration # tokens per second + + # Input/output token statistics + input_tokens = sum(t.input_token_num for t in successful_traces) + output_tokens = sum(t.output_token_num for t in successful_traces) + stats['total_input_tokens'] = input_tokens + stats['total_output_tokens'] = output_tokens + + return stats + + def save_results_to_csv( + self, + traces: List[RequestTrace], + output_dir: Path, + run_id: str + ): + # Save detailed results + detailed_file = output_dir / f"{run_id}_trace_detailed.csv" + with open(detailed_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'request_id', 'arrival_timestamp_ms', 'input_token_num', + 'output_token_num', 'ttft_ms', 'e2e_latency_ms', + 'total_tokens', 'success', 'error' + ]) + + for trace in traces: + writer.writerow([ + trace.request_id, + trace.arrival_timestamp_ms, + trace.input_token_num, + trace.output_token_num, + trace.ttft if trace.ttft is not None else '', + trace.e2e_latency if trace.e2e_latency is not None else '', + trace.total_tokens, + trace.success, + trace.error or '' + ]) + + # Save time-series files (for plotting) + if traces: + # TTFT time series + ttft_file = output_dir / f"{run_id}_trace_ttft.csv" + 
with open(ttft_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['request_index', 'ttft_ms']) + for i, trace in enumerate(traces): + if trace.ttft is not None: + writer.writerow([i, trace.ttft]) + + # E2E latency time series + latency_file = output_dir / f"{run_id}_trace_latency.csv" + with open(latency_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['request_index', 'e2e_latency_ms']) + for i, trace in enumerate(traces): + if trace.e2e_latency is not None: + writer.writerow([i, trace.e2e_latency]) + + logger.info(f"Trace results saved to {output_dir}") + return detailed_file From 225c458560fe73843876315d1834328cb00942a9 Mon Sep 17 00:00:00 2001 From: sunjqa1 Date: Thu, 11 Dec 2025 15:51:52 +0800 Subject: [PATCH 2/6] fix: correct service command string in _build_command_string --- .../configs/real_infinilm_service_small.json | 38 ++++++++++++++ .../configs/test_direct_infinilm.json | 36 ++++++++++++++ .../configs/test_direct_infinilm_real.json | 39 +++++++++++++++ .../configs/test_direct_infinilm_v2.json | 41 ++++++++++++++++ infinimetrics/inference/infer_runner_base.py | 49 ++++++++++++++++--- 5 files changed, 195 insertions(+), 8 deletions(-) create mode 100644 infinimetrics/inference/configs/real_infinilm_service_small.json create mode 100644 infinimetrics/inference/configs/test_direct_infinilm.json create mode 100644 infinimetrics/inference/configs/test_direct_infinilm_real.json create mode 100644 infinimetrics/inference/configs/test_direct_infinilm_v2.json diff --git a/infinimetrics/inference/configs/real_infinilm_service_small.json b/infinimetrics/inference/configs/real_infinilm_service_small.json new file mode 100644 index 0000000..0252565 --- /dev/null +++ b/infinimetrics/inference/configs/real_infinilm_service_small.json @@ -0,0 +1,38 @@ +{ + "run_id": "my.custom.runid", + "testcase": "infer.InfiniLM.Service", + "config": { + "model": "Qwen3-1.7B", + "model_path":"/var/qy_home/sunjinge/models/Qwen3-1.7B", + "model_config": "/var/qy_home/sunjinge/models/Qwen3-1.7B", + + "device": { + "gpu_platform": "nvidia", + "device_ids": [0], + "cpu_only": false + }, + + "train_dataset": null, + "validation_dataset": null, + "test_dataset": null, + "output_dir": "./test_output_service", + + "infer_args": { + "parallel": { + "dp": 1, + "tp": 1, + "pp": 1, + "sp": 1 + }, + "request_trace": "./test_trace.csv", + "concurrency": 4, + "max_seq_len": 2048, + "stream": true, + "timeout_ms": 30000 + }, + + "timeout_ms": 60000, + "warmup_iterations": 10, + "measured_iterations": 50 + } +} diff --git a/infinimetrics/inference/configs/test_direct_infinilm.json b/infinimetrics/inference/configs/test_direct_infinilm.json new file mode 100644 index 0000000..63f8222 --- /dev/null +++ b/infinimetrics/inference/configs/test_direct_infinilm.json @@ -0,0 +1,36 @@ +{ + "config": { + "run_id": "infer.infinilm.direct.test.2024120601", + "testcase": "infer.InfiniLM.Direct", + "model": "Qwen3-1.7B", + "model_path": "/home/sunjinge/model/Qwen3-1.7B", + "model_config": "/home/sunjinge/model/Qwen3-1.7B/config.json", + "device": { + "gpu_platform": "nvidia", + "device_ids": [0], + "cpu_only": false + }, + "train_dataset": null, + "validation_dataset": null, + "test_dataset": null, + "output_dir": "./test_output", + "infer_args": { + "parallel": { + "dp": 1, + "tp": 1, + "pp": 1, + "sp": 1 + }, + "static_batch_size": 1, + "prompt_token_num": 100, + "output_token_num": 50, + "max_seq_len": 2048, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50 + }, + "timeout_ms": 30000, + 
"warmup_iterations": 2, + "measured_iterations": 5 + } +} diff --git a/infinimetrics/inference/configs/test_direct_infinilm_real.json b/infinimetrics/inference/configs/test_direct_infinilm_real.json new file mode 100644 index 0000000..2c575f5 --- /dev/null +++ b/infinimetrics/inference/configs/test_direct_infinilm_real.json @@ -0,0 +1,39 @@ +{ + "run_id": "my.custom.runid", + "testcase": "infer.InfiniLM.Direct", + "config": { + "model": "Qwen3-1.7B", + "model_path": "/var/qy_home/sunjinge/models/Qwen3-1.7B", + "model_config": "/var/qy_home/sunjinge/models/Qwen3-1.7B", + + "device": { + "gpu_platform": "nvidia", + "device_ids": [0], + "cpu_only": false + }, + + "train_dataset": null, + "validation_dataset": null, + "test_dataset": "./test_perplexity_data.json", + "output_dir": "./test_output_real", + + "infer_args": { + "parallel": { + "dp": 1, + "tp": 1, + "pp": 1, + "sp": 1 + }, + "static_batch_size": 4, + "prompt_token_num": 128, + "output_token_num": 128, + "max_seq_len": 512, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50 + }, + + "warmup_iterations": 1, + "measured_iterations": 2 + } +} diff --git a/infinimetrics/inference/configs/test_direct_infinilm_v2.json b/infinimetrics/inference/configs/test_direct_infinilm_v2.json new file mode 100644 index 0000000..50bdbce --- /dev/null +++ b/infinimetrics/inference/configs/test_direct_infinilm_v2.json @@ -0,0 +1,41 @@ +{ + "config": { + "run_id": "infer.infinilm.direct.test.v2", + "testcase": "infer.InfiniLM.Direct", + + "model": "Qwen3-1.7B", + "model_path": "/home/sunjinge/model/Qwen3-1.7B", + "model_config": "/home/sunjinge/model/Qwen3-1.7B/config.json", + + "device": { + "gpu_platform": "nvidia", + "device_ids": [0], + "cpu_only": false + }, + + "train_dataset": null, + "validation_dataset": null, + "test_dataset": null, + "output_dir": "./test_output", + + "infer_args": { + "parallel": { + "dp": 1, + "tp": 1, + "pp": 1, + "sp": 1 + }, + "static_batch_size": 1, + "prompt_token_num": 100, + "output_token_num": 50, + "max_seq_len": 2048, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50 + }, + + "timeout_ms": 30000, + "warmup_iterations": 2, + "measured_iterations": 5 + } +} diff --git a/infinimetrics/inference/infer_runner_base.py b/infinimetrics/inference/infer_runner_base.py index 219d272..3999b7e 100644 --- a/infinimetrics/inference/infer_runner_base.py +++ b/infinimetrics/inference/infer_runner_base.py @@ -432,15 +432,39 @@ def _build_command_string(self) -> str: cmd_parts = [] if self.config.framework.value == "infinilm": - cmd_parts.append("python scripts/jiuge.py --nvidia") - cmd_parts.append(self.config.model_path) - cmd_parts.append(str(self.config.infer_args.parallel.tp)) - - # Add batch size parameter (if jiuge.py supports it) if self.config.mode.value == "direct": - if hasattr(self.config.infer_args, 'static_batch_size'): - cmd_parts.append(f"--batch-size {self.config.infer_args.static_batch_size}") + # Direct mode: use jiuge.py for direct inference + cmd_parts.append("python scripts/jiuge.py --nvidia") + cmd_parts.append(self.config.model_path) + cmd_parts.append(str(self.config.infer_args.parallel.tp)) + + # Add batch size parameter (if jiuge.py supports it) + if self.config.mode.value == "direct": + if hasattr(self.config.infer_args, 'static_batch_size'): + cmd_parts.append(f"--batch-size {self.config.infer_args.static_batch_size}") + else: # service mode + # Service mode: use launch_server.py to start inference service + cmd_parts.append("python scripts/launch_server.py") + cmd_parts.append(f"--model-path 
{self.config.model_path}") + cmd_parts.append(f"--dev nvidia") + cmd_parts.append(f"--ndev {self.config.infer_args.parallel.tp}") + + # Add common arguments + if hasattr(self.config.infer_args, 'max_batch'): + cmd_parts.append(f"--max-batch {self.config.infer_args.max_batch}") + elif hasattr(self.config.infer_args, 'max_seq_len'): + cmd_parts.append(f"--max-tokens {self.config.infer_args.max_seq_len}") + + # Add trace testing command (if trace is provided) + if hasattr(self.config.infer_args, 'request_trace'): + trace_cmd = ( + f"# Trace test: python trace_client.py " + f"--trace {self.config.infer_args.request_trace} " + f"--concurrency {self.config.infer_args.concurrency}" + ) + cmd_parts.append(trace_cmd) + else: # vllm if self.config.mode.value == "direct": cmd_parts.append("python -m vllm.benchmarks.benchmark_throughput") @@ -451,6 +475,15 @@ def _build_command_string(self) -> str: cmd_parts.append("python -m vllm.entrypoints.api_server") cmd_parts.append(f"--model {self.config.model_path}") cmd_parts.append(f"--port 8000") + cmd_parts.append(f"--tensor-parallel-size {self.config.infer_args.parallel.tp}") + + # Add trace serving command + if hasattr(self.config.infer_args, 'request_trace'): + trace_cmd = ( + f"# Trace test: python -m vllm.benchmarks.benchmark_serving " + f"--trace {self.config.infer_args.request_trace}" + ) + cmd_parts.append(trace_cmd) return " ".join(cmd_parts) @@ -579,4 +612,4 @@ def execute(self) -> None: @abc.abstractmethod def collect_metrics(self) -> None: """Collect performance metrics (to be implemented by subclasses)""" - pass \ No newline at end of file + pass From 01b71993dd1b9ff87bfc420d619af927fb9ca912 Mon Sep 17 00:00:00 2001 From: sunjinge <94775646+zzhfz@users.noreply.github.com> Date: Thu, 11 Dec 2025 16:09:17 +0800 Subject: [PATCH 3/6] Delete infinimetrics/inference/configs/real_infinilm_service_small.json --- .../configs/real_infinilm_service_small.json | 38 ------------------- 1 file changed, 38 deletions(-) delete mode 100644 infinimetrics/inference/configs/real_infinilm_service_small.json diff --git a/infinimetrics/inference/configs/real_infinilm_service_small.json b/infinimetrics/inference/configs/real_infinilm_service_small.json deleted file mode 100644 index 0252565..0000000 --- a/infinimetrics/inference/configs/real_infinilm_service_small.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "run_id": "my.custom.runid", - "testcase": "infer.InfiniLM.Service", - "config": { - "model": "Qwen3-1.7B", - "model_path":"/var/qy_home/sunjinge/models/Qwen3-1.7B", - "model_config": "/var/qy_home/sunjinge/models/Qwen3-1.7B", - - "device": { - "gpu_platform": "nvidia", - "device_ids": [0], - "cpu_only": false - }, - - "train_dataset": null, - "validation_dataset": null, - "test_dataset": null, - "output_dir": "./test_output_service", - - "infer_args": { - "parallel": { - "dp": 1, - "tp": 1, - "pp": 1, - "sp": 1 - }, - "request_trace": "./test_trace.csv", - "concurrency": 4, - "max_seq_len": 2048, - "stream": true, - "timeout_ms": 30000 - }, - - "timeout_ms": 60000, - "warmup_iterations": 10, - "measured_iterations": 50 - } -} From cae31d143a35f1df63418337fe340a6dd8fd864b Mon Sep 17 00:00:00 2001 From: sunjinge <94775646+zzhfz@users.noreply.github.com> Date: Thu, 11 Dec 2025 16:09:59 +0800 Subject: [PATCH 4/6] Delete infinimetrics/inference/configs/test_direct_infinilm_real.json --- .../configs/test_direct_infinilm_real.json | 39 ------------------- 1 file changed, 39 deletions(-) delete mode 100644 infinimetrics/inference/configs/test_direct_infinilm_real.json diff 
--git a/infinimetrics/inference/configs/test_direct_infinilm_real.json b/infinimetrics/inference/configs/test_direct_infinilm_real.json deleted file mode 100644 index 2c575f5..0000000 --- a/infinimetrics/inference/configs/test_direct_infinilm_real.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "run_id": "my.custom.runid", - "testcase": "infer.InfiniLM.Direct", - "config": { - "model": "Qwen3-1.7B", - "model_path": "/var/qy_home/sunjinge/models/Qwen3-1.7B", - "model_config": "/var/qy_home/sunjinge/models/Qwen3-1.7B", - - "device": { - "gpu_platform": "nvidia", - "device_ids": [0], - "cpu_only": false - }, - - "train_dataset": null, - "validation_dataset": null, - "test_dataset": "./test_perplexity_data.json", - "output_dir": "./test_output_real", - - "infer_args": { - "parallel": { - "dp": 1, - "tp": 1, - "pp": 1, - "sp": 1 - }, - "static_batch_size": 4, - "prompt_token_num": 128, - "output_token_num": 128, - "max_seq_len": 512, - "temperature": 0.7, - "top_p": 0.9, - "top_k": 50 - }, - - "warmup_iterations": 1, - "measured_iterations": 2 - } -} From 9d011b8c81175d1a35e1fe6f8766f637bac04425 Mon Sep 17 00:00:00 2001 From: sunjinge <94775646+zzhfz@users.noreply.github.com> Date: Thu, 11 Dec 2025 16:10:19 +0800 Subject: [PATCH 5/6] Delete infinimetrics/inference/configs/test_direct_infinilm_v2.json --- .../configs/test_direct_infinilm_v2.json | 41 ------------------- 1 file changed, 41 deletions(-) delete mode 100644 infinimetrics/inference/configs/test_direct_infinilm_v2.json diff --git a/infinimetrics/inference/configs/test_direct_infinilm_v2.json b/infinimetrics/inference/configs/test_direct_infinilm_v2.json deleted file mode 100644 index 50bdbce..0000000 --- a/infinimetrics/inference/configs/test_direct_infinilm_v2.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "config": { - "run_id": "infer.infinilm.direct.test.v2", - "testcase": "infer.InfiniLM.Direct", - - "model": "Qwen3-1.7B", - "model_path": "/home/sunjinge/model/Qwen3-1.7B", - "model_config": "/home/sunjinge/model/Qwen3-1.7B/config.json", - - "device": { - "gpu_platform": "nvidia", - "device_ids": [0], - "cpu_only": false - }, - - "train_dataset": null, - "validation_dataset": null, - "test_dataset": null, - "output_dir": "./test_output", - - "infer_args": { - "parallel": { - "dp": 1, - "tp": 1, - "pp": 1, - "sp": 1 - }, - "static_batch_size": 1, - "prompt_token_num": 100, - "output_token_num": 50, - "max_seq_len": 2048, - "temperature": 0.7, - "top_p": 0.9, - "top_k": 50 - }, - - "timeout_ms": 30000, - "warmup_iterations": 2, - "measured_iterations": 5 - } -} From 266abdfa1ffe25e4fe3e86ea61e7ad3578689a6e Mon Sep 17 00:00:00 2001 From: sunjinge <94775646+zzhfz@users.noreply.github.com> Date: Thu, 11 Dec 2025 16:10:37 +0800 Subject: [PATCH 6/6] Delete infinimetrics/inference/configs/test_direct_infinilm.json --- .../configs/test_direct_infinilm.json | 36 ------------------- 1 file changed, 36 deletions(-) delete mode 100644 infinimetrics/inference/configs/test_direct_infinilm.json diff --git a/infinimetrics/inference/configs/test_direct_infinilm.json b/infinimetrics/inference/configs/test_direct_infinilm.json deleted file mode 100644 index 63f8222..0000000 --- a/infinimetrics/inference/configs/test_direct_infinilm.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "config": { - "run_id": "infer.infinilm.direct.test.2024120601", - "testcase": "infer.InfiniLM.Direct", - "model": "Qwen3-1.7B", - "model_path": "/home/sunjinge/model/Qwen3-1.7B", - "model_config": "/home/sunjinge/model/Qwen3-1.7B/config.json", - "device": { - "gpu_platform": "nvidia", - 
"device_ids": [0], - "cpu_only": false - }, - "train_dataset": null, - "validation_dataset": null, - "test_dataset": null, - "output_dir": "./test_output", - "infer_args": { - "parallel": { - "dp": 1, - "tp": 1, - "pp": 1, - "sp": 1 - }, - "static_batch_size": 1, - "prompt_token_num": 100, - "output_token_num": 50, - "max_seq_len": 2048, - "temperature": 0.7, - "top_p": 0.9, - "top_k": 50 - }, - "timeout_ms": 30000, - "warmup_iterations": 2, - "measured_iterations": 5 - } -}