diff --git a/modelopt/torch/puzzletron/build_library_and_stats.py b/modelopt/torch/puzzletron/build_library_and_stats.py index 5f04f6049..31cebdf6b 100644 --- a/modelopt/torch/puzzletron/build_library_and_stats.py +++ b/modelopt/torch/puzzletron/build_library_and_stats.py @@ -14,7 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Unified command that runs build_replacement_library followed by calc_subblock_stats. +""" +Unified command that runs build_replacement_library followed by calc_subblock_stats. This script combines the functionality of both commands into a single workflow: 1. First, it builds the replacement library for the puzzle @@ -28,17 +29,21 @@ all the same configuration parameters for both build_replacement_library and calc_subblock_stats. """ +import hydra from omegaconf import DictConfig from modelopt.torch.puzzletron.replacement_library.build_replacement_library import ( launch_build_replacement_library, ) from modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats import launch_calc_subblock_stats +from modelopt.torch.puzzletron.tools.hydra_utils import register_hydra_resolvers from modelopt.torch.puzzletron.tools.logger import mprint +from modelopt.torch.puzzletron.utils.parsing import format_global_config def launch_build_library_and_stats(cfg: DictConfig) -> None: - """Launch both build_replacement_library and calc_subblock_stats in sequence. + """ + Launch both build_replacement_library and calc_subblock_stats in sequence. Args: cfg: Hydra configuration containing settings for both commands diff --git a/modelopt/torch/puzzletron/puzzletron.py b/modelopt/torch/puzzletron/puzzletron.py index 94a1de57e..87d90fdd9 100644 --- a/modelopt/torch/puzzletron/puzzletron.py +++ b/modelopt/torch/puzzletron/puzzletron.py @@ -62,10 +62,10 @@ def puzzletron( pruning_ckpts.launch_prune_ckpt(hydra_cfg) dist.barrier() - # # Step 4: build_library_and_stats (single process) - # if dist.is_master(): - # build_library_and_stats.launch_build_library_and_stats(hydra_cfg) - # dist.barrier() + # Step 4: build_library_and_stats (single process) + if dist.is_master(): + build_library_and_stats.launch_build_library_and_stats(hydra_cfg) + dist.barrier() # # Step 5: calc_one_block_scores (distributed processing) # scoring.launch_scoring(hydra_cfg) diff --git a/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py b/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py index 1618aceaf..aec10e03b 100644 --- a/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py +++ b/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py @@ -12,17 +12,33 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""This module constructs the replacement library JSON files from a puzzle directory containing +""" +This module constructs the replacement library JSON files from a puzzle directory containing multiple trained model checkpoints. It analyzes checkpoints to extract unique block and subblock configurations, builds a library of available replacements, and generates solutions for layer replacement in compressed models. The resulting replacement library can then be used by ReplacementLibrary to efficiently load models with mixed teacher/student layers. + +Standard Puzzle Usage: +====================== +python -m modelopt.torch.puzzletron.replacement_library.build_replacement_library PUZZLE_DIR + +Teacher checkpoint dir is assumed to be inside PUZZLE_DIR/ckpts/teacher (symlink is recommended) +though you can supply an explicit --teacher_checkpoint_dir. + +--add_ffn_no_ops and --add_attention_no_ops are optional (default True), + + +Untrained puzzle run (with bypass): +=================================== +The subblock that doesn't interest you in the checkpoint should be no_op. + """ # mypy: ignore-errors import json from pathlib import Path -from typing import Any +from typing import Any, Type import pandas as pd from omegaconf import DictConfig @@ -57,7 +73,8 @@ def build_replacement_library( add_ffn_no_ops: bool = True, add_attention_no_ops: bool = True, ) -> None: - """For normal puzzle runs, use default values. + """ + For normal puzzle runs, use default values. For advanced use cases, see the Usage section. """ master_puzzle_dir = Path(master_puzzle_dir) @@ -90,7 +107,9 @@ def build_replacement_library( def launch_build_replacement_library(cfg: DictConfig) -> None: - """Launch the build replacement library function with Hydra configuration.""" + """ + Launch the build replacement library function with Hydra configuration. + """ mprint(f"Building replacement library for puzzle directory: {cfg.puzzle_dir}") mprint(f"Teacher directory: {cfg.teacher_dir}") mprint( @@ -113,8 +132,8 @@ def infer_teacher_dir( teacher_checkpoint_dir = Path(master_puzzle_dir) / CHECKPOINTS_DIR_NAME / "teacher" if not teacher_checkpoint_dir.exists(): raise ValueError( - "You must either provide the --teacher_checkpoint_dir argument, or create a link to the " - "teacher dir under '{PUZZLE_DIR}/ckpts'." + f"You must either provide the --teacher_checkpoint_dir argument, or create a link to the " + f"teacher dir under '{{PUZZLE_DIR}}/ckpts'." ) teacher_checkpoint_dir = Path(teacher_checkpoint_dir).resolve().absolute() return teacher_checkpoint_dir @@ -362,7 +381,7 @@ def _add_no_op_subblock_rows( def _get_rows_with_no_op_subblock( subblocks_df: pd.DataFrame, no_op_subblock: str -) -> tuple[pd.DataFrame, type[AttentionConfig] | type[FFNConfig]]: +) -> tuple[pd.DataFrame, Type[AttentionConfig] | Type[FFNConfig]]: other_subblock = "ffn" if no_op_subblock == "attention" else "attention" subblock_cls = AttentionConfig if no_op_subblock == "attention" else FFNConfig no_op_subblock_config = subblock_cls(no_op=True) diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py index 2e8630bc9..88081d177 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py @@ -189,7 +189,7 @@ def calculate_attention_memory( ): seq_len = min(seq_len, attention_chunk_size) - kv_dim = calculate_kv_dim(attention_config.n_heads_in_group, n_head, n_embd) + kv_dim = calculate_kv_dim(attention_config.num_key_value_heads, n_head, n_embd) total_num_tokens = seq_len * (batch_size + prefill_queue_size) kv_cache_size = total_num_tokens * kv_dim query_prefill_size = seq_len * n_embd if allocate_prefill_query else 0 @@ -208,7 +208,7 @@ def calculate_attention_params( n_embd: int, n_head: int, ) -> int: - kv_dim = calculate_kv_dim(attention_config.n_heads_in_group, n_head, n_embd) + kv_dim = calculate_kv_dim(attention_config.num_key_value_heads, n_head, n_embd) return ( n_embd * n_embd * 2 # Wq + Wo + n_embd * kv_dim # Wk + Wv diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index 07597eb5c..2db0bc391 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -19,11 +19,10 @@ import dataclasses import json import os -from collections.abc import Iterable from functools import partial from itertools import product from pathlib import Path -from typing import TypeVar +from typing import Iterable, Optional, Type, TypeVar os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" @@ -33,6 +32,10 @@ from omegaconf import DictConfig, ListConfig, OmegaConf from tqdm import tqdm +from modelopt.torch.puzzletron.anymodel.model_descriptor import ( + ModelDescriptor, + ModelDescriptorFactory, +) from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( AttentionConfig, BlockConfig, @@ -56,6 +59,15 @@ # Type variable for dataclasses T_DataClass = TypeVar("T_DataClass") +""" +Usage: +python -m modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --benchmark_iterations 1000 ] + +--benchmark_iterations=None (the default) means that the code won't use infery to benchmark runtime, + only memory stats will be calculated. If you want to benchmark runtime, run inside an infery-llm docker. + +""" + def calculate_subblock_stats( calc_subblock_stats_config: DictConfig, @@ -69,7 +81,7 @@ def calculate_subblock_stats( n_embd: int, n_head: int, vocab_size: int, - benchmark_iterations: int | None, + benchmark_iterations: Optional[int], use_cuda_graph: bool, weights_dtype: torch.dtype, activations_dtype: torch.dtype, @@ -181,6 +193,7 @@ def calculate_subblock_stats( ) if is_calc_runtime: + pass # TODO: fix # from puzzle_tools.calc_subblock_runtime import measure_non_block_runtime_ms # non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = \ @@ -206,17 +219,21 @@ def calculate_subblock_stats( def launch_calc_subblock_stats(cfg: DictConfig) -> None: - """Launch the calc subblock stats function with Hydra configuration.""" + """ + Launch the calc subblock stats function with Hydra configuration. + """ mprint(f"Calculating subblock stats for puzzle directory: {cfg.puzzle_dir}") mprint(f"Teacher directory: {cfg.teacher_dir}") mprint( f"Calc subblock stats config: {format_global_config(cfg.calc_subblock_stats, title='Calc subblock stats')}" ) + descriptor = ModelDescriptorFactory.get(cfg.descriptor) calculate_subblock_stats_for_puzzle_dir( cfg.calc_subblock_stats, master_puzzle_dir=cfg.puzzle_dir, teacher_dir=cfg.teacher_dir, + descriptor=descriptor, model_hidden_sizes=cfg.calc_subblock_stats.get("model_hidden_sizes", OmegaConf.create([])), ffn_hidden_sizes=cfg.calc_subblock_stats.get("ffn_hidden_sizes", OmegaConf.create([])), batch_sizes=cfg.calc_subblock_stats.batch_sizes, @@ -224,7 +241,7 @@ def launch_calc_subblock_stats(cfg: DictConfig) -> None: generation_seq_len=cfg.calc_subblock_stats.generation_seq_len, num_active_tokens_override=cfg.calc_subblock_stats.get("num_active_tokens_override", None), prefill_queue_size=cfg.calc_subblock_stats.prefill_queue_size, - allocate_prefill_query=cfg.calc_subblock_stats.allocate_prefill_query, + allocate_prefill_query=cfg.calc_subblock_stats.get("allocate_prefill_query", False), benchmark_iterations=cfg.calc_subblock_stats.get("benchmark_iterations", None), merge_with_existing_stats=cfg.calc_subblock_stats.merge_with_existing_stats, subblock_stats_filename=cfg.calc_subblock_stats.subblock_stats_filename, @@ -236,6 +253,7 @@ def calculate_subblock_stats_for_puzzle_dir( calc_subblock_stats_config: DictConfig, master_puzzle_dir: Path | str, teacher_dir: Path | str, + descriptor: Type[ModelDescriptor], model_hidden_sizes: ListConfig, ffn_hidden_sizes: ListConfig, batch_sizes: Iterable[int] = (1, 8, 16, 32, 64, 128, 256), @@ -268,6 +286,8 @@ def calculate_subblock_stats_for_puzzle_dir( Path(teacher_dir) if teacher_dir is not None else master_puzzle_dir / "ckpts" / "teacher" ) model_config = load_model_config(teacher_dir) + # Get language model config for LM-specific attributes (VL models have nested config) + lm_config = descriptor.get_language_model_config(model_config) subblock_configs = _load_subblock_configs(master_puzzle_dir, ffn_hidden_sizes, model_config) subblock_stats_file = master_puzzle_dir / subblock_stats_filename @@ -299,7 +319,7 @@ def calculate_subblock_stats_for_puzzle_dir( ] model_hidden_sizes = model_hidden_sizes + [ - model_config.hidden_size + lm_config.hidden_size ] # add a teacher model hidden size for batch_size, ( weights_dtype, @@ -323,8 +343,8 @@ def calculate_subblock_stats_for_puzzle_dir( generation_seq_len=generation_seq_len, prefill_queue_size=prefill_queue_size, n_embd=model_hidden_size, - n_head=model_config.num_attention_heads, - vocab_size=model_config.vocab_size, + n_head=lm_config.num_attention_heads, + vocab_size=lm_config.vocab_size, benchmark_iterations=curr_benchmark_iterations, use_cuda_graph=True, weights_dtype=weights_dtype, @@ -445,7 +465,7 @@ def _load_subblock_configs_from_replacement_library( return subblock_configs -T_DataClass: TypeVar = type[dataclasses.dataclass] +T_DataClass: TypeVar = Type[dataclasses.dataclass] def _dataclass_from_dict( @@ -483,7 +503,7 @@ def add_int8_runtime_estimates(subblock_stats: list[dict]) -> None: if (subblock_config := curr_subblock.get("subblock_config")) is not None: if hasattr(subblock_config, "__dataclass_fields__"): subblock_config = dataclasses.asdict(subblock_config) - is_attention = subblock_config.get("n_heads_in_group", None) is not None + is_attention = subblock_config.get("num_key_value_heads", None) is not None runtime_factor = attention_factor if is_attention else ffn_factor for stat_name, stat_value in bf16_subblock.items(): if "runtime" in stat_name: @@ -512,7 +532,10 @@ def _find_corresponding_bf16_stats(args: dict, subblock_stats: list[dict]) -> di stats for stats in subblock_stats if all( - [stats["args"][key] == corresponding_bf16_args[key] for key in corresponding_bf16_args] + [ + stats["args"][key] == corresponding_bf16_args[key] + for key in corresponding_bf16_args.keys() + ] ) ] if len(matching_bf16_stats) == 0: diff --git a/modelopt/torch/puzzletron/utils/utils.py b/modelopt/torch/puzzletron/utils/utils.py index d56aab0bd..77a13609a 100644 --- a/modelopt/torch/puzzletron/utils/utils.py +++ b/modelopt/torch/puzzletron/utils/utils.py @@ -28,24 +28,21 @@ ) -def calculate_kv_dim(n_heads_in_group: int, n_head: int, n_embd: int) -> int: +def calculate_kv_dim(num_key_value_heads: int, n_head: int, n_embd: int) -> int: """Calculate the key-value dimension for grouped-query attention. - TODO: Consider a better place for this function. - Args: - n_heads_in_group: Number of attention heads per key-value group. + num_key_value_heads: Number of key-value heads. n_head: Total number of attention heads. n_embd: Embedding dimension. Returns: - Combined dimension for key and value tensors (2 * n_kv_heads * head_size). + Combined dimension for key and value tensors (2 * num_key_value_heads * head_size). """ - if n_heads_in_group is None: + if num_key_value_heads is None: return 0 - n_kv_heads = n_head // n_heads_in_group head_size = n_embd // n_head - kv_dim = 2 * n_kv_heads * head_size + kv_dim = 2 * num_key_value_heads * head_size return kv_dim @@ -53,7 +50,6 @@ def raise_unknown_subblock_config_error(subblock_config: Any) -> None: """Raise an error for invalid subblock configuration types. TODO: Consider a better place for this function. - Args: subblock_config: The invalid subblock configuration object. @@ -69,7 +65,6 @@ def sizeof_dtype(dtype: torch.dtype) -> int | float: """Return the size in bytes of the given data type. TODO: Consider a better place for this function. - Args: dtype: PyTorch data type or custom type string (e.g., 'nvfp4'). @@ -125,10 +120,10 @@ def solution_to_str(block_configs: list[dict[str, Any] | BlockConfig]) -> str: def block_config_to_str(block_config: BlockConfig | dict[str, Any] | None) -> str | None: - """Convert a BlockConfig to a human-readable string representation. + """ + Convert a BlockConfig to a human-readable string representation. TODO: Consider a better place for this function. - Args: block_config: BlockConfig dataclass or dict containing attention and ffn configs. @@ -153,7 +148,6 @@ def subblock_config_to_str( """Convert a subblock config (FFN, Attention, Mamba, or MoE) to string. TODO: Consider a better place for this function. - Args: subblock_config: FFNConfig, AttentionConfig dataclass or dict. subblock_name: Name of subblock ('ffn', 'attention', 'mamba', 'moe'). @@ -161,7 +155,7 @@ def subblock_config_to_str( Returns: Formatted string showing subblock type and key parameters (e.g., intermediate_size, - n_heads_in_group), or None if input is None. + num_key_value_heads), or None if input is None. """ if subblock_config is None: return None @@ -194,8 +188,8 @@ def subblock_config_to_str( intermediate_size = subblock_config["intermediate_size"] rep += f" intermediate_{intermediate_size}".ljust(8) elif subblock_name == "attention": - n_heads_in_group = subblock_config["n_heads_in_group"] - rep += f" gqa_{n_heads_in_group}".ljust(8) + num_key_value_heads = subblock_config["num_key_value_heads"] + rep += f" kv_heads_{num_key_value_heads}".ljust(8) elif subblock_name == "mamba": mamba_num_heads = subblock_config["mamba"]["num_heads"] mamba_head_dim = subblock_config["mamba"]["head_dim"] @@ -216,7 +210,8 @@ def subblock_config_to_str( class EmptyInitOnDevice(torch.overrides.TorchFunctionMode): def __init__(self, device=None, dtype=None): - """Create tensors with given device and dtype and don't run initialization + """ + Create tensors with given device and dtype and don't run initialization (but instead use "empty tensors", i.e. uninitialized memory). device: `torch.device` to work with @@ -225,8 +220,8 @@ def __init__(self, device=None, dtype=None): Example:: with EmptyInitOnDevice("cuda", dtype=torch.bfloat16): model = LLaMA(model_config) - model.load_state_dict(torch.load("llama-lit/7B/lit-llama.pth")) - """ + model.load_state_dict(torch.load("llama-lit/7B/lit-llama.pth"))""" + self.device = device self.dtype = dtype