Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions modelopt/torch/puzzletron/build_library_and_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unified command that runs build_replacement_library followed by calc_subblock_stats.
"""
Unified command that runs build_replacement_library followed by calc_subblock_stats.

This script combines the functionality of both commands into a single workflow:
1. First, it builds the replacement library for the puzzle
Expand All @@ -28,17 +29,21 @@
all the same configuration parameters for both build_replacement_library and calc_subblock_stats.
"""

import hydra
from omegaconf import DictConfig

from modelopt.torch.puzzletron.replacement_library.build_replacement_library import (
launch_build_replacement_library,
)
from modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats import launch_calc_subblock_stats
from modelopt.torch.puzzletron.tools.hydra_utils import register_hydra_resolvers
from modelopt.torch.puzzletron.tools.logger import mprint
from modelopt.torch.puzzletron.utils.parsing import format_global_config


def launch_build_library_and_stats(cfg: DictConfig) -> None:
"""Launch both build_replacement_library and calc_subblock_stats in sequence.
"""
Launch both build_replacement_library and calc_subblock_stats in sequence.

Args:
cfg: Hydra configuration containing settings for both commands
Expand Down
8 changes: 4 additions & 4 deletions modelopt/torch/puzzletron/puzzletron.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ def puzzletron(
pruning_ckpts.launch_prune_ckpt(hydra_cfg)
dist.barrier()

# # Step 4: build_library_and_stats (single process)
# if dist.is_master():
# build_library_and_stats.launch_build_library_and_stats(hydra_cfg)
# dist.barrier()
# Step 4: build_library_and_stats (single process)
if dist.is_master():
build_library_and_stats.launch_build_library_and_stats(hydra_cfg)
dist.barrier()

# # Step 5: calc_one_block_scores (distributed processing)
# scoring.launch_scoring(hydra_cfg)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,33 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module constructs the replacement library JSON files from a puzzle directory containing
"""
This module constructs the replacement library JSON files from a puzzle directory containing
multiple trained model checkpoints. It analyzes checkpoints to extract unique block and subblock
configurations, builds a library of available replacements, and generates solutions for layer
replacement in compressed models. The resulting replacement library can then be used by
ReplacementLibrary to efficiently load models with mixed teacher/student layers.

Standard Puzzle Usage:
======================
python -m modelopt.torch.puzzletron.replacement_library.build_replacement_library PUZZLE_DIR

The teacher checkpoint dir is assumed to be at PUZZLE_DIR/ckpts/teacher (a symlink is recommended),
though you can supply an explicit --teacher_checkpoint_dir instead.

--add_ffn_no_ops and --add_attention_no_ops are optional (both default to True).


Untrained puzzle run (with bypass):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we dont have bypass

===================================
Any subblock in the checkpoint that you are not interested in should be set to no_op.

"""
# mypy: ignore-errors

import json
from pathlib import Path
from typing import Any
from typing import Any, Type

import pandas as pd
from omegaconf import DictConfig
Expand Down Expand Up @@ -57,7 +73,8 @@ def build_replacement_library(
add_ffn_no_ops: bool = True,
add_attention_no_ops: bool = True,
) -> None:
"""For normal puzzle runs, use default values.
"""
For normal puzzle runs, use default values.
For advanced use cases, see the Usage section.
"""
master_puzzle_dir = Path(master_puzzle_dir)
Expand Down Expand Up @@ -90,7 +107,9 @@ def build_replacement_library(


def launch_build_replacement_library(cfg: DictConfig) -> None:
"""Launch the build replacement library function with Hydra configuration."""
"""
Launch the build replacement library function with Hydra configuration.
"""
mprint(f"Building replacement library for puzzle directory: {cfg.puzzle_dir}")
mprint(f"Teacher directory: {cfg.teacher_dir}")
mprint(
Expand All @@ -113,8 +132,8 @@ def infer_teacher_dir(
teacher_checkpoint_dir = Path(master_puzzle_dir) / CHECKPOINTS_DIR_NAME / "teacher"
if not teacher_checkpoint_dir.exists():
raise ValueError(
"You must either provide the --teacher_checkpoint_dir argument, or create a link to the "
"teacher dir under '{PUZZLE_DIR}/ckpts'."
f"You must either provide the --teacher_checkpoint_dir argument, or create a link to the "
f"teacher dir under '{{PUZZLE_DIR}}/ckpts'."
)
teacher_checkpoint_dir = Path(teacher_checkpoint_dir).resolve().absolute()
return teacher_checkpoint_dir
Expand Down Expand Up @@ -362,7 +381,7 @@ def _add_no_op_subblock_rows(

def _get_rows_with_no_op_subblock(
subblocks_df: pd.DataFrame, no_op_subblock: str
) -> tuple[pd.DataFrame, type[AttentionConfig] | type[FFNConfig]]:
) -> tuple[pd.DataFrame, Type[AttentionConfig] | Type[FFNConfig]]:
other_subblock = "ffn" if no_op_subblock == "attention" else "attention"
subblock_cls = AttentionConfig if no_op_subblock == "attention" else FFNConfig
no_op_subblock_config = subblock_cls(no_op=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def calculate_attention_memory(
):
seq_len = min(seq_len, attention_chunk_size)

kv_dim = calculate_kv_dim(attention_config.n_heads_in_group, n_head, n_embd)
kv_dim = calculate_kv_dim(attention_config.num_key_value_heads, n_head, n_embd)
total_num_tokens = seq_len * (batch_size + prefill_queue_size)
kv_cache_size = total_num_tokens * kv_dim
query_prefill_size = seq_len * n_embd if allocate_prefill_query else 0
Expand All @@ -208,7 +208,7 @@ def calculate_attention_params(
n_embd: int,
n_head: int,
) -> int:
kv_dim = calculate_kv_dim(attention_config.n_heads_in_group, n_head, n_embd)
kv_dim = calculate_kv_dim(attention_config.num_key_value_heads, n_head, n_embd)
return (
n_embd * n_embd * 2 # Wq + Wo
+ n_embd * kv_dim # Wk + Wv
Expand Down
45 changes: 34 additions & 11 deletions modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,10 @@
import dataclasses
import json
import os
from collections.abc import Iterable
from functools import partial
from itertools import product
from pathlib import Path
from typing import TypeVar
from typing import Iterable, Optional, Type, TypeVar

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

Expand All @@ -33,6 +32,10 @@
from omegaconf import DictConfig, ListConfig, OmegaConf
from tqdm import tqdm

from modelopt.torch.puzzletron.anymodel.model_descriptor import (
ModelDescriptor,
ModelDescriptorFactory,
)
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
AttentionConfig,
BlockConfig,
Expand All @@ -56,6 +59,15 @@
# Type variable for dataclasses
T_DataClass = TypeVar("T_DataClass")

"""
Usage:
python -m modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --benchmark_iterations 1000 ]

--benchmark_iterations=None (the default) means that the code won't use infery to benchmark runtime;
only memory stats will be calculated. If you want to benchmark runtime, run inside an infery-llm docker container.

"""


def calculate_subblock_stats(
calc_subblock_stats_config: DictConfig,
Expand All @@ -69,7 +81,7 @@ def calculate_subblock_stats(
n_embd: int,
n_head: int,
vocab_size: int,
benchmark_iterations: int | None,
benchmark_iterations: Optional[int],
use_cuda_graph: bool,
weights_dtype: torch.dtype,
activations_dtype: torch.dtype,
Expand Down Expand Up @@ -181,6 +193,7 @@ def calculate_subblock_stats(
)

if is_calc_runtime:
pass
# TODO: fix
# from puzzle_tools.calc_subblock_runtime import measure_non_block_runtime_ms
# non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = \
Expand All @@ -206,25 +219,29 @@ def calculate_subblock_stats(


def launch_calc_subblock_stats(cfg: DictConfig) -> None:
"""Launch the calc subblock stats function with Hydra configuration."""
"""
Launch the calc subblock stats function with Hydra configuration.
"""
mprint(f"Calculating subblock stats for puzzle directory: {cfg.puzzle_dir}")
mprint(f"Teacher directory: {cfg.teacher_dir}")
mprint(
f"Calc subblock stats config: {format_global_config(cfg.calc_subblock_stats, title='Calc subblock stats')}"
)

descriptor = ModelDescriptorFactory.get(cfg.descriptor)
calculate_subblock_stats_for_puzzle_dir(
cfg.calc_subblock_stats,
master_puzzle_dir=cfg.puzzle_dir,
teacher_dir=cfg.teacher_dir,
descriptor=descriptor,
model_hidden_sizes=cfg.calc_subblock_stats.get("model_hidden_sizes", OmegaConf.create([])),
ffn_hidden_sizes=cfg.calc_subblock_stats.get("ffn_hidden_sizes", OmegaConf.create([])),
batch_sizes=cfg.calc_subblock_stats.batch_sizes,
prefill_seq_len=cfg.calc_subblock_stats.prefill_seq_len,
generation_seq_len=cfg.calc_subblock_stats.generation_seq_len,
num_active_tokens_override=cfg.calc_subblock_stats.get("num_active_tokens_override", None),
prefill_queue_size=cfg.calc_subblock_stats.prefill_queue_size,
allocate_prefill_query=cfg.calc_subblock_stats.allocate_prefill_query,
allocate_prefill_query=cfg.calc_subblock_stats.get("allocate_prefill_query", False),
benchmark_iterations=cfg.calc_subblock_stats.get("benchmark_iterations", None),
merge_with_existing_stats=cfg.calc_subblock_stats.merge_with_existing_stats,
subblock_stats_filename=cfg.calc_subblock_stats.subblock_stats_filename,
Expand All @@ -236,6 +253,7 @@ def calculate_subblock_stats_for_puzzle_dir(
calc_subblock_stats_config: DictConfig,
master_puzzle_dir: Path | str,
teacher_dir: Path | str,
descriptor: Type[ModelDescriptor],
model_hidden_sizes: ListConfig,
ffn_hidden_sizes: ListConfig,
batch_sizes: Iterable[int] = (1, 8, 16, 32, 64, 128, 256),
Expand Down Expand Up @@ -268,6 +286,8 @@ def calculate_subblock_stats_for_puzzle_dir(
Path(teacher_dir) if teacher_dir is not None else master_puzzle_dir / "ckpts" / "teacher"
)
model_config = load_model_config(teacher_dir)
# Get language model config for LM-specific attributes (VL models have nested config)
lm_config = descriptor.get_language_model_config(model_config)
subblock_configs = _load_subblock_configs(master_puzzle_dir, ffn_hidden_sizes, model_config)

subblock_stats_file = master_puzzle_dir / subblock_stats_filename
Expand Down Expand Up @@ -299,7 +319,7 @@ def calculate_subblock_stats_for_puzzle_dir(
]

model_hidden_sizes = model_hidden_sizes + [
model_config.hidden_size
lm_config.hidden_size
] # add a teacher model hidden size
for batch_size, (
weights_dtype,
Expand All @@ -323,8 +343,8 @@ def calculate_subblock_stats_for_puzzle_dir(
generation_seq_len=generation_seq_len,
prefill_queue_size=prefill_queue_size,
n_embd=model_hidden_size,
n_head=model_config.num_attention_heads,
vocab_size=model_config.vocab_size,
n_head=lm_config.num_attention_heads,
vocab_size=lm_config.vocab_size,
benchmark_iterations=curr_benchmark_iterations,
use_cuda_graph=True,
weights_dtype=weights_dtype,
Expand Down Expand Up @@ -445,7 +465,7 @@ def _load_subblock_configs_from_replacement_library(
return subblock_configs


T_DataClass: TypeVar = type[dataclasses.dataclass]
T_DataClass: TypeVar = Type[dataclasses.dataclass]


def _dataclass_from_dict(
Expand Down Expand Up @@ -483,7 +503,7 @@ def add_int8_runtime_estimates(subblock_stats: list[dict]) -> None:
if (subblock_config := curr_subblock.get("subblock_config")) is not None:
if hasattr(subblock_config, "__dataclass_fields__"):
subblock_config = dataclasses.asdict(subblock_config)
is_attention = subblock_config.get("n_heads_in_group", None) is not None
is_attention = subblock_config.get("num_key_value_heads", None) is not None
runtime_factor = attention_factor if is_attention else ffn_factor
for stat_name, stat_value in bf16_subblock.items():
if "runtime" in stat_name:
Expand Down Expand Up @@ -512,7 +532,10 @@ def _find_corresponding_bf16_stats(args: dict, subblock_stats: list[dict]) -> di
stats
for stats in subblock_stats
if all(
[stats["args"][key] == corresponding_bf16_args[key] for key in corresponding_bf16_args]
[
stats["args"][key] == corresponding_bf16_args[key]
for key in corresponding_bf16_args.keys()
]
)
]
if len(matching_bf16_stats) == 0:
Expand Down
Loading
Loading