NVIDIA · danielkorzekwa · Mar 6, 2026 · kevalmorabia97 · Mar 6, 2026
@@ -14,7 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Unified command that runs build_replacement_library followed by calc_subblock_stats.
+"""
+Unified command that runs build_replacement_library followed by calc_subblock_stats.
 
 This script combines the functionality of both commands into a single workflow:
 1. First, it builds the replacement library for the puzzle
@@ -28,17 +29,21 @@
 all the same configuration parameters for both build_replacement_library and calc_subblock_stats.
 """
 
+import hydra
 from omegaconf import DictConfig
 
 from modelopt.torch.puzzletron.replacement_library.build_replacement_library import (
     launch_build_replacement_library,
 )
 from modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats import launch_calc_subblock_stats
+from modelopt.torch.puzzletron.tools.hydra_utils import register_hydra_resolvers
 from modelopt.torch.puzzletron.tools.logger import mprint
+from modelopt.torch.puzzletron.utils.parsing import format_global_config
 
 
 def launch_build_library_and_stats(cfg: DictConfig) -> None:
-    """Launch both build_replacement_library and calc_subblock_stats in sequence.
+    """
+    Launch both build_replacement_library and calc_subblock_stats in sequence.
 
     Args:
         cfg: Hydra configuration containing settings for both commands

@@ -62,10 +62,10 @@ def puzzletron(
         pruning_ckpts.launch_prune_ckpt(hydra_cfg)
     dist.barrier()
 
-    # # Step 4: build_library_and_stats (single process)
-    # if dist.is_master():
-    #     build_library_and_stats.launch_build_library_and_stats(hydra_cfg)
-    # dist.barrier()
+    # Step 4: build_library_and_stats (single process)
+    if dist.is_master():
+        build_library_and_stats.launch_build_library_and_stats(hydra_cfg)
+    dist.barrier()
 
     # # Step 5: calc_one_block_scores (distributed processing)
     # scoring.launch_scoring(hydra_cfg)

@@ -12,17 +12,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""This module constructs the replacement library JSON files from a puzzle directory containing
+"""
+This module constructs the replacement library JSON files from a puzzle directory containing
 multiple trained model checkpoints. It analyzes checkpoints to extract unique block and subblock
 configurations, builds a library of available replacements, and generates solutions for layer
 replacement in compressed models. The resulting replacement library can then be used by
 ReplacementLibrary to efficiently load models with mixed teacher/student layers.
+
+Standard Puzzle Usage:
+======================
+python -m modelopt.torch.puzzletron.replacement_library.build_replacement_library PUZZLE_DIR
+
+The teacher checkpoint dir is assumed to be at PUZZLE_DIR/ckpts/teacher (a symlink is recommended),
+though you can supply an explicit --teacher_checkpoint_dir.
+
+--add_ffn_no_ops and --add_attention_no_ops are optional (default True).
+
+
+Untrained puzzle run (with bypass):
+===================================
+The subblock that doesn't interest you in the checkpoint should be no_op.
+
 """
 # mypy: ignore-errors
 
 import json
 from pathlib import Path
-from typing import Any
+from typing import Any, Type
 
 import pandas as pd
 from omegaconf import DictConfig
@@ -57,7 +73,8 @@ def build_replacement_library(
     add_ffn_no_ops: bool = True,
     add_attention_no_ops: bool = True,
 ) -> None:
-    """For normal puzzle runs, use default values.
+    """
+    For normal puzzle runs, use default values.
     For advanced use cases, see the Usage section.
     """
     master_puzzle_dir = Path(master_puzzle_dir)
@@ -90,7 +107,9 @@ def build_replacement_library(
 
 
 def launch_build_replacement_library(cfg: DictConfig) -> None:
-    """Launch the build replacement library function with Hydra configuration."""
+    """
+    Launch the build replacement library function with Hydra configuration.
+    """
     mprint(f"Building replacement library for puzzle directory: {cfg.puzzle_dir}")
     mprint(f"Teacher directory: {cfg.teacher_dir}")
     mprint(
@@ -113,8 +132,8 @@ def infer_teacher_dir(
         teacher_checkpoint_dir = Path(master_puzzle_dir) / CHECKPOINTS_DIR_NAME / "teacher"
         if not teacher_checkpoint_dir.exists():
             raise ValueError(
-                "You must either provide the --teacher_checkpoint_dir argument, or create a link to the "
-                "teacher dir under '{PUZZLE_DIR}/ckpts'."
+                f"You must either provide the --teacher_checkpoint_dir argument, or create a link to the "
+                f"teacher dir under '{{PUZZLE_DIR}}/ckpts'."
             )
     teacher_checkpoint_dir = Path(teacher_checkpoint_dir).resolve().absolute()
     return teacher_checkpoint_dir
@@ -362,7 +381,7 @@ def _add_no_op_subblock_rows(
 
 def _get_rows_with_no_op_subblock(
     subblocks_df: pd.DataFrame, no_op_subblock: str
-) -> tuple[pd.DataFrame, type[AttentionConfig] | type[FFNConfig]]:
+) -> tuple[pd.DataFrame, Type[AttentionConfig] | Type[FFNConfig]]:
     other_subblock = "ffn" if no_op_subblock == "attention" else "attention"
     subblock_cls = AttentionConfig if no_op_subblock == "attention" else FFNConfig
     no_op_subblock_config = subblock_cls(no_op=True)

@@ -189,7 +189,7 @@ def calculate_attention_memory(
     ):
         seq_len = min(seq_len, attention_chunk_size)
 
-    kv_dim = calculate_kv_dim(attention_config.n_heads_in_group, n_head, n_embd)
+    kv_dim = calculate_kv_dim(attention_config.num_key_value_heads, n_head, n_embd)
     total_num_tokens = seq_len * (batch_size + prefill_queue_size)
     kv_cache_size = total_num_tokens * kv_dim
     query_prefill_size = seq_len * n_embd if allocate_prefill_query else 0
@@ -208,7 +208,7 @@ def calculate_attention_params(
     n_embd: int,
     n_head: int,
 ) -> int:
-    kv_dim = calculate_kv_dim(attention_config.n_heads_in_group, n_head, n_embd)
+    kv_dim = calculate_kv_dim(attention_config.num_key_value_heads, n_head, n_embd)
     return (
         n_embd * n_embd * 2  # Wq + Wo
         + n_embd * kv_dim  # Wk + Wv

@@ -19,11 +19,10 @@
 import dataclasses
 import json
 import os
-from collections.abc import Iterable
 from functools import partial
 from itertools import product
 from pathlib import Path
-from typing import TypeVar
+from typing import Iterable, Optional, Type, TypeVar
 
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
@@ -33,6 +32,10 @@
 from omegaconf import DictConfig, ListConfig, OmegaConf
 from tqdm import tqdm
 
+from modelopt.torch.puzzletron.anymodel.model_descriptor import (
+    ModelDescriptor,
+    ModelDescriptorFactory,
+)
 from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
     AttentionConfig,
     BlockConfig,
@@ -56,6 +59,15 @@
 # Type variable for dataclasses
 T_DataClass = TypeVar("T_DataClass")
 
+"""
+Usage:
+python -m modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --benchmark_iterations 1000 ]
+
+--benchmark_iterations=None (the default) means that the code won't use infery to benchmark runtime;
+  only memory stats will be calculated. If you want to benchmark runtime, run inside an infery-llm docker.
+
+"""
+
 
 def calculate_subblock_stats(
     calc_subblock_stats_config: DictConfig,
@@ -69,7 +81,7 @@ def calculate_subblock_stats(
     n_embd: int,
     n_head: int,
     vocab_size: int,
-    benchmark_iterations: int | None,
+    benchmark_iterations: Optional[int],
     use_cuda_graph: bool,
     weights_dtype: torch.dtype,
     activations_dtype: torch.dtype,
@@ -181,6 +193,7 @@ def calculate_subblock_stats(
         )
 
     if is_calc_runtime:
+        pass
         # TODO: fix
         # from puzzle_tools.calc_subblock_runtime import measure_non_block_runtime_ms
         # non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = \
@@ -206,25 +219,29 @@ def calculate_subblock_stats(
 
 
 def launch_calc_subblock_stats(cfg: DictConfig) -> None:
-    """Launch the calc subblock stats function with Hydra configuration."""
+    """
+    Launch the calc subblock stats function with Hydra configuration.
+    """
     mprint(f"Calculating subblock stats for puzzle directory: {cfg.puzzle_dir}")
     mprint(f"Teacher directory: {cfg.teacher_dir}")
     mprint(
         f"Calc subblock stats config: {format_global_config(cfg.calc_subblock_stats, title='Calc subblock stats')}"
     )
 
+    descriptor = ModelDescriptorFactory.get(cfg.descriptor)
     calculate_subblock_stats_for_puzzle_dir(
         cfg.calc_subblock_stats,
         master_puzzle_dir=cfg.puzzle_dir,
         teacher_dir=cfg.teacher_dir,
+        descriptor=descriptor,
         model_hidden_sizes=cfg.calc_subblock_stats.get("model_hidden_sizes", OmegaConf.create([])),
         ffn_hidden_sizes=cfg.calc_subblock_stats.get("ffn_hidden_sizes", OmegaConf.create([])),
         batch_sizes=cfg.calc_subblock_stats.batch_sizes,
         prefill_seq_len=cfg.calc_subblock_stats.prefill_seq_len,
         generation_seq_len=cfg.calc_subblock_stats.generation_seq_len,
         num_active_tokens_override=cfg.calc_subblock_stats.get("num_active_tokens_override", None),
         prefill_queue_size=cfg.calc_subblock_stats.prefill_queue_size,
-        allocate_prefill_query=cfg.calc_subblock_stats.allocate_prefill_query,
+        allocate_prefill_query=cfg.calc_subblock_stats.get("allocate_prefill_query", False),
         benchmark_iterations=cfg.calc_subblock_stats.get("benchmark_iterations", None),
         merge_with_existing_stats=cfg.calc_subblock_stats.merge_with_existing_stats,
         subblock_stats_filename=cfg.calc_subblock_stats.subblock_stats_filename,
@@ -236,6 +253,7 @@ def calculate_subblock_stats_for_puzzle_dir(
     calc_subblock_stats_config: DictConfig,
     master_puzzle_dir: Path | str,
     teacher_dir: Path | str,
+    descriptor: Type[ModelDescriptor],
     model_hidden_sizes: ListConfig,
     ffn_hidden_sizes: ListConfig,
     batch_sizes: Iterable[int] = (1, 8, 16, 32, 64, 128, 256),
@@ -268,6 +286,8 @@ def calculate_subblock_stats_for_puzzle_dir(
         Path(teacher_dir) if teacher_dir is not None else master_puzzle_dir / "ckpts" / "teacher"
     )
     model_config = load_model_config(teacher_dir)
+    # Get language model config for LM-specific attributes (VL models have nested config)
+    lm_config = descriptor.get_language_model_config(model_config)
     subblock_configs = _load_subblock_configs(master_puzzle_dir, ffn_hidden_sizes, model_config)
 
     subblock_stats_file = master_puzzle_dir / subblock_stats_filename
@@ -299,7 +319,7 @@ def calculate_subblock_stats_for_puzzle_dir(
     ]
 
     model_hidden_sizes = model_hidden_sizes + [
-        model_config.hidden_size
+        lm_config.hidden_size
     ]  # add a teacher model hidden size
     for batch_size, (
         weights_dtype,
@@ -323,8 +343,8 @@ def calculate_subblock_stats_for_puzzle_dir(
             generation_seq_len=generation_seq_len,
             prefill_queue_size=prefill_queue_size,
             n_embd=model_hidden_size,
-            n_head=model_config.num_attention_heads,
-            vocab_size=model_config.vocab_size,
+            n_head=lm_config.num_attention_heads,
+            vocab_size=lm_config.vocab_size,
             benchmark_iterations=curr_benchmark_iterations,
             use_cuda_graph=True,
             weights_dtype=weights_dtype,
@@ -445,7 +465,7 @@ def _load_subblock_configs_from_replacement_library(
     return subblock_configs
 
 
-T_DataClass: TypeVar = type[dataclasses.dataclass]
+T_DataClass: TypeVar = Type[dataclasses.dataclass]
 
 
 def _dataclass_from_dict(
@@ -483,7 +503,7 @@ def add_int8_runtime_estimates(subblock_stats: list[dict]) -> None:
                     if (subblock_config := curr_subblock.get("subblock_config")) is not None:
                         if hasattr(subblock_config, "__dataclass_fields__"):
                             subblock_config = dataclasses.asdict(subblock_config)
-                        is_attention = subblock_config.get("n_heads_in_group", None) is not None
+                        is_attention = subblock_config.get("num_key_value_heads", None) is not None
                     runtime_factor = attention_factor if is_attention else ffn_factor
                     for stat_name, stat_value in bf16_subblock.items():
                         if "runtime" in stat_name:
@@ -512,7 +532,10 @@ def _find_corresponding_bf16_stats(args: dict, subblock_stats: list[dict]) -> di
         stats
         for stats in subblock_stats
         if all(
-            [stats["args"][key] == corresponding_bf16_args[key] for key in corresponding_bf16_args]
+            [
+                stats["args"][key] == corresponding_bf16_args[key]
+                for key in corresponding_bf16_args.keys()
+            ]
         )
     ]
     if len(matching_bf16_stats) == 0: