diff --git a/modelopt/torch/puzzletron/build_library_and_stats.py b/modelopt/torch/puzzletron/build_library_and_stats.py
index 5f04f6049..31cebdf6b 100644
--- a/modelopt/torch/puzzletron/build_library_and_stats.py
+++ b/modelopt/torch/puzzletron/build_library_and_stats.py
@@ -14,7 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Unified command that runs build_replacement_library followed by calc_subblock_stats.
+"""
+Unified command that runs build_replacement_library followed by calc_subblock_stats.
 
 This script combines the functionality of both commands into a single workflow:
 1. First, it builds the replacement library for the puzzle
@@ -28,17 +29,21 @@
 all the same configuration parameters for both build_replacement_library and calc_subblock_stats.
 """
 
+import hydra
 from omegaconf import DictConfig
 
 from modelopt.torch.puzzletron.replacement_library.build_replacement_library import (
     launch_build_replacement_library,
 )
 from modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats import launch_calc_subblock_stats
+from modelopt.torch.puzzletron.tools.hydra_utils import register_hydra_resolvers
 from modelopt.torch.puzzletron.tools.logger import mprint
+from modelopt.torch.puzzletron.utils.parsing import format_global_config
 
 
 def launch_build_library_and_stats(cfg: DictConfig) -> None:
-    """Launch both build_replacement_library and calc_subblock_stats in sequence.
+    """
+    Launch both build_replacement_library and calc_subblock_stats in sequence.
 
     Args:
         cfg: Hydra configuration containing settings for both commands
diff --git a/modelopt/torch/puzzletron/puzzletron.py b/modelopt/torch/puzzletron/puzzletron.py
index 94a1de57e..87d90fdd9 100644
--- a/modelopt/torch/puzzletron/puzzletron.py
+++ b/modelopt/torch/puzzletron/puzzletron.py
@@ -62,10 +62,10 @@ def puzzletron(
         pruning_ckpts.launch_prune_ckpt(hydra_cfg)
     dist.barrier()
 
-    # # Step 4: build_library_and_stats (single process)
-    # if dist.is_master():
-    #     build_library_and_stats.launch_build_library_and_stats(hydra_cfg)
-    # dist.barrier()
+    # Step 4: build_library_and_stats (single process)
+    if dist.is_master():
+        build_library_and_stats.launch_build_library_and_stats(hydra_cfg)
+    dist.barrier()
 
     # # Step 5: calc_one_block_scores (distributed processing)
     # scoring.launch_scoring(hydra_cfg)
diff --git a/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py b/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py
index 1618aceaf..aec10e03b 100644
--- a/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py
+++ b/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py
@@ -12,17 +12,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""This module constructs the replacement library JSON files from a puzzle directory containing
+"""
+This module constructs the replacement library JSON files from a puzzle directory containing
 multiple trained model checkpoints. It analyzes checkpoints to extract unique block and subblock
 configurations, builds a library of available replacements, and generates solutions for layer
 replacement in compressed models. The resulting replacement library can then be used by
 ReplacementLibrary to efficiently load models with mixed teacher/student layers.
+
+Standard Puzzle Usage:
+======================
+python -m modelopt.torch.puzzletron.replacement_library.build_replacement_library PUZZLE_DIR
+
+The teacher checkpoint dir is assumed to be at PUZZLE_DIR/ckpts/teacher (a symlink is recommended),
+though you can supply an explicit --teacher_checkpoint_dir instead.
+
+--add_ffn_no_ops and --add_attention_no_ops are optional (default True).
+
+
+Untrained puzzle run (with bypass):
+===================================
+The subblock that doesn't interest you in the checkpoint should be no_op.
+
 """
 # mypy: ignore-errors
 
 import json
 from pathlib import Path
-from typing import Any
+from typing import Any, Type
 
 import pandas as pd
 from omegaconf import DictConfig
@@ -57,7 +73,8 @@ def build_replacement_library(
     add_ffn_no_ops: bool = True,
     add_attention_no_ops: bool = True,
 ) -> None:
-    """For normal puzzle runs, use default values.
+    """
+    For normal puzzle runs, use default values.
     For advanced use cases, see the Usage section.
     """
     master_puzzle_dir = Path(master_puzzle_dir)
@@ -90,7 +107,9 @@ def build_replacement_library(
 
 
 def launch_build_replacement_library(cfg: DictConfig) -> None:
-    """Launch the build replacement library function with Hydra configuration."""
+    """
+    Launch the build replacement library function with Hydra configuration.
+    """
     mprint(f"Building replacement library for puzzle directory: {cfg.puzzle_dir}")
     mprint(f"Teacher directory: {cfg.teacher_dir}")
     mprint(
@@ -113,8 +132,8 @@ def infer_teacher_dir(
         teacher_checkpoint_dir = Path(master_puzzle_dir) / CHECKPOINTS_DIR_NAME / "teacher"
         if not teacher_checkpoint_dir.exists():
             raise ValueError(
-                "You must either provide the --teacher_checkpoint_dir argument, or create a link to the "
-                "teacher dir under '{PUZZLE_DIR}/ckpts'."
+                f"You must either provide the --teacher_checkpoint_dir argument, or create a link to the "
+                f"teacher dir under '{{PUZZLE_DIR}}/ckpts'."
             )
     teacher_checkpoint_dir = Path(teacher_checkpoint_dir).resolve().absolute()
     return teacher_checkpoint_dir
@@ -362,7 +381,7 @@ def _add_no_op_subblock_rows(
 
 def _get_rows_with_no_op_subblock(
     subblocks_df: pd.DataFrame, no_op_subblock: str
-) -> tuple[pd.DataFrame, type[AttentionConfig] | type[FFNConfig]]:
+) -> tuple[pd.DataFrame, Type[AttentionConfig] | Type[FFNConfig]]:
     other_subblock = "ffn" if no_op_subblock == "attention" else "attention"
     subblock_cls = AttentionConfig if no_op_subblock == "attention" else FFNConfig
     no_op_subblock_config = subblock_cls(no_op=True)
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
index 2e8630bc9..88081d177 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
@@ -189,7 +189,7 @@ def calculate_attention_memory(
     ):
         seq_len = min(seq_len, attention_chunk_size)
 
-    kv_dim = calculate_kv_dim(attention_config.n_heads_in_group, n_head, n_embd)
+    kv_dim = calculate_kv_dim(attention_config.num_key_value_heads, n_head, n_embd)
     total_num_tokens = seq_len * (batch_size + prefill_queue_size)
     kv_cache_size = total_num_tokens * kv_dim
     query_prefill_size = seq_len * n_embd if allocate_prefill_query else 0
@@ -208,7 +208,7 @@ def calculate_attention_params(
     n_embd: int,
     n_head: int,
 ) -> int:
-    kv_dim = calculate_kv_dim(attention_config.n_heads_in_group, n_head, n_embd)
+    kv_dim = calculate_kv_dim(attention_config.num_key_value_heads, n_head, n_embd)
     return (
         n_embd * n_embd * 2  # Wq + Wo
         + n_embd * kv_dim  # Wk + Wv
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index 07597eb5c..2db0bc391 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -19,11 +19,10 @@
 import dataclasses
 import json
 import os
-from collections.abc import Iterable
 from functools import partial
 from itertools import product
 from pathlib import Path
-from typing import TypeVar
+from typing import Iterable, Optional, Type, TypeVar
 
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
@@ -33,6 +32,10 @@
 from omegaconf import DictConfig, ListConfig, OmegaConf
 from tqdm import tqdm
 
+from modelopt.torch.puzzletron.anymodel.model_descriptor import (
+    ModelDescriptor,
+    ModelDescriptorFactory,
+)
 from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
     AttentionConfig,
     BlockConfig,
@@ -56,6 +59,15 @@
 # Type variable for dataclasses
 T_DataClass = TypeVar("T_DataClass")
 
+"""
+Usage:
+python -m modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --benchmark_iterations 1000 ]
+
+--benchmark_iterations=None (the default) means that the code won't use infery to benchmark runtime;
+  only memory stats will be calculated. If you want to benchmark runtime, run inside an infery-llm docker.
+
+"""
+
 
 def calculate_subblock_stats(
     calc_subblock_stats_config: DictConfig,
@@ -69,7 +81,7 @@ def calculate_subblock_stats(
     n_embd: int,
     n_head: int,
     vocab_size: int,
-    benchmark_iterations: int | None,
+    benchmark_iterations: Optional[int],
     use_cuda_graph: bool,
     weights_dtype: torch.dtype,
     activations_dtype: torch.dtype,
@@ -181,6 +193,7 @@ def calculate_subblock_stats(
         )
 
     if is_calc_runtime:
+        pass
         # TODO: fix
         # from puzzle_tools.calc_subblock_runtime import measure_non_block_runtime_ms
         # non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = \
@@ -206,17 +219,21 @@ def calculate_subblock_stats(
 
 
 def launch_calc_subblock_stats(cfg: DictConfig) -> None:
-    """Launch the calc subblock stats function with Hydra configuration."""
+    """
+    Launch the calc subblock stats function with Hydra configuration.
+    """
     mprint(f"Calculating subblock stats for puzzle directory: {cfg.puzzle_dir}")
     mprint(f"Teacher directory: {cfg.teacher_dir}")
     mprint(
         f"Calc subblock stats config: {format_global_config(cfg.calc_subblock_stats, title='Calc subblock stats')}"
     )
 
+    descriptor = ModelDescriptorFactory.get(cfg.descriptor)
     calculate_subblock_stats_for_puzzle_dir(
         cfg.calc_subblock_stats,
         master_puzzle_dir=cfg.puzzle_dir,
         teacher_dir=cfg.teacher_dir,
+        descriptor=descriptor,
         model_hidden_sizes=cfg.calc_subblock_stats.get("model_hidden_sizes", OmegaConf.create([])),
         ffn_hidden_sizes=cfg.calc_subblock_stats.get("ffn_hidden_sizes", OmegaConf.create([])),
         batch_sizes=cfg.calc_subblock_stats.batch_sizes,
@@ -224,7 +241,7 @@ def launch_calc_subblock_stats(cfg: DictConfig) -> None:
         generation_seq_len=cfg.calc_subblock_stats.generation_seq_len,
         num_active_tokens_override=cfg.calc_subblock_stats.get("num_active_tokens_override", None),
         prefill_queue_size=cfg.calc_subblock_stats.prefill_queue_size,
-        allocate_prefill_query=cfg.calc_subblock_stats.allocate_prefill_query,
+        allocate_prefill_query=cfg.calc_subblock_stats.get("allocate_prefill_query", False),
         benchmark_iterations=cfg.calc_subblock_stats.get("benchmark_iterations", None),
         merge_with_existing_stats=cfg.calc_subblock_stats.merge_with_existing_stats,
         subblock_stats_filename=cfg.calc_subblock_stats.subblock_stats_filename,
@@ -236,6 +253,7 @@ def calculate_subblock_stats_for_puzzle_dir(
     calc_subblock_stats_config: DictConfig,
     master_puzzle_dir: Path | str,
     teacher_dir: Path | str,
+    descriptor: Type[ModelDescriptor],
     model_hidden_sizes: ListConfig,
     ffn_hidden_sizes: ListConfig,
     batch_sizes: Iterable[int] = (1, 8, 16, 32, 64, 128, 256),
@@ -268,6 +286,8 @@ def calculate_subblock_stats_for_puzzle_dir(
         Path(teacher_dir) if teacher_dir is not None else master_puzzle_dir / "ckpts" / "teacher"
     )
     model_config = load_model_config(teacher_dir)
+    # Get language model config for LM-specific attributes (VL models have nested config)
+    lm_config = descriptor.get_language_model_config(model_config)
     subblock_configs = _load_subblock_configs(master_puzzle_dir, ffn_hidden_sizes, model_config)
 
     subblock_stats_file = master_puzzle_dir / subblock_stats_filename
@@ -299,7 +319,7 @@ def calculate_subblock_stats_for_puzzle_dir(
     ]
 
     model_hidden_sizes = model_hidden_sizes + [
-        model_config.hidden_size
+        lm_config.hidden_size
     ]  # add a teacher model hidden size
     for batch_size, (
         weights_dtype,
@@ -323,8 +343,8 @@ def calculate_subblock_stats_for_puzzle_dir(
             generation_seq_len=generation_seq_len,
             prefill_queue_size=prefill_queue_size,
             n_embd=model_hidden_size,
-            n_head=model_config.num_attention_heads,
-            vocab_size=model_config.vocab_size,
+            n_head=lm_config.num_attention_heads,
+            vocab_size=lm_config.vocab_size,
             benchmark_iterations=curr_benchmark_iterations,
             use_cuda_graph=True,
             weights_dtype=weights_dtype,
@@ -445,7 +465,7 @@ def _load_subblock_configs_from_replacement_library(
     return subblock_configs
 
 
-T_DataClass: TypeVar = type[dataclasses.dataclass]
+T_DataClass: TypeVar = Type[dataclasses.dataclass]
 
 
 def _dataclass_from_dict(
@@ -483,7 +503,7 @@ def add_int8_runtime_estimates(subblock_stats: list[dict]) -> None:
                     if (subblock_config := curr_subblock.get("subblock_config")) is not None:
                         if hasattr(subblock_config, "__dataclass_fields__"):
                             subblock_config = dataclasses.asdict(subblock_config)
-                        is_attention = subblock_config.get("n_heads_in_group", None) is not None
+                        is_attention = subblock_config.get("num_key_value_heads", None) is not None
                     runtime_factor = attention_factor if is_attention else ffn_factor
                     for stat_name, stat_value in bf16_subblock.items():
                         if "runtime" in stat_name:
@@ -512,7 +532,10 @@ def _find_corresponding_bf16_stats(args: dict, subblock_stats: list[dict]) -> di
         stats
         for stats in subblock_stats
         if all(
-            [stats["args"][key] == corresponding_bf16_args[key] for key in corresponding_bf16_args]
+            [
+                stats["args"][key] == corresponding_bf16_args[key]
+                for key in corresponding_bf16_args.keys()
+            ]
         )
     ]
     if len(matching_bf16_stats) == 0:
diff --git a/modelopt/torch/puzzletron/utils/utils.py b/modelopt/torch/puzzletron/utils/utils.py
index d56aab0bd..77a13609a 100644
--- a/modelopt/torch/puzzletron/utils/utils.py
+++ b/modelopt/torch/puzzletron/utils/utils.py
@@ -28,24 +28,21 @@
 )
 
 
-def calculate_kv_dim(n_heads_in_group: int, n_head: int, n_embd: int) -> int:
+def calculate_kv_dim(num_key_value_heads: int, n_head: int, n_embd: int) -> int:
     """Calculate the key-value dimension for grouped-query attention.
 
-    TODO: Consider a better place for this function.
-
     Args:
-        n_heads_in_group: Number of attention heads per key-value group.
+        num_key_value_heads: Number of key-value heads, or None (in which case 0 is returned).
         n_head: Total number of attention heads.
         n_embd: Embedding dimension.
 
     Returns:
-        Combined dimension for key and value tensors (2 * n_kv_heads * head_size).
+        Combined dimension for key and value tensors (2 * num_key_value_heads * head_size).
     """
-    if n_heads_in_group is None:
+    if num_key_value_heads is None:
         return 0
-    n_kv_heads = n_head // n_heads_in_group
     head_size = n_embd // n_head
-    kv_dim = 2 * n_kv_heads * head_size
+    kv_dim = 2 * num_key_value_heads * head_size
     return kv_dim
 
 
@@ -53,7 +50,6 @@ def raise_unknown_subblock_config_error(subblock_config: Any) -> None:
     """Raise an error for invalid subblock configuration types.
 
     TODO: Consider a better place for this function.
-
     Args:
         subblock_config: The invalid subblock configuration object.
 
@@ -69,7 +65,6 @@ def sizeof_dtype(dtype: torch.dtype) -> int | float:
     """Return the size in bytes of the given data type.
 
     TODO: Consider a better place for this function.
-
     Args:
         dtype: PyTorch data type or custom type string (e.g., 'nvfp4').
 
@@ -125,10 +120,10 @@ def solution_to_str(block_configs: list[dict[str, Any] | BlockConfig]) -> str:
 
 
 def block_config_to_str(block_config: BlockConfig | dict[str, Any] | None) -> str | None:
-    """Convert a BlockConfig to a human-readable string representation.
+    """
+    Convert a BlockConfig to a human-readable string representation.
 
     TODO: Consider a better place for this function.
-
     Args:
         block_config: BlockConfig dataclass or dict containing attention and ffn configs.
 
@@ -153,7 +148,6 @@ def subblock_config_to_str(
     """Convert a subblock config (FFN, Attention, Mamba, or MoE) to string.
 
     TODO: Consider a better place for this function.
-
     Args:
         subblock_config: FFNConfig, AttentionConfig dataclass or dict.
         subblock_name: Name of subblock ('ffn', 'attention', 'mamba', 'moe').
@@ -161,7 +155,7 @@ def subblock_config_to_str(
 
     Returns:
         Formatted string showing subblock type and key parameters (e.g., intermediate_size,
-        n_heads_in_group), or None if input is None.
+        num_key_value_heads), or None if input is None.
     """
     if subblock_config is None:
         return None
@@ -194,8 +188,8 @@ def subblock_config_to_str(
         intermediate_size = subblock_config["intermediate_size"]
         rep += f"  intermediate_{intermediate_size}".ljust(8)
     elif subblock_name == "attention":
-        n_heads_in_group = subblock_config["n_heads_in_group"]
-        rep += f"  gqa_{n_heads_in_group}".ljust(8)
+        num_key_value_heads = subblock_config["num_key_value_heads"]
+        rep += f"  kv_heads_{num_key_value_heads}".ljust(8)
     elif subblock_name == "mamba":
         mamba_num_heads = subblock_config["mamba"]["num_heads"]
         mamba_head_dim = subblock_config["mamba"]["head_dim"]
@@ -216,7 +210,8 @@ def subblock_config_to_str(
 
 class EmptyInitOnDevice(torch.overrides.TorchFunctionMode):
     def __init__(self, device=None, dtype=None):
-        """Create tensors with given device and dtype and don't run initialization
+        """
+        Create tensors with given device and dtype and don't run initialization
            (but instead use "empty tensors", i.e. uninitialized memory).
 
             device: `torch.device` to work with
@@ -225,8 +220,8 @@ def __init__(self, device=None, dtype=None):
         Example::
             with EmptyInitOnDevice("cuda", dtype=torch.bfloat16):
                 model = LLaMA(model_config)
-            model.load_state_dict(torch.load("llama-lit/7B/lit-llama.pth"))
-        """
+            model.load_state_dict(torch.load("llama-lit/7B/lit-llama.pth"))
+        """
         self.device = device
         self.dtype = dtype