
Commit 0340724

[Performance] [Utils] Refactor embeddings utils (#2080)
## Purpose ##

* Prerequisite for detecting if the lm head can be skipped (which is necessary to support large batch calibration)
* Breaking up the embeddings utils in this way makes it easier to implement `disable_lm_head` later

## Changes ##

* Generalize embedding utils
  * `_get_embeddings_or_warn` -> `get_embeddings`
  * Callers no longer have to use try-catch
  * Callers are responsible for warning
  * `untie_word_embeddings` is largely unchanged, slight clarity changes
* Update modifiers
  * `untie_if_target_shared_embedding(...)` -> `if targets_embeddings(...): untie_word_embeddings(...)`

## Testing ##

* Rename `test_model_shared_tensors` to `test_untie_word_embeddings`
* Add `test_targets_embeddings` to test targeting embeddings

---------

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 588d1e5 commit 0340724
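
The commit message above refers to a new `get_embeddings` helper that is not visible in this diff; the `from .transformers import *` re-export added to `llmcompressor/utils/__init__.py` below suggests it lives in `llmcompressor/utils/transformers.py`. A minimal sketch of the described behavior (the helper no longer warns internally, and callers decide whether a lookup failure deserves a warning) might look like the following. The body, return convention, and the `maybe_untie` caller are assumptions for illustration, not the committed implementation.

```python
# Hypothetical sketch only: the committed get_embeddings is not shown in this diff
# and may differ in signature and failure behavior.
from typing import Optional

import torch
from loguru import logger


def get_embeddings(
    model: torch.nn.Module,
) -> tuple[Optional[torch.nn.Module], Optional[torch.nn.Module]]:
    """Best-effort lookup of (input_embeddings, output_embeddings); emits no warnings."""
    try:
        input_embeddings = model.get_input_embeddings()
        output_embeddings = model.get_output_embeddings()
    except (AttributeError, NotImplementedError):
        # swallow the lookup failure; the caller decides whether it is worth a warning
        return None, None

    return input_embeddings, output_embeddings


# Caller-side pattern described by the PR: no try-catch, the caller owns the warning
def maybe_untie(model: torch.nn.Module) -> None:
    input_embeddings, output_embeddings = get_embeddings(model)
    if input_embeddings is None or output_embeddings is None:
        logger.warning(f"Could not resolve embeddings for {model.__class__.__name__}")
        return
    # ... untie_word_embeddings(model) or similar would follow here
```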

11 files changed: +240, -218 lines


src/llmcompressor/entrypoints/utils.py

Lines changed: 1 addition & 1 deletion
@@ -29,12 +29,12 @@
 from llmcompressor.pytorch.model_load.helpers import parse_dtype
 from llmcompressor.transformers.compression.compressed_tensors_utils import (
     modify_save_pretrained,
-    untie_word_embeddings,
 )
 from llmcompressor.transformers.utils.helpers import (
     is_model_ct_quantized_from_path,
 )
 from llmcompressor.typing import Processor
+from llmcompressor.utils import untie_word_embeddings
 from llmcompressor.utils.fsdp.helpers import is_fsdp_model

src/llmcompressor/modifiers/autoround/base.py

Lines changed: 5 additions & 10 deletions
@@ -20,10 +20,8 @@
 from llmcompressor.modifiers import Modifier
 from llmcompressor.modifiers.quantization.calibration import apply_calibration_status
 from llmcompressor.modifiers.quantization.quantization import QuantizationMixin
-from llmcompressor.transformers.compression.compressed_tensors_utils import (
-    untie_if_target_shared_embedding,
-)
-from llmcompressor.utils.pytorch.module import get_no_split_params
+from llmcompressor.utils import targets_embeddings, untie_word_embeddings
+from llmcompressor.utils.pytorch import get_no_split_params

 __all__ = ["AutoRoundModifier"]
@@ -110,7 +108,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin):
     batch_size: int = 8

     # private variables
-    _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict)
     _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict)
     _q_input: Optional[torch.Tensor] = PrivateAttr(default=None)
@@ -125,10 +122,6 @@ def on_initialize(self, state: State, **kwargs) -> bool:
         QuantizationMixin.initialize_quantization(self, state.model)

         # prepare module names
-        self._module_names = {
-            m: name
-            for name, m in match_named_modules(state.model, self.targets, self.ignore)
-        }
         self._add_temporary_names(state.model)
         # freeze all model parameters
         for _, param in state.model.named_parameters():
@@ -143,7 +136,9 @@ def start_calibration(self, model: torch.nn.Module):

         :param model: model to prepare for calibration
         """
-        untie_if_target_shared_embedding(model, self._module_names.values())
+        targets = match_named_modules(model, self.targets, self.ignore)
+        if targets_embeddings(model, targets):
+            untie_word_embeddings(model)

         for _, module in match_named_modules(model, self.targets, self.ignore):
             # Note: No need to register observers for auto-round

src/llmcompressor/modifiers/quantization/quantization/mixin.py

Lines changed: 4 additions & 8 deletions
@@ -34,9 +34,7 @@
     reset_quantization_status,
 )
 from llmcompressor.modifiers.utils.hooks import HooksMixin
-from llmcompressor.transformers.compression.compressed_tensors_utils import (
-    untie_if_target_shared_embedding,
-)
+from llmcompressor.utils import targets_embeddings, untie_word_embeddings

 __all__ = ["QuantizationMixin"]
@@ -182,11 +180,9 @@ def start_calibration(self, model: torch.nn.Module):

         :param model: model to prepare for calibration
         """
-
-        matched_module_generator = (
-            x[1] for x in match_named_modules(model, self.resolved_targets, self.ignore)
-        )
-        untie_if_target_shared_embedding(model, matched_module_generator)
+        targets = match_named_modules(model, self.resolved_targets, self.ignore)
+        if targets_embeddings(model, targets):
+            untie_word_embeddings(model)

         for _, module in match_named_modules(model, self.resolved_targets, self.ignore):
             self._initialize_observers(module)

src/llmcompressor/modifiers/transform/quip/base.py

Lines changed: 17 additions & 12 deletions
@@ -12,9 +12,8 @@

 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modifiers import Modifier
-from llmcompressor.transformers.compression.compressed_tensors_utils import (
-    untie_if_target_shared_embedding,
-)
+from llmcompressor.typing import NamedModules
+from llmcompressor.utils import targets_embeddings, untie_word_embeddings

 __all__ = ["QuIPModifier"]
@@ -102,18 +101,13 @@ def on_initialize(self, state: State, **kwargs) -> bool:

     def on_start(self, state: State, event: Event, **kwargs):
         self.started_ = True
-
-        def matched_module_generator():
-            for scheme in self.transform_config.config_groups.values():
-                for arg in scheme.apply:
-                    gen = match_named_modules(state.model, arg.targets, arg.ignore)
-                    for _, module in gen:
-                        yield module
+        model = state.model

         # Untie embeddings if they will be targeted by transforms
-        untie_if_target_shared_embedding(state.model, matched_module_generator())
+        if targets_embeddings(model, self._get_targets(model)):
+            untie_word_embeddings(model)

-        apply_transform_config(state.model, self.transform_config)
+        apply_transform_config(model, self.transform_config)

     def on_event(self, state: State, event: Event, **kwargs):
         if event.type_ == EventType.CALIBRATION_EPOCH_START:
@@ -136,6 +130,17 @@ def on_finalize(self, state: State, **kwargs) -> bool:

         return True

+    def _get_targets(self, model: torch.nn.Module) -> NamedModules:
+        if not self.initialized_:
+            raise ValueError("Cannot get targets before modifier has been initialized")
+
+        return [
+            (name, module)
+            for scheme in self.transform_config.config_groups.values()
+            for arg in scheme.apply
+            for name, module in match_named_modules(model, arg.targets, arg.ignore)
+        ]
+
     def _create_config(self) -> TransformConfig:
         config_groups = dict()
         if "v" in self.rotations:

src/llmcompressor/modifiers/transform/spinquant/base.py

Lines changed: 20 additions & 8 deletions
@@ -16,9 +16,8 @@
 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modeling import center_embeddings, fuse_norm_linears
 from llmcompressor.modifiers import Modifier
-from llmcompressor.transformers.compression.compressed_tensors_utils import (
-    untie_word_embeddings,
-)
+from llmcompressor.typing import NamedModules
+from llmcompressor.utils import untie_word_embeddings

 from .mappings import SpinQuantMapping, infer_mapping_from_model
 from .norm_mappings import NormMapping, infer_norm_mapping_from_model
@@ -151,14 +150,16 @@ def on_initialize(self, state: State, **kwargs) -> bool:
     @torch.no_grad()
     def on_start(self, state: State, event: Event, **kwargs):
         self.started_ = True
+        model = state.model
+
+        # untie embeddings to avoid unintended effects of `_center_embeddings`
+        untie_word_embeddings(model)

-        # needed any time embeddings/lm_head is modified
-        untie_word_embeddings(state.model)
         # needs to happen after the model has been hooked to execute on the GPU
         # otherwise we're applying weight transforms on CPU
-        self._center_embeddings(state.model)
-        self._fuse_norms(state.model)
-        apply_transform_config(state.model, self.transform_config)
+        self._center_embeddings(model)
+        self._fuse_norms(model)
+        apply_transform_config(model, self.transform_config)

     def on_event(self, state: State, event: Event, **kwargs):
         if event.type_ == EventType.CALIBRATION_EPOCH_START:
@@ -181,6 +182,17 @@ def on_finalize(self, state: State, **kwargs) -> bool:

         return True

+    def _get_targets(self, model: torch.nn.Module) -> NamedModules:
+        if not self.initialized_:
+            raise ValueError("Cannot get targets before modifier has been initialized")
+
+        return [
+            (name, module)
+            for scheme in self.transform_config.config_groups.values()
+            for arg in scheme.apply
+            for name, module in match_named_modules(model, arg.targets, arg.ignore)
+        ]
+
     def _center_embeddings(self, model: PreTrainedModel):
         for _, embedding in match_named_modules(
             model, [self.mappings.embedding], warn_on_fail=True

src/llmcompressor/transformers/compression/compressed_tensors_utils.py

Lines changed: 1 addition & 118 deletions
@@ -1,6 +1,5 @@
 import os
 import weakref
-from collections.abc import Generator
 from functools import wraps
 from typing import Optional
@@ -9,9 +8,6 @@
 from compressed_tensors import (
     ModelCompressor,
     SparsityCompressionConfig,
-    delete_offload_parameter,
-    has_offloaded_params,
-    register_offload_parameter,
 )
 from compressed_tensors.config import CompressionFormat
 from loguru import logger
@@ -25,7 +21,7 @@
 from llmcompressor.transformers.utils import RECIPE_FILE_NAME
 from llmcompressor.transformers.utils.helpers import infer_recipe_from_model_path

-__all__ = ["modify_save_pretrained", "untie_word_embeddings"]
+__all__ = ["modify_save_pretrained"]


 def modify_save_pretrained(model: PreTrainedModel):
@@ -118,119 +114,6 @@ def save_pretrained_wrapper(
     model.save_pretrained = save_pretrained_compressed(model.save_pretrained)


-def untie_word_embeddings(model: PreTrainedModel):
-    """
-    Patches bug where HF transformers will fail to untie weights under specific
-    circumstances (https://github.com/huggingface/transformers/issues/33689).
-
-    This function detects those cases and unties the tensors if applicable
-
-    :param model: model to fix
-    """
-    try:
-        input_embed = model.get_input_embeddings()
-        output_embed = model.get_output_embeddings()
-    except NotImplementedError as e:
-        logger.warning(
-            f"cannot untie model of type {model.__class__} which doesn't have "
-            f"get_input_embeddings and get_output_embeddings implmented\n{e}"
-        )
-        return
-
-    for module in (input_embed, output_embed):
-        if module is None or not hasattr(module, "weight"):
-            logger.warning(f"Cannot untie {module} which does not have weight param")
-            continue
-
-        # this could be replaced by a `get_offloaded_parameter` util
-        if not has_offloaded_params(module):
-            untied_data = module.weight.data.clone()
-        else:
-            untied_data = module._hf_hook.weights_map["weight"].clone()
-
-        requires_grad = module.weight.requires_grad
-        new_parameter = torch.nn.Parameter(untied_data, requires_grad=requires_grad)
-        delete_offload_parameter(module, "weight")
-        register_offload_parameter(module, "weight", new_parameter)
-
-    if hasattr(model.config, "tie_word_embeddings"):
-        model.config.tie_word_embeddings = False
-
-
-def _get_embeddings_or_warn(
-    model: torch.nn.Module,
-) -> tuple[torch.nn.Module | None, torch.nn.Module | None]:
-    if not (
-        hasattr(model, "get_input_embeddings")
-        and hasattr(model, "get_output_embeddings")
-    ):
-        logger.warning(
-            f"{model.__class__} doesn't have attribute get_input_embeddings and"
-            " get_output_embeddings implemented."
-            "\nThis can cause"
-            " problems when quantizing layers with shared weights"
-        )
-        return None, None
-
-    try:
-        input_embeddings, output_embeddings = (
-            model.get_input_embeddings(),
-            model.get_output_embeddings(),
-        )
-    except NotImplementedError as e:
-        logger.warning(
-            f"{model.__class__} doesn't have get_input_embeddings and "
-            "get_output_embeddings implemented."
-            "\nThis can cause"
-            " problems when quantizing layers with shared weights"
-            f"\n{e}"
-        )
-        return None, None
-
-    if not (
-        isinstance(input_embeddings, torch.nn.Module)
-        and isinstance(output_embeddings, torch.nn.Module)
-    ):
-        logger.warning(
-            f"expected modules from {model.__class__} get_input_embeddings and"
-            f" get_output_embeddings but got {type(input_embeddings)}"
-            f" and {type(output_embeddings)}."
-            "\nThis can cause"
-            " problems when quantizing layers with shared weights"
-        )
-        return None, None
-    return input_embeddings, output_embeddings
-
-
-def untie_if_target_shared_embedding(
-    model: torch.nn.Module, matched_module_generator: Generator[torch.nn.Module]
-):
-    """
-    Helper method that checks for shared input/output embedding and unties them
-    if either shows up in the matched_module_generator
-
-    :param model: model to untie if embeddings are shared and targeted by
-        matched_module_generator
-    :param matched_module_generator: Generator of all modules (not names) which
-        will be modified by quantization or transformation
-    """
-    input_embeddings, output_embeddings = _get_embeddings_or_warn(model)
-
-    if None in (input_embeddings, output_embeddings):  # if couldn't find embeddings
-        return
-
-    if (
-        input_embeddings.weight is not output_embeddings.weight
-    ):  # if not shared, can ignore
-        return
-
-    # if shared, check if either is targeted
-    for module in matched_module_generator:
-        if module in (input_embeddings, output_embeddings):
-            untie_word_embeddings(model)
-            return
-
-
 def get_model_compressor(
     model: torch.nn.Module,
     sparsity_config: Optional[SparsityCompressionConfig] = None,
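
The `targets_embeddings(model, targets)` check that the modifiers now call is added elsewhere in this PR (under `llmcompressor/utils/`) and does not appear in this file's diff. A minimal sketch, reconstructed from the deleted `untie_if_target_shared_embedding` above and assuming it consumes the `(name, module)` pairs returned by `match_named_modules`, could look like this; it is an illustration, not the committed code.

```python
# Hypothetical sketch: mirrors the deleted untie_if_target_shared_embedding logic,
# but returns a bool so the caller decides whether to call untie_word_embeddings(model).
from typing import Iterable

import torch


def targets_embeddings(
    model: torch.nn.Module, targets: Iterable[tuple[str, torch.nn.Module]]
) -> bool:
    """Return True if the model's tied input/output embeddings are among the targets."""
    # assumes a HF-style model exposing get_input_embeddings/get_output_embeddings
    input_embeddings = model.get_input_embeddings()
    output_embeddings = model.get_output_embeddings()

    # only shared (tied) embeddings need to be untied before calibration
    if input_embeddings is None or output_embeddings is None:
        return False
    if input_embeddings.weight is not output_embeddings.weight:
        return False

    return any(module in (input_embeddings, output_embeddings) for _, module in targets)
```

Callers then keep the two-line pattern shown in the modifier diffs above: `if targets_embeddings(model, targets): untie_word_embeddings(model)`.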

src/llmcompressor/typing.py

Lines changed: 9 additions & 5 deletions
@@ -2,8 +2,9 @@
 Defines type aliases for the llm-compressor library.
 """

-from typing import Union
+from typing import Iterable

+import torch
 from datasets import Dataset, DatasetDict, IterableDataset
 from transformers import (
     BaseImageProcessor,
@@ -13,9 +14,12 @@
 )

 # Tokenizer or Processor. Processors do not inherit from a unified base class
-Processor = Union[
-    PreTrainedTokenizer, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin
-]
+Processor = (
+    PreTrainedTokenizer | BaseImageProcessor | FeatureExtractionMixin | ProcessorMixin
+)

 # Supported dataset types, IterableDataset is a streamed dataset
-DatasetType = Union[Dataset, DatasetDict, IterableDataset]
+DatasetType = Dataset | DatasetDict | IterableDataset
+
+# Torch types
+NamedModules = Iterable[tuple[str, torch.nn.Module]]
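
The new `NamedModules` alias names the `(name, module)` pairs yielded by `match_named_modules` and `torch.nn.Module.named_modules()`, which the `_get_targets` helpers above return. A small illustrative usage, with a made-up helper name:

```python
# Illustrative only: count Linear layers in an iterable of (name, module) pairs.
import torch

from llmcompressor.typing import NamedModules  # alias added in this commit


def count_linear(modules: NamedModules) -> int:
    return sum(isinstance(module, torch.nn.Linear) for _, module in modules)


model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())
assert count_linear(model.named_modules()) == 1
```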

src/llmcompressor/utils/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -4,5 +4,6 @@

 # ruff: noqa

+from .transformers import *
 from .dev import *
 from .helpers import *
