Skip to content

Commit 927de72

Browse files
Auto module tree (#2204)
* add _auto_detect_module_tree * cleanup * add test_auto_detect_module_tree * cleanup * fix from_quantized * use warn level * mod log * cleanup * pass quant_method * cleanup
1 parent ec70ee6 commit 927de72

File tree

3 files changed

+150
-9
lines changed

3 files changed

+150
-9
lines changed

gptqmodel/models/auto.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -271,12 +271,15 @@ def _is_supported_quantization_config(config: AutoConfig) -> bool:
271271
return False
272272

273273

274-
def check_and_get_model_definition(model_dir, trust_remote_code=False):
    """Resolve the model class to use for the checkpoint at `model_dir`.

    Loads the HF config to read `model_type`. Known architectures map to
    their dedicated class via MODEL_MAP; anything else falls back to
    BaseQModel, whose auto-detection will generate the module tree.
    """
    config = AutoConfig.from_pretrained(model_dir, trust_remote_code=trust_remote_code)
    model_type = config.model_type.lower()

    if model_type in SUPPORTED_MODELS:
        return MODEL_MAP[model_type]

    # Unknown model_type: use BaseQModel, which will run
    # _auto_detect_module_tree to generate the module tree.
    return BaseQModel
280283

281284
class GPTQModel:
282285
def __init__(self):
@@ -372,8 +375,9 @@ def from_pretrained(
372375
log.warn(
373376
"GPTQModel's per-module `dynamic` quantization feature is fully supported in latest vLLM and SGLang but not yet available in hf transformers.")
374377

375-
model_type = check_and_get_model_type(model_id_or_path, trust_remote_code)
376-
return MODEL_MAP[model_type].from_pretrained(
378+
model_definition = check_and_get_model_definition(model_id_or_path, trust_remote_code)
379+
380+
return model_definition.from_pretrained(
377381
pretrained_model_id_or_path=model_id_or_path,
378382
quantize_config=quantize_config,
379383
trust_remote_code=trust_remote_code,
@@ -395,12 +399,12 @@ def from_quantized(
395399
adapter = normalize_adapter(adapter)
396400

397401
print(f"from_quantized: adapter: {adapter}")
398-
model_type = check_and_get_model_type(model_id_or_path, trust_remote_code)
402+
model_definition = check_and_get_model_definition(model_id_or_path, trust_remote_code)
399403

400404
if isinstance(backend, str):
401405
backend = BACKEND(backend)
402406

403-
return MODEL_MAP[model_type].from_quantized(
407+
return model_definition.from_quantized(
404408
model_id_or_path=model_id_or_path,
405409
device_map=device_map,
406410
device=device,

gptqmodel/models/base.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,14 @@ def __init__(
236236
# setting cls.module_tree
237237
type(self).module_tree = apply_module_tree_override(self.module_tree, self.module_tree_overrides[quant_method])
238238

239+
if type(self).module_tree is None:
240+
type(self).module_tree = self._auto_detect_module_tree(model, quant_method)
241+
242+
# If module_tree is still None after auto-detection, raise an error indicating unsupported model type
243+
if type(self).module_tree is None:
244+
raise ValueError(f"Unsupport model_type {model.config.model_type}, and failed to auto-detect module tree for model {model}")
245+
246+
239247
# record configuration early so model lifecycle hooks can rely on them
240248
self.compiled = False # set to True while compile() is triggered successfully
241249
self.quantized = quantized
@@ -1657,6 +1665,91 @@ def __getattr__(self, item):
16571665
return getattr(model, item)
16581666
raise exc
16591667

1668+
def _auto_detect_module_tree(self, model: PreTrainedModel, quant_method: METHOD):
    """Best-effort fallback that builds a quantization module tree for an
    architecture without an explicit MODEL_MAP entry.

    Strategy: probe a fixed list of well-known layer-container paths
    (HF decoder conventions), inspect the first layer for Linear/Conv
    leaf modules, and group them under attention/MLP-like parent
    submodules.

    Returns the module-tree list (container path parts + "#" +
    {parent_name: leaf_name_tuple}), or None when detection fails
    (non-GPTQ method, no layer container found, or no quantizable leaves).
    """
    log.warn("Model not yet supported, attempting Module Tree AutoCompat...")

    # Auto-detection is only validated for GPTQ quantization.
    if quant_method != METHOD.GPTQ:
        log.warn(f"Module Tree AutoCompat: Failed, quant_method={quant_method}, only support GPTQ")
        return None

    def _resolve(path):
        # Walk a dotted attribute path from the model root;
        # None as soon as any hop is missing.
        node = model
        for attr in path.split("."):
            node = getattr(node, attr, None)
            if node is None:
                return None
        return node

    # Common locations of the repeating transformer-layer container.
    candidates = [
        "model.layers",
        "language_model.layers",
        "model.decoder.layers",
        "transformer.h",
        "transformer.blocks",
        "layers",
        "blocks",
        "model.blocks",
    ]

    chosen = None
    layers = None
    for candidate in candidates:
        container = _resolve(candidate)
        # Accept only a non-empty sequence of nn.Module layers.
        if isinstance(container, (nn.ModuleList, list, tuple)) and len(container) > 0 and isinstance(container[0], nn.Module):
            chosen = candidate
            layers = container
            log.warn(f"Module Tree AutoCompat: Matched candidate path '{candidate}', type={type(container).__name__}")
            break

    if chosen is None:
        log.warn("Module Tree AutoCompat: All candidate paths invalid, return None")
        return None

    # Layers are assumed structurally identical; inspect only the first.
    # (Reuse the container resolved above instead of re-walking the path.)
    layer0 = layers[0]
    log.warn(f"Module Tree AutoCompat: Using layer0: {type(layer0).__name__}")

    def _linear_names(module):
        # Dotted names of all quantizable (Linear/Conv) leaves in `module`.
        mods = find_modules(module, layers=[nn.Linear, nn.Conv1d, nn.Conv2d])
        log.warn(f"Module Tree AutoCompat: _linear_names: found {len(mods)} Linear/Conv modules in {type(module).__name__}")
        return list(mods.keys())

    all_linear = _linear_names(layer0)
    if all_linear:
        log.warn(f"Module Tree AutoCompat: found {len(all_linear)} Linear/Conv modules in {type(layer0).__name__}: {all_linear}")
    else:
        log.warn("Module Tree AutoCompat: No Linear/Conv names in layer0, return None")
        return None

    def _find_parents(module, keywords):
        # Direct children whose lowercased name contains any keyword.
        return {
            name
            for name, _ in module.named_children()
            if any(k in name.lower() for k in keywords)
        }

    def _leaf_tokens(prefix):
        # Leaf module names (last dotted segment) living under `prefix`.
        return tuple(x.split(".")[-1] for x in all_linear if x.startswith(f"{prefix}."))

    possible_parent = ["attn", "attention", "self_attn", "mlp", "ffn", "feed", "dense"]

    mapping = {}
    # sorted() makes the generated tree deterministic across runs
    # (set iteration order is not).
    for parent in sorted(_find_parents(layer0, possible_parent)):
        tokens = _leaf_tokens(parent)
        if tokens:
            mapping[parent] = tokens

    if not mapping:
        # No recognizable parent groups: flatten every Linear leaf at the layer root.
        blocks = tuple(n.split(".")[-1] for n in all_linear)
        mapping[""] = blocks
        log.warn(f"Module Tree AutoCompat: Mapping empty, using all Linear as fallback: {blocks}")

    tree = chosen.split(".") + ["#", mapping]
    log.warn(f"Module Tree AutoCompat: Final module_tree: {tree}")
    return tree
1752+
16601753
__all__ = ["BaseQModel"]
16611754

16621755
BaseQModel = ModelLoader(ModelWriter(BaseQModel))
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import unittest
2+
import torch.nn as nn
3+
4+
from gptqmodel.models.base import BaseQModel
5+
6+
7+
class DummyAttention(nn.Module):
    """Stub attention module exposing q/k projection Linears for the detector tests."""

    def __init__(self):
        super().__init__()
        # Registered in the same order as before so RNG consumption is unchanged.
        for proj_name in ("q_proj", "k_proj"):
            setattr(self, proj_name, nn.Linear(4, 4))
12+
13+
14+
class DummyMLP(nn.Module):
    """Stub feed-forward module with two Linear layers for the detector tests."""

    def __init__(self):
        super().__init__()
        # Registered in the same order as before so RNG consumption is unchanged.
        for fc_name in ("fc1", "fc2"):
            setattr(self, fc_name, nn.Linear(4, 4))
19+
20+
21+
class DummyBlock(nn.Module):
    """One toy decoder layer: attention + MLP children, matching the
    parent-name keywords the auto-detector looks for."""

    def __init__(self):
        super().__init__()
        attn = DummyAttention()
        ffn = DummyMLP()
        self.self_attn = attn
        self.mlp = ffn
26+
27+
28+
class DummyModel:
    """Minimal model stand-in: exposes only a top-level `layers` ModuleList,
    which is one of the candidate container paths probed by auto-detection."""

    def __init__(self):
        blocks = [DummyBlock()]
        self.layers = nn.ModuleList(blocks)
31+
32+
33+
class TestAutoDetectModuleTree(unittest.TestCase):
    """Exercise BaseQModel._auto_detect_module_tree against a toy decoder stack."""

    def test_layers_with_parents(self):
        model = DummyModel()
        # Bypass __init__ (which expects a real model/checkpoint) and invoke
        # the detector directly on a bare instance.
        base = BaseQModel.__new__(BaseQModel)
        # NOTE(review): passes the string "gptq" where the signature expects a
        # METHOD enum — presumably METHOD is a str-enum; confirm.
        tree = base._auto_detect_module_tree(model, quant_method="gptq")

        self.assertEqual(tree[0], "layers")
        self.assertEqual(tree[1], "#")

        mapping = tree[2]
        expected = (
            ("self_attn", {"q_proj", "k_proj"}),
            ("mlp", {"fc1", "fc2"}),
        )
        for parent, leaves in expected:
            self.assertIn(parent, mapping)
            self.assertSetEqual(set(mapping[parent]), leaves)

0 commit comments

Comments
 (0)