Commit 41ebfec

Author: yuhao-zh
Commit message: update load weights
1 parent: c56b7bd

File tree: 3 files changed (+39, -28 lines)


src/parallax/vllm/model_runner.py

Lines changed: 5 additions & 4 deletions
@@ -315,6 +315,7 @@ def custom_get_pp_indices(num_layers: int, rank: int, world_size: int):
                 f"Successfully loaded {self.num_shard_layers} layers "
                 f"[{self.start_layer}:{self.end_layer}]"
             )
+
     finally:
         vllm.distributed.utils.get_pp_indices = original_get_pp_indices

@@ -347,15 +348,15 @@ def initialize_vllm_model_runner(
     config = load_config(model_path)
     tokenizer = load_tokenizer(model_path, eos_token_ids=config.get("eos_token_id", None))
     dtype = config.get("torch_dtype", "bfloat16")
-
-    num_hidden_layers = getattr(config, "num_hidden_layers", 28)
+
+    num_hidden_layers = config.get("num_hidden_layers")
     is_first_peer = start_layer == 0
     is_last_peer = end_layer == num_hidden_layers

     # Apply Parallax vLLM monkey patches for pipeline parallelism
     try:
-        apply_parallax_vllm_monkey_patch(is_last_stage=is_last_peer)
-        logger.debug(f"Applied Parallax vLLM monkey patches: is_last_stage={is_last_peer}")
+        apply_parallax_vllm_monkey_patch(is_first_stage=is_first_peer, is_last_stage=is_last_peer)
+        logger.debug(f"Applied Parallax vLLM monkey patches: is_first_stage={is_first_peer}, is_last_stage={is_last_peer}")
     except Exception as e:
         logger.warning("Failed to apply Parallax vLLM monkey patches: %s", e)
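Note on the num_hidden_layers fix: load_config evidently returns a plain dict here (the surrounding lines already call config.get("eos_token_id", ...) and config.get("torch_dtype", ...)), so getattr never found the key and always returned the fallback of 28. A minimal sketch of the failure mode, with a hypothetical 36-layer config:

    config = {"num_hidden_layers": 36}        # load_config returns a dict, not an object
    getattr(config, "num_hidden_layers", 28)  # -> 28: dict keys are items, not attributes
    config.get("num_hidden_layers")           # -> 36: the fixed lookup

With the old code, is_last_peer = end_layer == num_hidden_layers could never be True for any model that does not have exactly 28 layers.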

src/parallax/vllm/monkey_patch.py

Lines changed: 3 additions & 2 deletions
@@ -14,13 +14,14 @@
 ## Here are patch functions for vLLM
 ## Hopefully, when vLLM supports pipeline parallelism natively in the way we need,
 ## we can remove these patches
-def apply_parallax_vllm_monkey_patch(is_last_stage: bool = True):
+def apply_parallax_vllm_monkey_patch(is_first_stage: bool, is_last_stage: bool):
     """
     Apply all Parallax monkey patches for vLLM.

     Args:
+        is_first_stage: Whether this is the first pipeline stage.
         is_last_stage: Whether this is the last pipeline stage. This affects
             whether lm_head weights are expected to be loaded.
     """
-    set_vllm_pipeline_stage(is_last_stage)
+    set_vllm_pipeline_stage(is_first_stage, is_last_stage)
     apply_vllm_weight_loader_patch()
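For context, a minimal sketch of how a caller now drives the two-flag entry point from a shard's layer range (the values are illustrative, and the import path assumes the src/ layout maps to the parallax package):

    from parallax.vllm.monkey_patch import apply_parallax_vllm_monkey_patch

    # Hypothetical middle shard of a 36-layer model
    start_layer, end_layer, num_hidden_layers = 12, 24, 36

    is_first_peer = start_layer == 0               # False: no embed_tokens weights expected
    is_last_peer = end_layer == num_hidden_layers  # False: no lm_head weights expected

    apply_parallax_vllm_monkey_patch(is_first_stage=is_first_peer, is_last_stage=is_last_peer)

Dropping the old is_last_stage=True default also makes call sites explicit: omitting either flag is now a TypeError rather than silently treating the stage as last.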

src/parallax/vllm/monkey_patch_utils/weight_loader.py

Lines changed: 31 additions & 22 deletions
@@ -1,5 +1,5 @@
 """
-Monkey patch for vLLM weight loading to skip lm_head weights on non-last pipeline stages.
+Monkey patch for vLLM weight loading to skip non-existent weights on different pipeline stages.
 This is similar to the approach used in sglang monkey patches.
 """

@@ -9,25 +9,25 @@
 logger = logging.getLogger(__name__)

 _vllm_patch_applied = False
+_is_first_stage = False  # Default to False
 _is_last_stage = True  # Default to True for safety


-def set_vllm_pipeline_stage(is_last_stage: bool):
-    """Set whether this is the last pipeline stage."""
-    global _is_last_stage
+def set_vllm_pipeline_stage(is_first_stage: bool, is_last_stage: bool):
+    """Set whether this is the first and/or last pipeline stage."""
+    global _is_first_stage, _is_last_stage
+    _is_first_stage = is_first_stage
     _is_last_stage = is_last_stage
-    logger.debug(f"Set vLLM pipeline stage: is_last_stage={is_last_stage}")
+    logger.debug(f"Set vLLM pipeline stage: is_first_stage={_is_first_stage}, is_last_stage={_is_last_stage}")


 def apply_vllm_weight_loader_patch():
     """
-    Apply monkey patch to vLLM's default loader to skip lm_head initialization check
-    when not on the last pipeline stage.
+    Apply monkey patch to vLLM's default loader to skip initialization checks
+    for weights that are not expected on certain pipeline stages.

-    This patch intercepts ValueError exceptions during weight loading and checks if they
-    are related to lm_head.weight not being initialized. If this occurs on a non-last
-    pipeline stage, the error is suppressed as expected behavior. Otherwise, the error
-    is re-raised.
+    - Skips `embed_tokens` check on non-first stages.
+    - Skips `lm_head` check on non-last stages.
     """
     global _vllm_patch_applied

@@ -41,28 +41,37 @@ def apply_vllm_weight_loader_patch():
     original_load_weights = default_loader.DefaultModelLoader.load_weights

     def patched_load_weights(self, model: Any, model_config: Any):
-        """Patched load_weights that handles lm_head for pipeline parallelism."""
-        global _is_last_stage
+        """Patched load_weights that handles embed_tokens and lm_head for pipeline parallelism."""
+        global _is_first_stage, _is_last_stage

         try:
             # Call original load_weights
             original_load_weights(self, model, model_config)
         except ValueError as e:
             error_msg = str(e)
-            # Check if this is the lm_head initialization error
-            if "lm_head.weight" in error_msg and "not initialized from checkpoint" in error_msg:
+            uninitialized_weights = "not initialized from checkpoint" in error_msg
+
+            # Case 1: embed_tokens.weight not found
+            if "model.embed_tokens.weight" in error_msg and uninitialized_weights:
+                if not _is_first_stage:
+                    # Expected behavior for non-first pipeline stages
+                    logger.info("Skipping embed_tokens.weight initialization check on non-first pipeline stage")
+                else:
+                    # This is the first stage, embed_tokens should be initialized
+                    logger.error("embed_tokens.weight not initialized on first pipeline stage, this is an error")
+                    raise
+
+            # Case 2: lm_head.weight not found
+            elif "lm_head.weight" in error_msg and uninitialized_weights:
                 if not _is_last_stage:
                     # Expected behavior for non-last pipeline stages
-                    logger.info(
-                        "Skipping lm_head.weight initialization check on non-last pipeline stage"
-                    )
-                    return
+                    logger.info("Skipping lm_head.weight initialization check on non-last pipeline stage")
                 else:
                     # This is the last stage, lm_head should be initialized
-                    logger.error(
-                        "lm_head.weight not initialized on last pipeline stage, this is an error"
-                    )
+                    logger.error("lm_head.weight not initialized on last pipeline stage, this is an error")
                     raise
+
+            # Case 3: Other errors
             else:
                 # Different error, re-raise
                 raise
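Two details worth noting. First, the old explicit return (old line 59) was redundant and is safely dropped: an except block that completes without re-raising already suppresses the exception. Second, the patch is a thin intercept-classify-suppress wrapper around the original loader. A self-contained toy illustrating the same pattern for a middle shard (fake_load_weights is hypothetical, not a vLLM API):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    _is_first_stage, _is_last_stage = False, False  # middle shard: neither first nor last

    def fake_load_weights():
        # Stand-in for DefaultModelLoader.load_weights failing on a missing embedding
        raise ValueError("model.embed_tokens.weight is not initialized from checkpoint")

    try:
        fake_load_weights()
    except ValueError as e:
        msg = str(e)
        if "model.embed_tokens.weight" in msg and "not initialized from checkpoint" in msg:
            if not _is_first_stage:
                # Expected on a non-first stage; falling off the handler suppresses the error
                logger.info("Skipping embed_tokens.weight check on non-first pipeline stage")
            else:
                raise
        else:
            raise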
