pre-commit

yuhao-zh · yuhao-zh · commit efc5a0d885c7 · 2025-11-12T13:04:50.000+08:00
diff --git a/src/parallax/vllm/model_runner.py b/src/parallax/vllm/model_runner.py
@@ -26,13 +26,13 @@
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheGroupSpec, KVCacheTensor
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
-from parallax.utils.tokenizer_utils import load_tokenizer
-from parallax_utils.logging_config import get_logger
 from parallax.sglang.monkey_patch_utils.weight_loader_filter import (
     apply_weight_loader_filter_patch,
     set_layer_range_for_filtering,
 )
+from parallax.utils.tokenizer_utils import load_tokenizer
 from parallax.vllm.monkey_patch import apply_parallax_vllm_monkey_patch
+from parallax_utils.logging_config import get_logger
 
 logger = get_logger(__name__)
 
@@ -200,7 +200,9 @@ def _create_kv_cache_config(self, kv_cache_memory_fraction: float = None) -> KVC
             model_dtype = self.vllm_config.model_config.dtype
             if isinstance(model_dtype, str):
                 try:
-                    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE  # type: ignore
+                    from vllm.utils.torch_utils import (
+                        STR_DTYPE_TO_TORCH_DTYPE,  # type: ignore
+                    )
                 except Exception:
                     # Older/newer vLLM versions may not expose torch_utils.
                     # Fall back silently and default to float16.
@@ -349,16 +351,14 @@ def initialize_vllm_model_runner(
     num_hidden_layers = getattr(config, "num_hidden_layers", 28)
     is_first_peer = start_layer == 0
     is_last_peer = end_layer == num_hidden_layers
-    
+
     # Apply Parallax vLLM monkey patches for pipeline parallelism
     try:
         apply_parallax_vllm_monkey_patch(is_last_stage=is_last_peer)
-        logger.debug(
-            f"Applied Parallax vLLM monkey patches: is_last_stage={is_last_peer}"
-        )
+        logger.debug(f"Applied Parallax vLLM monkey patches: is_last_stage={is_last_peer}")
     except Exception as e:
         logger.warning("Failed to apply Parallax vLLM monkey patches: %s", e)
-    
+
     # Apply layer-range-based weight file filtering before any model load.
     # Reuse the generic monkey patch used by sglang implementation to reduce
     # local weight file reads when loading a partial layer shard.
diff --git a/src/parallax/vllm/monkey_patch.py b/src/parallax/vllm/monkey_patch.py
@@ -17,11 +17,10 @@
 def apply_parallax_vllm_monkey_patch(is_last_stage: bool = True):
     """
     Apply all Parallax monkey patches for vLLM.
-    
+
     Args:
         is_last_stage: Whether this is the last pipeline stage. This affects
                       whether lm_head weights are expected to be loaded.
     """
     set_vllm_pipeline_stage(is_last_stage)
     apply_vllm_weight_loader_patch()
-
diff --git a/src/parallax/vllm/monkey_patch_utils/weight_loader.py b/src/parallax/vllm/monkey_patch_utils/weight_loader.py
@@ -2,6 +2,7 @@
 Monkey patch for vLLM weight loading to skip lm_head weights on non-last pipeline stages.
 This is similar to the approach used in sglang monkey patches.
 """
+
 import logging
 from typing import Any
 
@@ -22,27 +23,27 @@ def apply_vllm_weight_loader_patch():
     """
     Apply monkey patch to vLLM's default loader to skip lm_head initialization check
     when not on the last pipeline stage.
-    
+
     This patch intercepts ValueError exceptions during weight loading and checks if they
     are related to lm_head.weight not being initialized. If this occurs on a non-last
     pipeline stage, the error is suppressed as expected behavior. Otherwise, the error
     is re-raised.
     """
     global _vllm_patch_applied
-    
+
     if _vllm_patch_applied:
         logger.debug("vLLM weight loader patch already applied, skipping")
         return
-    
+
     try:
         from vllm.model_executor.model_loader import default_loader
-        
+
         original_load_weights = default_loader.DefaultModelLoader.load_weights
-        
+
         def patched_load_weights(self, model: Any, model_config: Any):
             """Patched load_weights that handles lm_head for pipeline parallelism."""
             global _is_last_stage
-            
+
             try:
                 # Call original load_weights
                 original_load_weights(self, model, model_config)
@@ -65,15 +66,14 @@ def patched_load_weights(self, model: Any, model_config: Any):
                 else:
                     # Different error, re-raise
                     raise
-        
+
         # Apply the patch
         default_loader.DefaultModelLoader.load_weights = patched_load_weights
         _vllm_patch_applied = True
         logger.info("Successfully applied vLLM weight loader patch for pipeline parallelism")
-        
+
     except ImportError as e:
         logger.warning(f"Could not apply vLLM weight loader patch: {e}")
     except Exception as e:
         logger.error(f"Error applying vLLM weight loader patch: {e}")
         raise
-