|
26 | 26 | from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheGroupSpec, KVCacheTensor |
27 | 27 | from vllm.v1.worker.gpu_model_runner import GPUModelRunner |
28 | 28 |
|
29 | | -from parallax.utils.tokenizer_utils import load_tokenizer |
30 | | -from parallax_utils.logging_config import get_logger |
31 | 29 | from parallax.sglang.monkey_patch_utils.weight_loader_filter import ( |
32 | 30 | apply_weight_loader_filter_patch, |
33 | 31 | set_layer_range_for_filtering, |
34 | 32 | ) |
| 33 | +from parallax.utils.tokenizer_utils import load_tokenizer |
35 | 34 | from parallax.vllm.monkey_patch import apply_parallax_vllm_monkey_patch |
| 35 | +from parallax_utils.logging_config import get_logger |
36 | 36 |
|
37 | 37 | logger = get_logger(__name__) |
38 | 38 |
|
@@ -200,7 +200,9 @@ def _create_kv_cache_config(self, kv_cache_memory_fraction: float = None) -> KVC |
200 | 200 | model_dtype = self.vllm_config.model_config.dtype |
201 | 201 | if isinstance(model_dtype, str): |
202 | 202 | try: |
203 | | - from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE # type: ignore |
| 203 | + from vllm.utils.torch_utils import ( |
| 204 | + STR_DTYPE_TO_TORCH_DTYPE, # type: ignore |
| 205 | + ) |
204 | 206 | except Exception: |
205 | 207 | # Older/newer vLLM versions may not expose torch_utils. |
206 | 208 | # Fall back silently and default to float16. |
@@ -349,16 +351,14 @@ def initialize_vllm_model_runner( |
349 | 351 | num_hidden_layers = getattr(config, "num_hidden_layers", 28) |
350 | 352 | is_first_peer = start_layer == 0 |
351 | 353 | is_last_peer = end_layer == num_hidden_layers |
352 | | - |
| 354 | + |
353 | 355 | # Apply Parallax vLLM monkey patches for pipeline parallelism |
354 | 356 | try: |
355 | 357 | apply_parallax_vllm_monkey_patch(is_last_stage=is_last_peer) |
356 | | - logger.debug( |
357 | | - f"Applied Parallax vLLM monkey patches: is_last_stage={is_last_peer}" |
358 | | - ) |
| 358 | + logger.debug(f"Applied Parallax vLLM monkey patches: is_last_stage={is_last_peer}") |
359 | 359 | except Exception as e: |
360 | 360 | logger.warning("Failed to apply Parallax vLLM monkey patches: %s", e) |
361 | | - |
| 361 | + |
362 | 362 | # Apply layer-range-based weight file filtering before any model load. |
363 | 363 | # Reuse the generic monkey patch used by the sglang implementation to reduce |
364 | 364 | # local weight file reads when loading a partial layer shard. |
|
0 commit comments