Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
32ed415
init vllm
yuhao-zh Oct 20, 2025
15be623
update
Alizen-1009 Oct 20, 2025
544a7a2
rebase
yuhao-zh Oct 21, 2025
5696936
add support without model shard (PP)
Alizen-1009 Oct 21, 2025
9878bb9
up date kvcache
Alizen-1009 Oct 21, 2025
7cafb41
link kv_cache_manager & model_runner
Alizen-1009 Oct 21, 2025
7873e58
add kvcache config
Alizen-1009 Oct 21, 2025
4dfd951
pre commit
yuhao-zh Oct 22, 2025
27950ce
update
yuhao-zh Oct 22, 2025
672da47
update
yuhao-zh Oct 22, 2025
687891c
update
yuhao-zh Oct 22, 2025
f4b3bde
run success but response error
yuhao-zh Oct 24, 2025
5b6bc79
update
Oct 27, 2025
57eb1c7
update
Oct 30, 2025
a1546af
Merge branch 'main' into featrue/add_vllm_support
Oct 31, 2025
45106a1
Merge remote-tracking branch 'origin/main' into featrue/add_vllm_support
Oct 31, 2025
35e5baf
update
Oct 31, 2025
508cea4
update args
yuhao-zh Nov 6, 2025
9db035c
success run
yuhao-zh Nov 6, 2025
f98578c
Merge branch 'main' into featrue/add_vllm_support
yuhao-zh Nov 6, 2025
fb728d8
update model path
yuhao-zh Nov 6, 2025
738ff21
Merge branch 'main' into featrue/add_vllm_support
yuhao-zh Nov 6, 2025
717f69f
test PP with mac
yuhao-zh Nov 6, 2025
9bfa201
test pass
yuhao-zh Nov 11, 2025
dfbc0c1
pre-commit
yuhao-zh Nov 11, 2025
6e79dfe
Merge branch 'main' into featrue/add_vllm_support
yuhao-zh Nov 11, 2025
d233d34
update
yuhao-zh Nov 11, 2025
191f218
update log and pyproject
yuhao-zh Nov 12, 2025
39ed7bd
Merge branch 'main' into featrue/add_vllm_support
yuhao-zh Nov 12, 2025
3c67c0f
add weight load fiter
yuhao-zh Nov 12, 2025
efc5a0d
pre-commit
yuhao-zh Nov 12, 2025
c5edfbc
Merge branch 'main' into featrue/add_vllm_support
yuhao-zh Nov 12, 2025
c56b7bd
update args name
yuhao-zh Nov 12, 2025
41ebfec
update load weights
Nov 12, 2025
ffeb18f
fix bug done
yuhao-zh Nov 14, 2025
05e5846
pre-commit
yuhao-zh Nov 14, 2025
f4eb7e2
refactor code
yuhao-zh Nov 16, 2025
4f3de54
rm code
yuhao-zh Nov 16, 2025
07ecac6
refactor executor
yuhao-zh Nov 17, 2025
16d9e57
pre-commit
yuhao-zh Nov 17, 2025
64db1c1
fix single gpu bug
yuhao-zh Nov 17, 2025
a964ba1
Merge branch 'main' into featrue/add_vllm_support
yuhao-zh Nov 17, 2025
1c68d24
pre-commit
yuhao-zh Nov 17, 2025
4734589
Merge branch 'main' into featrue/add_vllm_support
yuhao-zh Nov 17, 2025
7058c4c
rm useless args
yuhao-zh Nov 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,15 @@ mac = [
]

gpu = [
"sglang[all]==0.5.4.post1",
"mlx-lm==0.28.0",
"mlx[cpu]==0.29.1",
]

vllm = [
"vllm==0.11.0",
"mlx-lm==0.28.0",
"mlx[cpu]==0.29.1",
"sglang[all]==0.5.4.post1",
]

benchmark = [
Expand Down
435 changes: 311 additions & 124 deletions src/parallax/server/executor.py

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions src/parallax/server/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,14 @@ def parse_args() -> argparse.Namespace:

parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")

parser.add_argument(
"--gpu-backend",
type=str,
default="sglang",
choices=["sglang", "vllm"],
help="GPU backend to use",
)

parser.add_argument(
"--use-hfcache",
action="store_true",
Expand Down
2 changes: 1 addition & 1 deletion src/parallax/sglang/batch_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def form_sgl_batch_decode(
return forward_batch


def release_cuda_request(running_batch: ScheduleBatch, request_id: str):
def release_sglang_request(running_batch: ScheduleBatch, request_id: str):
"""Release KV Cache and other resources for finished/aborted requests."""
if running_batch is None or running_batch.is_empty():
return
Expand Down
22 changes: 13 additions & 9 deletions src/parallax/sglang/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import logging
import os
import random

import sglang
import sglang.srt.distributed.parallel_state
Expand Down Expand Up @@ -221,17 +222,15 @@ def form_sgl_server_args(


def initialize_sgl_model_runner(
original_model_path: str,
model_repo: str,
start_layer: int,
end_layer: int,
kv_cache_memory_fraction: float,
attention_backend: str,
kv_block_size: int,
moe_runner_backend: str,
tp_rank: int,
tp_size: int,
nccl_port: int,
use_hfcache: bool = False,
max_num_tokens_per_batch: int = 1024,
**kwargs,
):
"""
Creates a SGL ModelRunner object.
Expand All @@ -242,23 +241,28 @@ def initialize_sgl_model_runner(
"""
apply_parallax_sglang_monkey_patch()

# Extract TP-related parameters from kwargs or use defaults
tp_rank = kwargs.get("tp_rank", 0)
tp_size = kwargs.get("tp_size", 1)
use_hfcache = kwargs.get("use_hfcache", False)
nccl_port = kwargs.get("nccl_port", None)
# Use selective download for GPU models to save bandwidth and disk space
from parallax.utils.selective_download import get_model_path_with_selective_download

logger.info(
f"Downloading model with selective weight files for layers [{start_layer}, {end_layer})"
)
model_path = get_model_path_with_selective_download(
original_model_path,
start_layer=start_layer,
end_layer=end_layer,
local_files_only=use_hfcache,
model_repo, start_layer=start_layer, end_layer=end_layer, local_files_only=use_hfcache
)

config = load_config(model_path)
tokenizer = load_tokenizer(model_path, eos_token_ids=config.get("eos_token_id", None))
dtype = config.get("torch_dtype", "bfloat16")

if nccl_port is None:
nccl_port = random.randint(4000, 5000)

# Handling mxfp4 arguments
quant_method = config.get("quant_method", None)
quantization_config = config.get("quantization_config", None)
Expand Down
Loading