From 966d15aee14f6a12aa13132c4e2f9b1d32379e99 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Mon, 15 Jun 2026 20:16:32 +0545 Subject: [PATCH 1/2] [Docs]: PD Disaggregation gRPC workers --- mkdocs/docs/concepts/services.md | 50 ++++++++------- mkdocs/docs/examples/inference/sglang.md | 2 - mkdocs/docs/examples/inference/vllm.md | 78 ++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 24 deletions(-) diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md index 757546483..1f6dc5c4a 100644 --- a/mkdocs/docs/concepts/services.md +++ b/mkdocs/docs/concepts/services.md @@ -357,7 +357,6 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`: ```yaml type: service name: prefill-decode - image: lmsysorg/sglang:v0.5.10.post1 env: - HF_TOKEN @@ -365,62 +364,69 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`: replicas: - count: 1 - # For now replica group with router must have count: 1 + python: "3.12" commands: - pip install smg - | smg launch \ + --enable-igw \ + --pd-disaggregation \ + --model-path $MODEL_ID \ --host 0.0.0.0 \ --port 8000 \ - --pd-disaggregation \ --prefill-policy cache_aware - resources: - cpu: 4 router: type: sglang + resources: + cpu: 4 - - count: 1..4 + - count: 1..2 scaling: metric: rps - target: 3 + target: 300 + image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10 commands: - | - python -m sglang.launch_server \ + python3 -m sglang.launch_server \ --model-path $MODEL_ID \ + --host 0.0.0.0 \ + --port 8000 \ + --grpc-mode \ --disaggregation-mode prefill \ --disaggregation-transfer-backend nixl \ - --port 8000 \ --disaggregation-bootstrap-port 8998 resources: gpu: H200 - - count: 1..8 + - count: 1..4 scaling: metric: rps - target: 2 + target: 300 + image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10 commands: - | - python -m sglang.launch_server \ + python3 -m sglang.launch_server \ --model-path $MODEL_ID \ + --host 0.0.0.0 \ + --port 8000 \ + --grpc-mode \ --disaggregation-mode 
decode \ - --disaggregation-transfer-backend nixl \ - --port 8000 + --disaggregation-transfer-backend nixl resources: gpu: H200 port: 8000 - model: zai-org/GLM-4.5-Air-FP8 - - # Custom probe is required for PD disaggregation. - probes: - - type: http - url: /health - interval: 15s ``` - > With the `sglang` router, you can use SGLang prefill and decode workers. Support for vLLM and TensorRT-LLM workers is coming soon. + > With the `smg` router, workers communicate via gRPC as well as HTTP. + > + > On the router side, `--enable-igw` and `--model-path` are required for gRPC worker registration via an HTTP endpoint. This is how `dstack` registers workers with the SMG router. + > + > With SGLang gRPC workers, pass `--grpc-mode` to the worker launch command. To use [Mooncake Transfer](https://github.com/kvcache-ai/Mooncake), set `--disaggregation-transfer-backend mooncake`. For PD disaggregation with SGLang HTTP workers, see [SGLang PD Disaggregation](../examples/inference/sglang.md#pd-disaggregation). + > + > The SMG router supports only gRPC communication mode with vLLM workers. For PD disaggregation with vLLM, see [here](../examples/inference/vllm.md#pd-disaggregation). === "Dynamo" diff --git a/mkdocs/docs/examples/inference/sglang.md b/mkdocs/docs/examples/inference/sglang.md index 1ea9e6e06..7c8004f9d 100644 --- a/mkdocs/docs/examples/inference/sglang.md +++ b/mkdocs/docs/examples/inference/sglang.md @@ -211,8 +211,6 @@ To run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/ - > With the `sglang` router, you can use SGLang prefill and decode workers. Support for vLLM and TensorRT-LLM workers is coming soon. 
- === "AMD" The example below deploys `Qwen/Qwen2.5-72B-Instruct` on a multi-node cluster with AMD MI300X GPUs: diff --git a/mkdocs/docs/examples/inference/vllm.md b/mkdocs/docs/examples/inference/vllm.md index dd6909ba6..4546ff4b3 100644 --- a/mkdocs/docs/examples/inference/vllm.md +++ b/mkdocs/docs/examples/inference/vllm.md @@ -124,6 +124,84 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ > If a [gateway](../../concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. +## Configuration options + +### PD disaggregation + +To run vLLM with [PD disaggregation](https://docs.vllm.ai/en/latest/serving/disagg_prefill.html), use replica groups: one for [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html), one for prefill workers (`kv_producer`), and one for decode workers (`kv_consumer`). + +
+ +```yaml +type: service +name: prefill-decode + +env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + +replicas: + - count: 1 + python: "3.12" + commands: + - pip install smg + - | + smg launch \ + --pd-disaggregation \ + --model-path $MODEL_ID \ + --enable-igw \ + --host 0.0.0.0 \ + --port 8000 \ + --prefill-policy cache_aware + router: + type: sglang + resources: + cpu: 4 + + - count: 1..4 + scaling: + metric: rps + target: 3 + image: ghcr.io/lightseekorg/smg:1.4.1-vllm-v0.18.0 + commands: + - | + python3 -m vllm.entrypoints.grpc_server \ + --model "$MODEL_ID" \ + --host 0.0.0.0 \ + --port 8000 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + image: ghcr.io/lightseekorg/smg:1.4.1-vllm-v0.18.0 + commands: + - | + python3 -m vllm.entrypoints.grpc_server \ + --model "$MODEL_ID" \ + --host 0.0.0.0 \ + --port 8000 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' + resources: + gpu: H200 + +port: 8000 +``` + +
+ +> To use the [Mooncake Transfer](https://github.com/kvcache-ai/Mooncake) backend, set `"kv_connector": "MooncakeConnector"` in `--kv-transfer-config`. + +Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics are coming soon. + +!!! info "Cluster" + PD disaggregation requires the service to run in a fleet with `placement` set to `cluster`, because the replicas require an interconnect between instances. + + While the prefill and decode replicas run on GPUs, the router replica requires a CPU instance in the same cluster. + ## What's next? 1. Read about [services](../../concepts/services.md) and [gateways](../../concepts/gateways.md) From 6b525767a8dbdfcc7f861a467e2de1fa59290c36 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Mon, 15 Jun 2026 20:24:27 +0545 Subject: [PATCH 2/2] Minor Update --- mkdocs/docs/concepts/services.md | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md index 1f6dc5c4a..99bf961f9 100644 --- a/mkdocs/docs/concepts/services.md +++ b/mkdocs/docs/concepts/services.md @@ -364,6 +364,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`: replicas: - count: 1 + # For now replica group with router must have count: 1 python: "3.12" commands: - pip install smg