From c5308dcac49f90629a54e5fecc2bb3cd8291492d Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:16:18 +0200 Subject: [PATCH 01/11] Update Qwen 3.6 service docs and examples --- docs/docs/concepts/services.md | 123 ++++++++++------------------ docs/examples.md | 4 +- examples/inference/sglang/README.md | 87 +++++++++----------- examples/inference/vllm/README.md | 78 +++++++++++++----- 4 files changed, 140 insertions(+), 152 deletions(-) diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index fd0d2a2dc..f1a88e3dd 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -21,23 +21,23 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` ```yaml type: service - name: qwen397 + name: qwen36 image: lmsysorg/sglang:v0.5.10.post1 commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ --port 30000 \ --tp $DSTACK_GPUS_NUM \ - --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ - --enable-flashinfer-allreduce-fusion \ - --mem-fraction-static 0.8 + --mem-fraction-static 0.8 \ + --context-length 262144 \ + --reasoning-parser qwen3 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: # Optional instance volume for model and runtime caches @@ -46,11 +46,8 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` optional: true resources: - cpu: x86:96.. - memory: 512GB.. shm_size: 16GB - disk: 500GB.. - gpu: H100:80GB:8 + gpu: H100:4 ``` @@ -61,38 +58,23 @@ The filename must end with `.dstack.yml` (e.g. 
`.dstack.yml` or `dev.dstack.yml` ```yaml type: service - name: qwen397 + name: qwen36 - image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x - - env: - - HIP_FORCE_DEV_KERNARG=1 - - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 - - SGLANG_DISABLE_CUDNN_CHECK=1 - - SGLANG_INT4_WEIGHT=0 - - SGLANG_MOE_PADDING=1 - - SGLANG_ROCM_DISABLE_LINEARQUANT=0 - - SGLANG_ROCM_FUSED_DECODE_MLA=1 - - SGLANG_SET_CPU_AFFINITY=1 - - SGLANG_USE_AITER=1 - - SGLANG_USE_ROCM700A=1 + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ --tp $DSTACK_GPUS_NUM \ - --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ --mem-fraction-static 0.8 \ --context-length 262144 \ - --attention-backend triton \ - --disable-cuda-graph \ - --fp8-gemm-backend aiter \ - --port 30000 + --reasoning-parser qwen3 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: # Optional instance volume for model and runtime caches @@ -101,15 +83,18 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` optional: true resources: - cpu: x86:52.. - memory: 700GB.. + cpu: 52.. + memory: 896GB.. shm_size: 16GB - disk: 600GB.. - gpu: MI300X:192GB:4 + disk: 450GB.. + gpu: MI300X:4 ``` +The first startup on MI300X can take longer while SGLang compiles ROCm +kernels. + To run a service, pass the configuration to [`dstack apply`](../reference/cli/dstack/apply.md):
@@ -117,14 +102,14 @@ To run a service, pass the configuration to [`dstack apply`](../reference/cli/ds ```shell $ dstack apply -f .dstack.yml -Submit the run qwen397? [y/n]: y +Submit the run qwen36? [y/n]: y Provisioning... ---> 100% Service is published at: - http://localhost:3000/proxy/services/main/qwen397/ -Model Qwen/Qwen3.5-397B-A17B-FP8 is published at: + http://localhost:3000/proxy/services/main/qwen36/ +Model Qwen/Qwen3.6-27B is published at: http://localhost:3000/proxy/models/main/ ``` @@ -138,11 +123,11 @@ If you do not have a [gateway](gateways.md) created, the service endpoint will b
```shell -$ curl http://localhost:3000/proxy/services/main/qwen397/v1/chat/completions \ +$ curl http://localhost:3000/proxy/services/main/qwen36/v1/chat/completions \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer <dstack token>' \ -d '{ - "model": "Qwen/Qwen3.5-397B-A17B-FP8", + "model": "Qwen/Qwen3.6-27B", "messages": [ { "role": "user", @@ -213,23 +198,23 @@ You can configure the number of replicas as well as the auto-scaling rules. ```yaml type: service - name: qwen397-service + name: qwen36-service image: lmsysorg/sglang:v0.5.10.post1 commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ --port 30000 \ --tp $DSTACK_GPUS_NUM \ --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ - --enable-flashinfer-allreduce-fusion \ - --mem-fraction-static 0.8 + --mem-fraction-static 0.8 \ + --context-length 262144 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: # Optional instance volume for model and runtime caches @@ -238,11 +223,8 @@ You can configure the number of replicas as well as the auto-scaling rules. optional: true resources: - cpu: x86:96.. - memory: 512GB.. shm_size: 16GB - disk: 500GB.. - gpu: H100:80GB:8 + gpu: H100:4 replicas: 1..2 scaling: @@ -258,38 +240,23 @@ You can configure the number of replicas as well as the auto-scaling rules. 
```yaml type: service - name: qwen397-service - - image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x + name: qwen36-service - env: - - HIP_FORCE_DEV_KERNARG=1 - - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 - - SGLANG_DISABLE_CUDNN_CHECK=1 - - SGLANG_INT4_WEIGHT=0 - - SGLANG_MOE_PADDING=1 - - SGLANG_ROCM_DISABLE_LINEARQUANT=0 - - SGLANG_ROCM_FUSED_DECODE_MLA=1 - - SGLANG_SET_CPU_AFFINITY=1 - - SGLANG_USE_AITER=1 - - SGLANG_USE_ROCM700A=1 + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ --tp $DSTACK_GPUS_NUM \ --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ --mem-fraction-static 0.8 \ - --context-length 262144 \ - --attention-backend triton \ - --disable-cuda-graph \ - --fp8-gemm-backend aiter \ - --port 30000 + --context-length 262144 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: # Optional instance volume for model and runtime caches @@ -298,11 +265,11 @@ You can configure the number of replicas as well as the auto-scaling rules. optional: true resources: - cpu: x86:52.. - memory: 700GB.. + cpu: 52.. + memory: 896GB.. shm_size: 16GB - disk: 600GB.. - gpu: MI300X:192GB:4 + disk: 450GB.. + gpu: MI300X:4 replicas: 1..2 scaling: diff --git a/docs/examples.md b/docs/examples.md index 04cd5ff0f..31d6521b7 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -153,7 +153,7 @@ hide: SGLang

- Deploy DeepSeek distilled models with SGLang + Deploy Qwen3.6-27B with SGLang

- Deploy Llama 3.1 with vLLM + Deploy Qwen3.6-27B with vLLM

+
```yaml type: service - name: qwen397 + name: qwen36 image: lmsysorg/sglang:v0.5.10.post1 commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ --port 30000 \ --tp $DSTACK_GPUS_NUM \ --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ - --enable-flashinfer-allreduce-fusion \ - --mem-fraction-static 0.8 + --mem-fraction-static 0.8 \ + --context-length 262144 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: - instance_path: /root/.cache @@ -43,52 +43,34 @@ Here's an example of a service that deploys optional: true resources: - cpu: x86:96.. - memory: 512GB.. shm_size: 16GB - disk: 500GB.. - gpu: H100:80GB:8 + gpu: H100:4 ```
=== "AMD" -
+
```yaml type: service - name: qwen397 - - image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x + name: qwen36 - env: - - HIP_FORCE_DEV_KERNARG=1 - - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 - - SGLANG_DISABLE_CUDNN_CHECK=1 - - SGLANG_INT4_WEIGHT=0 - - SGLANG_MOE_PADDING=1 - - SGLANG_ROCM_DISABLE_LINEARQUANT=0 - - SGLANG_ROCM_FUSED_DECODE_MLA=1 - - SGLANG_SET_CPU_AFFINITY=1 - - SGLANG_USE_AITER=1 - - SGLANG_USE_ROCM700A=1 + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ --tp $DSTACK_GPUS_NUM \ --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ --mem-fraction-static 0.8 \ - --context-length 262144 \ - --attention-backend triton \ - --disable-cuda-graph \ - --fp8-gemm-backend aiter \ - --port 30000 + --context-length 262144 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: - instance_path: /root/.cache @@ -96,24 +78,26 @@ Here's an example of a service that deploys optional: true resources: - cpu: x86:52.. - memory: 700GB.. + cpu: 52.. + memory: 896GB.. shm_size: 16GB - disk: 600GB.. - gpu: MI300X:192GB:4 + disk: 450GB.. + gpu: MI300X:4 ```
-The AMD example uses the exact validated MI300X configuration for this model, -including the ROCm/AITER settings required for stable FP8 serving. +The AMD example keeps the deployment close to the upstream Qwen and SGLang +guidance: a pinned ROCm image, tensor parallelism across all four GPUs, and the +standard `qwen3` reasoning parser without extra ROCm-specific tuning flags. +The first startup on MI300X can take longer while SGLang compiles ROCm kernels. -Save one of the configurations above as `qwen397.dstack.yml`, then use the +Save one of the configurations above as `qwen36.dstack.yml`, then use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command.
```shell -$ dstack apply -f qwen397.dstack.yml +$ dstack apply -f qwen36.dstack.yml ```
@@ -123,26 +107,29 @@ If no gateway is created, the service endpoint will be available at ` ```shell -curl http://127.0.0.1:3000/proxy/services/main/qwen397/v1/chat/completions \ +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ -X POST \ -H 'Authorization: Bearer <dstack token>' \ -H 'Content-Type: application/json' \ -d '{ - "model": "Qwen/Qwen3.5-397B-A17B-FP8", + "model": "Qwen/Qwen3.6-27B", "messages": [ { "role": "user", "content": "A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. How much does the ball cost? Answer with just the dollar amount." } ], - "chat_template_kwargs": {"enable_thinking": true}, "separate_reasoning": true, "max_tokens": 1024 }' ```
-> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen397./`. +Qwen3.6 uses thinking mode by default. To disable thinking, pass +`"chat_template_kwargs": {"enable_thinking": false}` in the request body. To +enable tool calling, add `--tool-call-parser qwen3_coder` to the serve command. + +> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. ## Configuration options @@ -232,4 +219,4 @@ Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics ## What's next? 1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways) -2. Browse the [Qwen 3.5 SGLang cookbook](https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5) and the [SGLang server arguments reference](https://docs.sglang.ai/advanced_features/server_arguments.html) +2. Browse the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) and the [SGLang server arguments reference](https://docs.sglang.ai/advanced_features/server_arguments.html) diff --git a/examples/inference/vllm/README.md b/examples/inference/vllm/README.md index 7497af669..75d6add9b 100644 --- a/examples/inference/vllm/README.md +++ b/examples/inference/vllm/README.md @@ -1,39 +1,39 @@ --- title: vLLM -description: Deploying Qwen3.5-397B-A17B-FP8 using vLLM on NVIDIA GPUs +description: Deploying Qwen3.6-27B using vLLM on NVIDIA and AMD GPUs --- # vLLM -This example shows how to deploy `Qwen/Qwen3.5-397B-A17B-FP8` using +This example shows how to deploy `Qwen/Qwen3.6-27B` using [vLLM](https://docs.vllm.ai/en/latest/) and `dstack`. ## Apply a configuration Here's an example of a service that deploys -`Qwen/Qwen3.5-397B-A17B-FP8` using vLLM. 
+`Qwen/Qwen3.6-27B` using vLLM. === "NVIDIA" -
+
```yaml type: service - name: qwen397 + name: qwen36 image: vllm/vllm-openai:v0.19.1 commands: - | - vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \ + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ --port 8000 \ --tensor-parallel-size $DSTACK_GPUS_NUM \ --max-model-len 262144 \ - --reasoning-parser qwen3 \ - --language-model-only + --reasoning-parser qwen3 port: 8000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: - instance_path: /root/.cache @@ -41,26 +41,60 @@ Here's an example of a service that deploys optional: true resources: - cpu: x86:96.. - memory: 512GB.. shm_size: 16GB - disk: 500GB.. - gpu: H100:80GB:8 + gpu: H100:4 ```
-The NVIDIA example serves `Qwen/Qwen3.5-397B-A17B-FP8` on `8x H100` GPUs using -vLLM with tensor parallelism enabled. It uses `--language-model-only` because -`Qwen/Qwen3.5-397B-A17B-FP8` is a text-only model. +=== "AMD" -Save the configuration above as `qwen397.dstack.yml`, then use the +
+ + ```yaml + type: service + name: qwen36 + + image: vllm/vllm-openai-rocm:v0.19.1 + + commands: + - | + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size $DSTACK_GPUS_NUM \ + --max-model-len 262144 \ + --reasoning-parser qwen3 + + port: 8000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` + +
+ +Qwen3.6-27B is a multimodal model. For text-only workloads, add +`--language-model-only` to free more memory for the KV cache. To enable tool +calling, add `--enable-auto-tool-choice --tool-call-parser qwen3_coder`. + +Save one of the configurations above as `qwen36.dstack.yml`, then use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command.
```shell -$ dstack apply -f qwen397.dstack.yml +$ dstack apply -f qwen36.dstack.yml ```
@@ -70,12 +104,12 @@ If no gateway is created, the service endpoint will be available at ` ```shell -curl http://127.0.0.1:3000/proxy/services/main/qwen397/v1/chat/completions \ +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ -X POST \ -H 'Authorization: Bearer <dstack token>' \ -H 'Content-Type: application/json' \ -d '{ - "model": "Qwen/Qwen3.5-397B-A17B-FP8", + "model": "Qwen/Qwen3.6-27B", "messages": [ { "role": "user", @@ -88,9 +122,9 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen397/v1/chat/completions \
-> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen397./`. +> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. ## What's next? 1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways) -2. Browse the [SGLang](https://dstack.ai/examples/inference/sglang/) and [NIM](https://dstack.ai/examples/inference/nim/) examples +2. Browse the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) and the [SGLang](https://dstack.ai/examples/inference/sglang/) example From 062d5e148c6b5a86bb56a1151250e150bb60936f Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:32:35 +0200 Subject: [PATCH 02/11] Update quickstart service example to Qwen 3.6 --- docs/docs/quickstart.md | 52 ++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/docs/docs/quickstart.md b/docs/docs/quickstart.md index 4cdecae5e..4b12e56fe 100644 --- a/docs/docs/quickstart.md +++ b/docs/docs/quickstart.md @@ -219,27 +219,27 @@ description: Quick guide to creating fleets and submitting runs ```yaml type: service - name: llama31-service - - # If `image` is not specified, dstack uses its default image - python: "3.11" - #image: dstackai/base:py3.13-0.7-cuda-12.1 - - # Required environment variables - env: - - HF_TOKEN + name: qwen36-service + + image: vllm/vllm-openai:v0.19.1 + commands: - - pip install vllm - - vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct --max-model-len 4096 - # Expose the vllm server port + - | + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 8000 \ + --max-model-len 32768 \ + --reasoning-parser qwen3 + # Expose the vLLM server port 
port: 8000 # Specify a name if it's an OpenAI-compatible model - model: meta-llama/Meta-Llama-3.1-8B-Instruct - + model: Qwen/Qwen3.6-27B + # Required resources resources: - gpu: 24GB + shm_size: 16GB + gpu: H100 ```
@@ -249,22 +249,20 @@ description: Quick guide to creating fleets and submitting runs
```shell - $ HF_TOKEN=... $ dstack apply -f service.dstack.yml - - # BACKEND REGION INSTANCE RESOURCES SPOT PRICE - 1 aws us-west-2 g5.4xlarge 16xCPU, 64GB, 1xA10G (24GB) yes $0.22 - 2 aws us-east-2 g6.xlarge 4xCPU, 16GB, 1xL4 (24GB) yes $0.27 - 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB) yes $0.27 - - Submit the run llama31-service? [y/n]: y - - Provisioning `llama31-service`... + + # BACKEND REGION INSTANCE RESOURCES SPOT PRICE + 1 nebius eu-north1 gpu-h100-sxm 16xCPU, 250GB, 1xH100 (80GB) no $2.95 + 2 runpod US-CA-2 NVIDIA H100 80GB HBM3 64xCPU, 1004GB, 1xH100 (80GB) no $2.99 + + Submit the run qwen36-service? [y/n]: y + + Provisioning `qwen36-service`... ---> 100% Service is published at: - http://localhost:3000/proxy/services/main/llama31-service/ - Model meta-llama/Meta-Llama-3.1-8B-Instruct is published at: + http://localhost:3000/proxy/services/main/qwen36-service/ + Model Qwen/Qwen3.6-27B is published at: http://localhost:3000/proxy/models/main/ ``` From d88df45a079258e9edb4318e03aa05f8f7aea46f Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:37:04 +0200 Subject: [PATCH 03/11] Refresh AMD Qwen 3.6 deployment example --- examples/accelerators/amd/README.md | 143 ++++++++++++++++------------ 1 file changed, 84 insertions(+), 59 deletions(-) diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md index 36be8044e..e267f6138 100644 --- a/examples/accelerators/amd/README.md +++ b/examples/accelerators/amd/README.md @@ -1,6 +1,6 @@ --- title: AMD -description: Deploying and fine-tuning models on AMD MI300X GPUs using vLLM, TRL, and Axolotl +description: Deploying and fine-tuning models on AMD MI300X GPUs using SGLang, vLLM, TRL, and Axolotl --- # AMD @@ -11,8 +11,49 @@ with on-prem AMD GPUs or configuring a backend that offers AMD GPUs such as the ## Deployment -vLLM supports AMD GPUs. 
Here's an example of a [service](https://dstack.ai/docs/services) that deploys
-Llama 3.1 70B in FP16 using [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html).
+Here are examples of [services](https://dstack.ai/docs/services) that deploy
+`Qwen/Qwen3.6-27B` on AMD MI300X GPUs using
+[SGLang](https://github.com/sgl-project/sglang) and
+[vLLM](https://docs.vllm.ai/en/latest/).
+
+=== "SGLang"
+
+ + ```yaml + type: service + name: qwen36-service-sglang-amd + + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` + +
=== "vLLM" @@ -20,63 +61,46 @@ Llama 3.1 70B in FP16 using [vLLM](https://docs.vllm.ai/en/latest/getting_starte ```yaml type: service - name: llama31-service-vllm-amd + name: qwen36-service-vllm-amd + + image: vllm/vllm-openai-rocm:v0.19.1 - # Using Runpod's ROCm Docker image - image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04 - # Required environment variables - env: - - HF_TOKEN - - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct - - MAX_MODEL_LEN=126192 - # Commands of the task commands: - - export PATH=/opt/conda/envs/py_3.10/bin:$PATH - - wget https://github.com/ROCm/hipBLAS/archive/refs/tags/rocm-6.1.0.zip - - unzip rocm-6.1.0.zip - - cd hipBLAS-rocm-6.1.0 - - python rmake.py - - cd .. - - git clone https://github.com/vllm-project/vllm.git - - cd vllm - - pip install triton - - pip uninstall torch -y - - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 - - pip install /opt/rocm/share/amd_smi - - pip install --upgrade numba scipy huggingface-hub[cli] - - pip install "numpy<2" - - pip install -r requirements-rocm.txt - - wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib - - rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* - - export PYTORCH_ROCM_ARCH="gfx90a;gfx942" - - wget https://dstack-binaries.s3.amazonaws.com/vllm-0.6.0%2Brocm614-cp310-cp310-linux_x86_64.whl - - pip install vllm-0.6.0+rocm614-cp310-cp310-linux_x86_64.whl - - vllm serve $MODEL_ID --max-model-len $MAX_MODEL_LEN --port 8000 - # Service port + - | + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size $DSTACK_GPUS_NUM \ + --max-model-len 262144 \ + --reasoning-parser qwen3 + port: 8000 - # Register the model - model: meta-llama/Meta-Llama-3.1-70B-Instruct + model: Qwen/Qwen3.6-27B - # Uncomment to leverage spot instances - #spot_policy: auto + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true 
resources: - gpu: MI300X - disk: 200GB + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 ```
- Note, maximum size of vLLM’s `KV cache` is 126192, consequently we must set `MAX_MODEL_LEN` to 126192. Adding `/opt/conda/envs/py_3.10/bin` to PATH ensures we use the Python 3.10 environment necessary for the pre-built binaries compiled specifically for this version. - - > To speed up the `vLLM-ROCm` installation, this example uses a pre-built binary from S3. - !!! info "Docker image" - If you want to use AMD, specifying `image` is currently required. This must be an image that includes - ROCm drivers. + AMD deployments require specifying an image that already includes ROCm + drivers. The SGLang and vLLM examples above use pinned ROCm images. To request multiple GPUs, specify the quantity after the GPU name, separated by a colon, e.g., `MI300X:4`. +If you're using multiple AMD nodes, validate cluster networking with the +[NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) example. + ## Fine-tuning === "TRL" @@ -189,28 +213,29 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by ## Running a configuration -Once a configuration is ready, save it to a `.dstack.yml` file, then run -`dstack apply -f `, and `dstack` will automatically provision the -cloud resources and run the configuration. +Once a configuration is ready, save it to a `.dstack.yml` file. If your +configuration references environment variables such as `HF_TOKEN` or +`WANDB_API_KEY`, export them first. Then run +`dstack apply -f `, and `dstack` will automatically +provision the cloud resources and run the configuration.
```shell -$ HF_TOKEN=... -$ WANDB_API_KEY=... -$ WANDB_PROJECT=... -$ WANDB_NAME=axolotl-amd-llama31-train -$ HUB_MODEL_ID=... -$ dstack apply -f service.dstack.yml +$ dstack apply -f ```
## What's next? -1. Browse [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html#build-from-source-rocm), +1. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) + and [vLLM](https://dstack.ai/examples/inference/vllm/) examples, plus [Axolotl](https://github.com/ROCm/rocm-blogs/tree/release/blogs/artificial-intelligence/axolotl), - [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html) and - [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes) -2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and + [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html), + and [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes) +2. Run [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) + to validate multi-node AMD cluster networking. +3. Check [dev environments](https://dstack.ai/docs/dev-environments), + [tasks](https://dstack.ai/docs/tasks), and [services](https://dstack.ai/docs/services). 
From 90ef47b88b7a11054fbf13bf9bf34e2e633ffcc0 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:41:37 +0200 Subject: [PATCH 04/11] Add Qwen 3.6 model example docs --- docs/examples.md | 17 ++- docs/examples/models/qwen36/index.md | 0 examples/models/qwen36/README.md | 168 +++++++++++++++++++++++++++ mkdocs.yml | 4 + 4 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 docs/examples/models/qwen36/index.md create mode 100644 examples/models/qwen36/README.md diff --git a/docs/examples.md b/docs/examples.md index 31d6521b7..9d75574dc 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -1,6 +1,6 @@ --- title: Examples -description: Collection of examples for training, inference, and clusters +description: Collection of examples for models, training, inference, and clusters #template: examples.html hide: # - navigation @@ -144,6 +144,21 @@ hide:
+## Models + +
+ +

+ Qwen 3.6 +

+ +

+ Deploy Qwen3.6-27B with SGLang on NVIDIA or AMD +

+
+
+ ## Inference
diff --git a/docs/examples/models/qwen36/index.md b/docs/examples/models/qwen36/index.md new file mode 100644 index 000000000..e69de29bb diff --git a/examples/models/qwen36/README.md b/examples/models/qwen36/README.md new file mode 100644 index 000000000..713ac341e --- /dev/null +++ b/examples/models/qwen36/README.md @@ -0,0 +1,168 @@ +--- +title: Qwen 3.6 +description: Deploying Qwen3.6-27B using SGLang on NVIDIA and AMD GPUs +--- + +# Qwen 3.6 + +This example shows how to deploy `Qwen/Qwen3.6-27B` as a +[service](https://dstack.ai/docs/services) using +[SGLang](https://github.com/sgl-project/sglang) and `dstack`. + +## Apply a configuration + +Save one of the following configurations as `qwen36.dstack.yml`. + +=== "NVIDIA" + +
+ + ```yaml + type: service + name: qwen36 + + image: lmsysorg/sglang:v0.5.10.post1 + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + shm_size: 16GB + gpu: H100:4 + ``` + +
+ +=== "AMD" + +
+ + ```yaml + type: service + name: qwen36 + + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` + +
+ +The NVIDIA and AMD configurations above use pinned SGLang images and the same +straightforward 4-GPU layout used across the Qwen 3.6 docs and examples. + +Apply the configuration with +[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md). + +
+ +```shell +$ dstack apply -f qwen36.dstack.yml +``` + +
+ +If no gateway is created, the service endpoint will be available at +`/proxy/services///`. + +
+ +```shell +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <dstack token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen3.6-27B", + "messages": [ + { + "role": "user", + "content": "A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. How much does the ball cost? Answer with just the dollar amount." + } + ], + "max_tokens": 1024 + }' +``` + +
+ +## Thinking mode + +Qwen3.6 uses thinking mode by default. With SGLang, the reasoning stream is +returned separately as `reasoning_content`. + +To disable thinking, pass `chat_template_kwargs` in the request body. + +
+ +```shell +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <dstack token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen3.6-27B", + "messages": [ + { + "role": "user", + "content": "Summarize the benefits of container images in one sentence." + } + ], + "max_tokens": 256, + "chat_template_kwargs": { + "enable_thinking": false + } + }' +``` + +
+ +## What's next? + +1. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) + and [vLLM](https://dstack.ai/examples/inference/vllm/) examples +2. Check the [AMD](https://dstack.ai/examples/accelerators/amd/) example for + more AMD deployment and training configurations +3. Run [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) + if you're validating multi-node cluster networking +4. Read the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) diff --git a/mkdocs.yml b/mkdocs.yml index 8dbe0ad85..f9a22ff37 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -100,8 +100,10 @@ plugins: "docs/fleets.md": "docs/concepts/fleets.md" "docs/examples/llms/llama31.md": "examples/inference/vllm/index.md" "docs/examples/llms/llama32.md": "examples/inference/vllm/index.md" + "docs/examples/llms/qwen36.md": "examples/models/qwen36/index.md" "examples/llms/llama31/index.md": "examples/inference/vllm/index.md" "examples/llms/llama32/index.md": "examples/inference/vllm/index.md" + "examples/llms/qwen36/index.md": "examples/models/qwen36/index.md" "docs/examples/accelerators/amd/index.md": "examples/accelerators/amd/index.md" "docs/examples/deployment/nim/index.md": "examples/inference/nim/index.md" "docs/examples/deployment/vllm/index.md": "examples/inference/vllm/index.md" @@ -285,6 +287,8 @@ nav: - skill.md: https://dstack.ai/skill.md - Examples: - examples.md + - Models: + - Qwen 3.6: examples/models/qwen36/index.md - Single-node training: - TRL: examples/single-node-training/trl/index.md - Axolotl: examples/single-node-training/axolotl/index.md From e54ef23028f0ba6a0153b56c7fb1aacf5af07fd2 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:42:33 +0200 Subject: [PATCH 05/11] Move models section to end of examples --- docs/examples.md | 30 +++++++++++++++--------------- mkdocs.yml | 4 ++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/examples.md 
b/docs/examples.md index 9d75574dc..b3e3e0d42 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -144,21 +144,6 @@ hide:
-## Models - -
- -

- Qwen 3.6 -

- -

- Deploy Qwen3.6-27B with SGLang on NVIDIA or AMD -

-
-
- ## Inference
@@ -200,6 +185,21 @@ hide:
+## Models + +
+ +

+ Qwen 3.6 +

+ +

+ Deploy Qwen3.6-27B with SGLang on NVIDIA or AMD +

+
+
+ ## Accelerators
diff --git a/mkdocs.yml b/mkdocs.yml index f9a22ff37..1baa53015 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -287,8 +287,6 @@ nav: - skill.md: https://dstack.ai/skill.md - Examples: - examples.md - - Models: - - Qwen 3.6: examples/models/qwen36/index.md - Single-node training: - TRL: examples/single-node-training/trl/index.md - Axolotl: examples/single-node-training/axolotl/index.md @@ -312,6 +310,8 @@ nav: - AMD: examples/accelerators/amd/index.md - TPU: examples/accelerators/tpu/index.md - Tenstorrent: examples/accelerators/tenstorrent/index.md + - Models: + - Qwen 3.6: examples/models/qwen36/index.md - Blog: - blog/index.md - Case studies: blog/case-studies.md From 74b186a263d855bb9bfa3e06af3412474f4e82f5 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:43:33 +0200 Subject: [PATCH 06/11] Switch quickstart Qwen service to SGLang --- docs/docs/quickstart.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/docs/quickstart.md b/docs/docs/quickstart.md index 4b12e56fe..6e9be0406 100644 --- a/docs/docs/quickstart.md +++ b/docs/docs/quickstart.md @@ -221,17 +221,17 @@ description: Quick guide to creating fleets and submitting runs type: service name: qwen36-service - image: vllm/vllm-openai:v0.19.1 + image: lmsysorg/sglang:v0.5.10.post1 commands: - | - vllm serve Qwen/Qwen3.6-27B \ + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ --host 0.0.0.0 \ - --port 8000 \ - --max-model-len 32768 \ + --port 30000 \ --reasoning-parser qwen3 - # Expose the vLLM server port - port: 8000 + # Expose the SGLang server port + port: 30000 # Specify a name if it's an OpenAI-compatible model model: Qwen/Qwen3.6-27B From 96e616287d6c415f3bbf5abb2acbd8d66f2fc5f9 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:45:32 +0200 Subject: [PATCH 07/11] Add upstream Qwen 3.6 links to model page --- examples/models/qwen36/README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git 
a/examples/models/qwen36/README.md b/examples/models/qwen36/README.md index 713ac341e..57f37a0a9 100644 --- a/examples/models/qwen36/README.md +++ b/examples/models/qwen36/README.md @@ -159,10 +159,12 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ ## What's next? -1. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) +1. Read the [Qwen/Qwen3.6-27B model card](https://huggingface.co/Qwen/Qwen3.6-27B) +2. Read the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) +3. Read the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) +4. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) and [vLLM](https://dstack.ai/examples/inference/vllm/) examples -2. Check the [AMD](https://dstack.ai/examples/accelerators/amd/) example for +5. Check the [AMD](https://dstack.ai/examples/accelerators/amd/) example for more AMD deployment and training configurations -3. Run [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) +6. Run [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) if you're validating multi-node cluster networking -4. 
Read the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) From c1c07e5c1c42fa1276ee5db1fb1690ab16300bb4 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:46:28 +0200 Subject: [PATCH 08/11] Move AMD cluster note to training section --- examples/accelerators/amd/README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md index e267f6138..ef1f6be58 100644 --- a/examples/accelerators/amd/README.md +++ b/examples/accelerators/amd/README.md @@ -98,11 +98,12 @@ Here are examples of a [service](https://dstack.ai/docs/services) that deploy To request multiple GPUs, specify the quantity after the GPU name, separated by a colon, e.g., `MI300X:4`. -If you're using multiple AMD nodes, validate cluster networking with the -[NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) example. - ## Fine-tuning +If you're planning multi-node AMD training, validate cluster networking first +with the [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) +example. + === "TRL" Below is an example of LoRA fine-tuning Llama 3.1 8B using [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference.html) @@ -234,8 +235,9 @@ $ dstack apply -f [Axolotl](https://github.com/ROCm/rocm-blogs/tree/release/blogs/artificial-intelligence/axolotl), [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html), and [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes) -2. Run [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) - to validate multi-node AMD cluster networking. +2. For multi-node training, run + [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) + to validate AMD cluster networking. 3. 
Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and [services](https://dstack.ai/docs/services). From a2955de9ec2165e777afa08854edd02d1abe457c Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:47:11 +0200 Subject: [PATCH 09/11] Remove cluster note from Qwen model page --- examples/models/qwen36/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/models/qwen36/README.md b/examples/models/qwen36/README.md index 57f37a0a9..bc92271b2 100644 --- a/examples/models/qwen36/README.md +++ b/examples/models/qwen36/README.md @@ -166,5 +166,3 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ and [vLLM](https://dstack.ai/examples/inference/vllm/) examples 5. Check the [AMD](https://dstack.ai/examples/accelerators/amd/) example for more AMD deployment and training configurations -6. Run [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) - if you're validating multi-node cluster networking From 6fa22d3e982f27b60617ab358488f9cc2cca6238 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 12:31:52 +0200 Subject: [PATCH 10/11] Minor edit --- examples/accelerators/amd/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md index ef1f6be58..b35b29c1c 100644 --- a/examples/accelerators/amd/README.md +++ b/examples/accelerators/amd/README.md @@ -100,7 +100,7 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by ## Fine-tuning -If you're planning multi-node AMD training, validate cluster networking first +> If you're planning multi-node AMD training, validate cluster networking first with the [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) example. 
From fc51d78ebf04937568c16b4bbd9c0b02c5e0efb6 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 12:34:42 +0200 Subject: [PATCH 11/11] Simplify Qwen 3.6 quickstart service config --- docs/docs/quickstart.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/docs/quickstart.md b/docs/docs/quickstart.md index 6e9be0406..80a98f79b 100644 --- a/docs/docs/quickstart.md +++ b/docs/docs/quickstart.md @@ -238,7 +238,6 @@ description: Quick guide to creating fleets and submitting runs # Required resources resources: - shm_size: 16GB gpu: H100 ```