From c5308dcac49f90629a54e5fecc2bb3cd8291492d Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:16:18 +0200 Subject: [PATCH 01/11] Update Qwen 3.6 service docs and examples --- docs/docs/concepts/services.md | 123 ++++++++++------------------ docs/examples.md | 4 +- examples/inference/sglang/README.md | 87 +++++++++----------- examples/inference/vllm/README.md | 78 +++++++++++++----- 4 files changed, 140 insertions(+), 152 deletions(-) diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index fd0d2a2dc..f1a88e3dd 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -21,23 +21,23 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` ```yaml type: service - name: qwen397 + name: qwen36 image: lmsysorg/sglang:v0.5.10.post1 commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ --port 30000 \ --tp $DSTACK_GPUS_NUM \ - --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ - --enable-flashinfer-allreduce-fusion \ - --mem-fraction-static 0.8 + --mem-fraction-static 0.8 \ + --context-length 262144 \ + --reasoning-parser qwen3 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: # Optional instance volume for model and runtime caches @@ -46,11 +46,8 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` optional: true resources: - cpu: x86:96.. - memory: 512GB.. shm_size: 16GB - disk: 500GB.. - gpu: H100:80GB:8 + gpu: H100:4 ``` @@ -61,38 +58,23 @@ The filename must end with `.dstack.yml` (e.g. 
`.dstack.yml` or `dev.dstack.yml` ```yaml type: service - name: qwen397 + name: qwen36 - image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x - - env: - - HIP_FORCE_DEV_KERNARG=1 - - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 - - SGLANG_DISABLE_CUDNN_CHECK=1 - - SGLANG_INT4_WEIGHT=0 - - SGLANG_MOE_PADDING=1 - - SGLANG_ROCM_DISABLE_LINEARQUANT=0 - - SGLANG_ROCM_FUSED_DECODE_MLA=1 - - SGLANG_SET_CPU_AFFINITY=1 - - SGLANG_USE_AITER=1 - - SGLANG_USE_ROCM700A=1 + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ --tp $DSTACK_GPUS_NUM \ - --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ --mem-fraction-static 0.8 \ --context-length 262144 \ - --attention-backend triton \ - --disable-cuda-graph \ - --fp8-gemm-backend aiter \ - --port 30000 + --reasoning-parser qwen3 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: # Optional instance volume for model and runtime caches @@ -101,15 +83,18 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` optional: true resources: - cpu: x86:52.. - memory: 700GB.. + cpu: 52.. + memory: 896GB.. shm_size: 16GB - disk: 600GB.. - gpu: MI300X:192GB:4 + disk: 450GB.. + gpu: MI300X:4 ``` +The first startup on MI300X can take longer while SGLang compiles ROCm +kernels. + To run a service, pass the configuration to [`dstack apply`](../reference/cli/dstack/apply.md):
@@ -117,14 +102,14 @@ To run a service, pass the configuration to [`dstack apply`](../reference/cli/ds ```shell $ dstack apply -f .dstack.yml -Submit the run qwen397? [y/n]: y +Submit the run qwen36? [y/n]: y Provisioning... ---> 100% Service is published at: - http://localhost:3000/proxy/services/main/qwen397/ -Model Qwen/Qwen3.5-397B-A17B-FP8 is published at: + http://localhost:3000/proxy/services/main/qwen36/ +Model Qwen/Qwen3.6-27B is published at: http://localhost:3000/proxy/models/main/ ``` @@ -138,11 +123,11 @@ If you do not have a [gateway](gateways.md) created, the service endpoint will b
```shell -$ curl http://localhost:3000/proxy/services/main/qwen397/v1/chat/completions \ +$ curl http://localhost:3000/proxy/services/main/qwen36/v1/chat/completions \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer <dstack token>' \ -d '{ - "model": "Qwen/Qwen3.5-397B-A17B-FP8", + "model": "Qwen/Qwen3.6-27B", "messages": [ { "role": "user", @@ -213,23 +198,23 @@ You can configure the number of replicas as well as the auto-scaling rules. ```yaml type: service - name: qwen397-service + name: qwen36-service image: lmsysorg/sglang:v0.5.10.post1 commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ --port 30000 \ --tp $DSTACK_GPUS_NUM \ --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ - --enable-flashinfer-allreduce-fusion \ - --mem-fraction-static 0.8 + --mem-fraction-static 0.8 \ + --context-length 262144 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: # Optional instance volume for model and runtime caches @@ -238,11 +223,8 @@ You can configure the number of replicas as well as the auto-scaling rules. optional: true resources: - cpu: x86:96.. - memory: 512GB.. shm_size: 16GB - disk: 500GB.. - gpu: H100:80GB:8 + gpu: H100:4 replicas: 1..2 scaling: @@ -258,38 +240,23 @@ You can configure the number of replicas as well as the auto-scaling rules. 
```yaml type: service - name: qwen397-service - - image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x + name: qwen36-service - env: - - HIP_FORCE_DEV_KERNARG=1 - - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 - - SGLANG_DISABLE_CUDNN_CHECK=1 - - SGLANG_INT4_WEIGHT=0 - - SGLANG_MOE_PADDING=1 - - SGLANG_ROCM_DISABLE_LINEARQUANT=0 - - SGLANG_ROCM_FUSED_DECODE_MLA=1 - - SGLANG_SET_CPU_AFFINITY=1 - - SGLANG_USE_AITER=1 - - SGLANG_USE_ROCM700A=1 + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ --tp $DSTACK_GPUS_NUM \ --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ --mem-fraction-static 0.8 \ - --context-length 262144 \ - --attention-backend triton \ - --disable-cuda-graph \ - --fp8-gemm-backend aiter \ - --port 30000 + --context-length 262144 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: # Optional instance volume for model and runtime caches @@ -298,11 +265,11 @@ You can configure the number of replicas as well as the auto-scaling rules. optional: true resources: - cpu: x86:52.. - memory: 700GB.. + cpu: 52.. + memory: 896GB.. shm_size: 16GB - disk: 600GB.. - gpu: MI300X:192GB:4 + disk: 450GB.. + gpu: MI300X:4 replicas: 1..2 scaling: diff --git a/docs/examples.md b/docs/examples.md index 04cd5ff0f..31d6521b7 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -153,7 +153,7 @@ hide: SGLang

- Deploy DeepSeek distilled models with SGLang + Deploy Qwen3.6-27B with SGLang

- Deploy Llama 3.1 with vLLM + Deploy Qwen3.6-27B with vLLM

+
```yaml type: service - name: qwen397 + name: qwen36 image: lmsysorg/sglang:v0.5.10.post1 commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ --port 30000 \ --tp $DSTACK_GPUS_NUM \ --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ - --enable-flashinfer-allreduce-fusion \ - --mem-fraction-static 0.8 + --mem-fraction-static 0.8 \ + --context-length 262144 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: - instance_path: /root/.cache @@ -43,52 +43,34 @@ Here's an example of a service that deploys optional: true resources: - cpu: x86:96.. - memory: 512GB.. shm_size: 16GB - disk: 500GB.. - gpu: H100:80GB:8 + gpu: H100:4 ```
=== "AMD" -
+
```yaml type: service - name: qwen397 - - image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x + name: qwen36 - env: - - HIP_FORCE_DEV_KERNARG=1 - - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 - - SGLANG_DISABLE_CUDNN_CHECK=1 - - SGLANG_INT4_WEIGHT=0 - - SGLANG_MOE_PADDING=1 - - SGLANG_ROCM_DISABLE_LINEARQUANT=0 - - SGLANG_ROCM_FUSED_DECODE_MLA=1 - - SGLANG_SET_CPU_AFFINITY=1 - - SGLANG_USE_AITER=1 - - SGLANG_USE_ROCM700A=1 + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ --tp $DSTACK_GPUS_NUM \ --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ --mem-fraction-static 0.8 \ - --context-length 262144 \ - --attention-backend triton \ - --disable-cuda-graph \ - --fp8-gemm-backend aiter \ - --port 30000 + --context-length 262144 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: - instance_path: /root/.cache @@ -96,24 +78,26 @@ Here's an example of a service that deploys optional: true resources: - cpu: x86:52.. - memory: 700GB.. + cpu: 52.. + memory: 896GB.. shm_size: 16GB - disk: 600GB.. - gpu: MI300X:192GB:4 + disk: 450GB.. + gpu: MI300X:4 ```
-The AMD example uses the exact validated MI300X configuration for this model, -including the ROCm/AITER settings required for stable FP8 serving. +The AMD example keeps the deployment close to the upstream Qwen and SGLang +guidance: a pinned ROCm image, tensor parallelism across all four GPUs, and the +standard `qwen3` reasoning parser without extra ROCm-specific tuning flags. +The first startup on MI300X can take longer while SGLang compiles ROCm kernels. -Save one of the configurations above as `qwen397.dstack.yml`, then use the +Save one of the configurations above as `qwen36.dstack.yml`, then use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command.
```shell -$ dstack apply -f qwen397.dstack.yml +$ dstack apply -f qwen36.dstack.yml ```
@@ -123,26 +107,29 @@ If no gateway is created, the service endpoint will be available at ` ```shell -curl http://127.0.0.1:3000/proxy/services/main/qwen397/v1/chat/completions \ +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ -X POST \ -H 'Authorization: Bearer <dstack token>' \ -H 'Content-Type: application/json' \ -d '{ - "model": "Qwen/Qwen3.5-397B-A17B-FP8", + "model": "Qwen/Qwen3.6-27B", "messages": [ { "role": "user", "content": "A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. How much does the ball cost? Answer with just the dollar amount." } ], - "chat_template_kwargs": {"enable_thinking": true}, "separate_reasoning": true, "max_tokens": 1024 }' ```
-> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen397./`. +Qwen3.6 uses thinking mode by default. To disable thinking, pass +`"chat_template_kwargs": {"enable_thinking": false}` in the request body. To +enable tool calling, add `--tool-call-parser qwen3_coder` to the serve command. + +> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. ## Configuration options @@ -232,4 +219,4 @@ Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics ## What's next? 1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways) -2. Browse the [Qwen 3.5 SGLang cookbook](https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5) and the [SGLang server arguments reference](https://docs.sglang.ai/advanced_features/server_arguments.html) +2. Browse the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) and the [SGLang server arguments reference](https://docs.sglang.ai/advanced_features/server_arguments.html) diff --git a/examples/inference/vllm/README.md b/examples/inference/vllm/README.md index 7497af669..75d6add9b 100644 --- a/examples/inference/vllm/README.md +++ b/examples/inference/vllm/README.md @@ -1,39 +1,39 @@ --- title: vLLM -description: Deploying Qwen3.5-397B-A17B-FP8 using vLLM on NVIDIA GPUs +description: Deploying Qwen3.6-27B using vLLM on NVIDIA and AMD GPUs --- # vLLM -This example shows how to deploy `Qwen/Qwen3.5-397B-A17B-FP8` using +This example shows how to deploy `Qwen/Qwen3.6-27B` using [vLLM](https://docs.vllm.ai/en/latest/) and `dstack`. ## Apply a configuration Here's an example of a service that deploys -`Qwen/Qwen3.5-397B-A17B-FP8` using vLLM. 
+`Qwen/Qwen3.6-27B` using vLLM. === "NVIDIA" -
+
```yaml type: service - name: qwen397 + name: qwen36 image: vllm/vllm-openai:v0.19.1 commands: - | - vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \ + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ --port 8000 \ --tensor-parallel-size $DSTACK_GPUS_NUM \ --max-model-len 262144 \ - --reasoning-parser qwen3 \ - --language-model-only + --reasoning-parser qwen3 port: 8000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: - instance_path: /root/.cache @@ -41,26 +41,60 @@ Here's an example of a service that deploys optional: true resources: - cpu: x86:96.. - memory: 512GB.. shm_size: 16GB - disk: 500GB.. - gpu: H100:80GB:8 + gpu: H100:4 ```
-The NVIDIA example serves `Qwen/Qwen3.5-397B-A17B-FP8` on `8x H100` GPUs using -vLLM with tensor parallelism enabled. It uses `--language-model-only` because -`Qwen/Qwen3.5-397B-A17B-FP8` is a text-only model. +=== "AMD" -Save the configuration above as `qwen397.dstack.yml`, then use the +
+ + ```yaml + type: service + name: qwen36 + + image: vllm/vllm-openai-rocm:v0.19.1 + + commands: + - | + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size $DSTACK_GPUS_NUM \ + --max-model-len 262144 \ + --reasoning-parser qwen3 + + port: 8000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` + +
+ +Qwen3.6-27B is a multimodal model. For text-only workloads, add +`--language-model-only` to free more memory for the KV cache. To enable tool +calling, add `--enable-auto-tool-choice --tool-call-parser qwen3_coder`. + +Save one of the configurations above as `qwen36.dstack.yml`, then use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command.
```shell -$ dstack apply -f qwen397.dstack.yml +$ dstack apply -f qwen36.dstack.yml ```
@@ -70,12 +104,12 @@ If no gateway is created, the service endpoint will be available at ` ```shell -curl http://127.0.0.1:3000/proxy/services/main/qwen397/v1/chat/completions \ +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ -X POST \ -H 'Authorization: Bearer <dstack token>' \ -H 'Content-Type: application/json' \ -d '{ - "model": "Qwen/Qwen3.5-397B-A17B-FP8", + "model": "Qwen/Qwen3.6-27B", "messages": [ { "role": "user", @@ -88,9 +122,9 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen397/v1/chat/completions \
-> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen397./`. +> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. ## What's next? 1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways) -2. Browse the [SGLang](https://dstack.ai/examples/inference/sglang/) and [NIM](https://dstack.ai/examples/inference/nim/) examples +2. Browse the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) and the [SGLang](https://dstack.ai/examples/inference/sglang/) example From 062d5e148c6b5a86bb56a1151250e150bb60936f Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:32:35 +0200 Subject: [PATCH 02/11] Update quickstart service example to Qwen 3.6 --- docs/docs/quickstart.md | 52 ++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/docs/docs/quickstart.md b/docs/docs/quickstart.md index 4cdecae5e..4b12e56fe 100644 --- a/docs/docs/quickstart.md +++ b/docs/docs/quickstart.md @@ -219,27 +219,27 @@ description: Quick guide to creating fleets and submitting runs ```yaml type: service - name: llama31-service - - # If `image` is not specified, dstack uses its default image - python: "3.11" - #image: dstackai/base:py3.13-0.7-cuda-12.1 - - # Required environment variables - env: - - HF_TOKEN + name: qwen36-service + + image: vllm/vllm-openai:v0.19.1 + commands: - - pip install vllm - - vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct --max-model-len 4096 - # Expose the vllm server port + - | + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 8000 \ + --max-model-len 32768 \ + --reasoning-parser qwen3 + # Expose the vLLM server port 
port: 8000 # Specify a name if it's an OpenAI-compatible model - model: meta-llama/Meta-Llama-3.1-8B-Instruct - + model: Qwen/Qwen3.6-27B + # Required resources resources: - gpu: 24GB + shm_size: 16GB + gpu: H100 ```
@@ -249,22 +249,20 @@ description: Quick guide to creating fleets and submitting runs
```shell - $ HF_TOKEN=... $ dstack apply -f service.dstack.yml - - # BACKEND REGION INSTANCE RESOURCES SPOT PRICE - 1 aws us-west-2 g5.4xlarge 16xCPU, 64GB, 1xA10G (24GB) yes $0.22 - 2 aws us-east-2 g6.xlarge 4xCPU, 16GB, 1xL4 (24GB) yes $0.27 - 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB) yes $0.27 - - Submit the run llama31-service? [y/n]: y - - Provisioning `llama31-service`... + + # BACKEND REGION INSTANCE RESOURCES SPOT PRICE + 1 nebius eu-north1 gpu-h100-sxm 16xCPU, 250GB, 1xH100 (80GB) no $2.95 + 2 runpod US-CA-2 NVIDIA H100 80GB HBM3 64xCPU, 1004GB, 1xH100 (80GB) no $2.99 + + Submit the run qwen36-service? [y/n]: y + + Provisioning `qwen36-service`... ---> 100% Service is published at: - http://localhost:3000/proxy/services/main/llama31-service/ - Model meta-llama/Meta-Llama-3.1-8B-Instruct is published at: + http://localhost:3000/proxy/services/main/qwen36-service/ + Model Qwen/Qwen3.6-27B is published at: http://localhost:3000/proxy/models/main/ ``` From d88df45a079258e9edb4318e03aa05f8f7aea46f Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:37:04 +0200 Subject: [PATCH 03/11] Refresh AMD Qwen 3.6 deployment example --- examples/accelerators/amd/README.md | 143 ++++++++++++++++------------ 1 file changed, 84 insertions(+), 59 deletions(-) diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md index 36be8044e..e267f6138 100644 --- a/examples/accelerators/amd/README.md +++ b/examples/accelerators/amd/README.md @@ -1,6 +1,6 @@ --- title: AMD -description: Deploying and fine-tuning models on AMD MI300X GPUs using vLLM, TRL, and Axolotl +description: Deploying and fine-tuning models on AMD MI300X GPUs using SGLang, vLLM, TRL, and Axolotl --- # AMD @@ -11,8 +11,49 @@ with on-prem AMD GPUs or configuring a backend that offers AMD GPUs such as the ## Deployment -vLLM supports AMD GPUs. 
Here's an example of a [service](https://dstack.ai/docs/services) that deploys
-Llama 3.1 70B in FP16 using [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html).
+Here are examples of [services](https://dstack.ai/docs/services) that deploy
+`Qwen/Qwen3.6-27B` on AMD MI300X GPUs using
+[SGLang](https://github.com/sgl-project/sglang) and
+[vLLM](https://docs.vllm.ai/en/latest/).
+
+=== "SGLang"
+
+ + ```yaml + type: service + name: qwen36-service-sglang-amd + + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` + +
=== "vLLM" @@ -20,63 +61,46 @@ Llama 3.1 70B in FP16 using [vLLM](https://docs.vllm.ai/en/latest/getting_starte ```yaml type: service - name: llama31-service-vllm-amd + name: qwen36-service-vllm-amd + + image: vllm/vllm-openai-rocm:v0.19.1 - # Using Runpod's ROCm Docker image - image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04 - # Required environment variables - env: - - HF_TOKEN - - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct - - MAX_MODEL_LEN=126192 - # Commands of the task commands: - - export PATH=/opt/conda/envs/py_3.10/bin:$PATH - - wget https://github.com/ROCm/hipBLAS/archive/refs/tags/rocm-6.1.0.zip - - unzip rocm-6.1.0.zip - - cd hipBLAS-rocm-6.1.0 - - python rmake.py - - cd .. - - git clone https://github.com/vllm-project/vllm.git - - cd vllm - - pip install triton - - pip uninstall torch -y - - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 - - pip install /opt/rocm/share/amd_smi - - pip install --upgrade numba scipy huggingface-hub[cli] - - pip install "numpy<2" - - pip install -r requirements-rocm.txt - - wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib - - rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* - - export PYTORCH_ROCM_ARCH="gfx90a;gfx942" - - wget https://dstack-binaries.s3.amazonaws.com/vllm-0.6.0%2Brocm614-cp310-cp310-linux_x86_64.whl - - pip install vllm-0.6.0+rocm614-cp310-cp310-linux_x86_64.whl - - vllm serve $MODEL_ID --max-model-len $MAX_MODEL_LEN --port 8000 - # Service port + - | + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size $DSTACK_GPUS_NUM \ + --max-model-len 262144 \ + --reasoning-parser qwen3 + port: 8000 - # Register the model - model: meta-llama/Meta-Llama-3.1-70B-Instruct + model: Qwen/Qwen3.6-27B - # Uncomment to leverage spot instances - #spot_policy: auto + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true 
resources: - gpu: MI300X - disk: 200GB + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 ```
- Note, maximum size of vLLM’s `KV cache` is 126192, consequently we must set `MAX_MODEL_LEN` to 126192. Adding `/opt/conda/envs/py_3.10/bin` to PATH ensures we use the Python 3.10 environment necessary for the pre-built binaries compiled specifically for this version. - - > To speed up the `vLLM-ROCm` installation, this example uses a pre-built binary from S3. - !!! info "Docker image" - If you want to use AMD, specifying `image` is currently required. This must be an image that includes - ROCm drivers. + AMD deployments require specifying an image that already includes ROCm + drivers. The SGLang and vLLM examples above use pinned ROCm images. To request multiple GPUs, specify the quantity after the GPU name, separated by a colon, e.g., `MI300X:4`. +If you're using multiple AMD nodes, validate cluster networking with the +[NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) example. + ## Fine-tuning === "TRL" @@ -189,28 +213,29 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by ## Running a configuration -Once a configuration is ready, save it to a `.dstack.yml` file, then run -`dstack apply -f `, and `dstack` will automatically provision the -cloud resources and run the configuration. +Once a configuration is ready, save it to a `.dstack.yml` file. If your +configuration references environment variables such as `HF_TOKEN` or +`WANDB_API_KEY`, export them first. Then run +`dstack apply -f `, and `dstack` will automatically +provision the cloud resources and run the configuration.
```shell -$ HF_TOKEN=... -$ WANDB_API_KEY=... -$ WANDB_PROJECT=... -$ WANDB_NAME=axolotl-amd-llama31-train -$ HUB_MODEL_ID=... -$ dstack apply -f service.dstack.yml +$ dstack apply -f ```
## What's next? -1. Browse [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html#build-from-source-rocm), +1. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) + and [vLLM](https://dstack.ai/examples/inference/vllm/) examples, plus [Axolotl](https://github.com/ROCm/rocm-blogs/tree/release/blogs/artificial-intelligence/axolotl), - [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html) and - [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes) -2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and + [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html), + and [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes) +2. Run [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) + to validate multi-node AMD cluster networking. +3. Check [dev environments](https://dstack.ai/docs/dev-environments), + [tasks](https://dstack.ai/docs/tasks), and [services](https://dstack.ai/docs/services). 
From 90ef47b88b7a11054fbf13bf9bf34e2e633ffcc0 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:41:37 +0200 Subject: [PATCH 04/11] Add Qwen 3.6 model example docs --- docs/examples.md | 17 ++- docs/examples/models/qwen36/index.md | 0 examples/models/qwen36/README.md | 168 +++++++++++++++++++++++++++ mkdocs.yml | 4 + 4 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 docs/examples/models/qwen36/index.md create mode 100644 examples/models/qwen36/README.md diff --git a/docs/examples.md b/docs/examples.md index 31d6521b7..9d75574dc 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -1,6 +1,6 @@ --- title: Examples -description: Collection of examples for training, inference, and clusters +description: Collection of examples for models, training, inference, and clusters #template: examples.html hide: # - navigation @@ -144,6 +144,21 @@ hide:
+## Models + +
+ +

+ Qwen 3.6 +

+ +

+ Deploy Qwen3.6-27B with SGLang on NVIDIA or AMD +

+
+
+ ## Inference
diff --git a/docs/examples/models/qwen36/index.md b/docs/examples/models/qwen36/index.md new file mode 100644 index 000000000..e69de29bb diff --git a/examples/models/qwen36/README.md b/examples/models/qwen36/README.md new file mode 100644 index 000000000..713ac341e --- /dev/null +++ b/examples/models/qwen36/README.md @@ -0,0 +1,168 @@ +--- +title: Qwen 3.6 +description: Deploying Qwen3.6-27B using SGLang on NVIDIA and AMD GPUs +--- + +# Qwen 3.6 + +This example shows how to deploy `Qwen/Qwen3.6-27B` as a +[service](https://dstack.ai/docs/services) using +[SGLang](https://github.com/sgl-project/sglang) and `dstack`. + +## Apply a configuration + +Save one of the following configurations as `qwen36.dstack.yml`. + +=== "NVIDIA" + +
+ + ```yaml + type: service + name: qwen36 + + image: lmsysorg/sglang:v0.5.10.post1 + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + shm_size: 16GB + gpu: H100:4 + ``` + +
+ +=== "AMD" + +
+ + ```yaml + type: service + name: qwen36 + + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` + +
+ +The NVIDIA and AMD configurations above use pinned SGLang images and the same +straightforward 4-GPU layout used across the Qwen 3.6 docs and examples. + +Apply the configuration with +[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md). + +
+ +```shell +$ dstack apply -f qwen36.dstack.yml +``` + +
+ +If no gateway is created, the service endpoint will be available at +`/proxy/services///`. + +
+ +```shell +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <dstack token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen3.6-27B", + "messages": [ + { + "role": "user", + "content": "A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. How much does the ball cost? Answer with just the dollar amount." + } + ], + "max_tokens": 1024 + }' +``` + +
+ +## Thinking mode + +Qwen3.6 uses thinking mode by default. With SGLang, the reasoning stream is +returned separately as `reasoning_content`. + +To disable thinking, pass `chat_template_kwargs` in the request body. + +
+ +```shell +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <dstack token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen3.6-27B", + "messages": [ + { + "role": "user", + "content": "Summarize the benefits of container images in one sentence." + } + ], + "max_tokens": 256, + "chat_template_kwargs": { + "enable_thinking": false + } + }' +``` + +
+ +## What's next? + +1. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) + and [vLLM](https://dstack.ai/examples/inference/vllm/) examples +2. Check the [AMD](https://dstack.ai/examples/accelerators/amd/) example for + more AMD deployment and training configurations +3. Run [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) + if you're validating multi-node cluster networking +4. Read the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) diff --git a/mkdocs.yml b/mkdocs.yml index 8dbe0ad85..f9a22ff37 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -100,8 +100,10 @@ plugins: "docs/fleets.md": "docs/concepts/fleets.md" "docs/examples/llms/llama31.md": "examples/inference/vllm/index.md" "docs/examples/llms/llama32.md": "examples/inference/vllm/index.md" + "docs/examples/llms/qwen36.md": "examples/models/qwen36/index.md" "examples/llms/llama31/index.md": "examples/inference/vllm/index.md" "examples/llms/llama32/index.md": "examples/inference/vllm/index.md" + "examples/llms/qwen36/index.md": "examples/models/qwen36/index.md" "docs/examples/accelerators/amd/index.md": "examples/accelerators/amd/index.md" "docs/examples/deployment/nim/index.md": "examples/inference/nim/index.md" "docs/examples/deployment/vllm/index.md": "examples/inference/vllm/index.md" @@ -285,6 +287,8 @@ nav: - skill.md: https://dstack.ai/skill.md - Examples: - examples.md + - Models: + - Qwen 3.6: examples/models/qwen36/index.md - Single-node training: - TRL: examples/single-node-training/trl/index.md - Axolotl: examples/single-node-training/axolotl/index.md From e54ef23028f0ba6a0153b56c7fb1aacf5af07fd2 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:42:33 +0200 Subject: [PATCH 05/11] Move models section to end of examples --- docs/examples.md | 30 +++++++++++++++--------------- mkdocs.yml | 4 ++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/examples.md 
b/docs/examples.md index 9d75574dc..b3e3e0d42 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -144,21 +144,6 @@ hide:
-## Models - -
- -

- Qwen 3.6 -

- -

- Deploy Qwen3.6-27B with SGLang on NVIDIA or AMD -

-
-
- ## Inference
@@ -200,6 +185,21 @@ hide:
+## Models + +
+ +

+ Qwen 3.6 +

+ +

+ Deploy Qwen3.6-27B with SGLang on NVIDIA or AMD +

+
+
+ ## Accelerators
diff --git a/mkdocs.yml b/mkdocs.yml index f9a22ff37..1baa53015 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -287,8 +287,6 @@ nav: - skill.md: https://dstack.ai/skill.md - Examples: - examples.md - - Models: - - Qwen 3.6: examples/models/qwen36/index.md - Single-node training: - TRL: examples/single-node-training/trl/index.md - Axolotl: examples/single-node-training/axolotl/index.md @@ -312,6 +310,8 @@ nav: - AMD: examples/accelerators/amd/index.md - TPU: examples/accelerators/tpu/index.md - Tenstorrent: examples/accelerators/tenstorrent/index.md + - Models: + - Qwen 3.6: examples/models/qwen36/index.md - Blog: - blog/index.md - Case studies: blog/case-studies.md From 74b186a263d855bb9bfa3e06af3412474f4e82f5 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:43:33 +0200 Subject: [PATCH 06/11] Switch quickstart Qwen service to SGLang --- docs/docs/quickstart.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/docs/quickstart.md b/docs/docs/quickstart.md index 4b12e56fe..6e9be0406 100644 --- a/docs/docs/quickstart.md +++ b/docs/docs/quickstart.md @@ -221,17 +221,17 @@ description: Quick guide to creating fleets and submitting runs type: service name: qwen36-service - image: vllm/vllm-openai:v0.19.1 + image: lmsysorg/sglang:v0.5.10.post1 commands: - | - vllm serve Qwen/Qwen3.6-27B \ + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ --host 0.0.0.0 \ - --port 8000 \ - --max-model-len 32768 \ + --port 30000 \ --reasoning-parser qwen3 - # Expose the vLLM server port - port: 8000 + # Expose the SGLang server port + port: 30000 # Specify a name if it's an OpenAI-compatible model model: Qwen/Qwen3.6-27B From 96e616287d6c415f3bbf5abb2acbd8d66f2fc5f9 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:45:32 +0200 Subject: [PATCH 07/11] Add upstream Qwen 3.6 links to model page --- examples/models/qwen36/README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git 
a/examples/models/qwen36/README.md b/examples/models/qwen36/README.md index 713ac341e..57f37a0a9 100644 --- a/examples/models/qwen36/README.md +++ b/examples/models/qwen36/README.md @@ -159,10 +159,12 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ ## What's next? -1. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) +1. Read the [Qwen/Qwen3.6-27B model card](https://huggingface.co/Qwen/Qwen3.6-27B) +2. Read the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) +3. Read the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) +4. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) and [vLLM](https://dstack.ai/examples/inference/vllm/) examples -2. Check the [AMD](https://dstack.ai/examples/accelerators/amd/) example for +5. Check the [AMD](https://dstack.ai/examples/accelerators/amd/) example for more AMD deployment and training configurations -3. Run [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) +6. Run [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) if you're validating multi-node cluster networking -4. 
Read the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) From c1c07e5c1c42fa1276ee5db1fb1690ab16300bb4 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:46:28 +0200 Subject: [PATCH 08/11] Move AMD cluster note to training section --- examples/accelerators/amd/README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md index e267f6138..ef1f6be58 100644 --- a/examples/accelerators/amd/README.md +++ b/examples/accelerators/amd/README.md @@ -98,11 +98,12 @@ Here are examples of a [service](https://dstack.ai/docs/services) that deploy To request multiple GPUs, specify the quantity after the GPU name, separated by a colon, e.g., `MI300X:4`. -If you're using multiple AMD nodes, validate cluster networking with the -[NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) example. - ## Fine-tuning +If you're planning multi-node AMD training, validate cluster networking first +with the [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) +example. + === "TRL" Below is an example of LoRA fine-tuning Llama 3.1 8B using [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference.html) @@ -234,8 +235,9 @@ $ dstack apply -f [Axolotl](https://github.com/ROCm/rocm-blogs/tree/release/blogs/artificial-intelligence/axolotl), [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html), and [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes) -2. Run [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) - to validate multi-node AMD cluster networking. +2. For multi-node training, run + [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) + to validate AMD cluster networking. 3. 
Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and [services](https://dstack.ai/docs/services). From a2955de9ec2165e777afa08854edd02d1abe457c Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 11:47:11 +0200 Subject: [PATCH 09/11] Remove cluster note from Qwen model page --- examples/models/qwen36/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/models/qwen36/README.md b/examples/models/qwen36/README.md index 57f37a0a9..bc92271b2 100644 --- a/examples/models/qwen36/README.md +++ b/examples/models/qwen36/README.md @@ -166,5 +166,3 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ and [vLLM](https://dstack.ai/examples/inference/vllm/) examples 5. Check the [AMD](https://dstack.ai/examples/accelerators/amd/) example for more AMD deployment and training configurations -6. Run [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) - if you're validating multi-node cluster networking From 6fa22d3e982f27b60617ab358488f9cc2cca6238 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 12:31:52 +0200 Subject: [PATCH 10/11] Minor edit --- examples/accelerators/amd/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md index ef1f6be58..b35b29c1c 100644 --- a/examples/accelerators/amd/README.md +++ b/examples/accelerators/amd/README.md @@ -100,7 +100,7 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by ## Fine-tuning -If you're planning multi-node AMD training, validate cluster networking first +> If you're planning multi-node AMD training, validate cluster networking first with the [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) example. 
From fc51d78ebf04937568c16b4bbd9c0b02c5e0efb6 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 23 Apr 2026 12:34:42 +0200 Subject: [PATCH 11/11] Simplify Qwen 3.6 quickstart service config --- docs/docs/quickstart.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/docs/quickstart.md b/docs/docs/quickstart.md index 6e9be0406..80a98f79b 100644 --- a/docs/docs/quickstart.md +++ b/docs/docs/quickstart.md @@ -238,7 +238,6 @@ description: Quick guide to creating fleets and submitting runs # Required resources resources: - shm_size: 16GB gpu: H100 ```