Commit bdc07cc

Merge branch 'ModelEngine-Group:develop' into develop
2 parents 4b8b8de + a7d051e commit bdc07cc

85 files changed (+6127, -2553 lines)

.github/CODEOWNERS

Lines changed: 3 additions & 3 deletions
@@ -17,7 +17,7 @@
 /ucm/store/mooncakestore @chinesezyc @mag1c-h @ygwpz
 /ucm/store/nfsstore @mag1c-h @ygwpz

-/ucm/integration @qyh111 @harrisonyhq @ygwpz @mag1c-h
+/ucm/integration @qyh111 @harrisonyhq @ygwpz @mag1c-h @hek14

 /ucm/pd @flesher0813 @ygwpz @mag1c-h

@@ -27,12 +27,12 @@

 /docker @harrisonyhq @ygwpz @mag1c-h

-/docs @flesher0813 @ygwpz @mag1c-h @FangRun2 @Tarrei
+/docs @flesher0813 @ygwpz @mag1c-h @FangRun2 @Tarrei @hek14
 /docs/source/user-guide/sparse-attention/esa.md @wangwenxin0312 @hek14 @flesher0813 @ygwpz @mag1c-h @FangRun2 @Tarrei
 /docs/source/user-guide/sparse-attention/gsa.md @Zbm1996 @zbb200819 @yxkyong @HaoLi980405 @flesher0813 @ygwpz @mag1c-h @FangRun2 @Tarrei
 /docs/source/user-guide/sparse-attention/kvcomp.md @leideng @pengwwang @flesher0813 @ygwpz @mag1c-h @FangRun2 @Tarrei
 /docs/source/user-guide/sparse-attention/kvstar.md @saki-daisuki @summer-ai007 @flesher0813 @ygwpz @mag1c-h @FangRun2 @Tarrei

-/examples @harrisonyhq @ygwpz @mag1c-h
+/examples @harrisonyhq @ygwpz @mag1c-h @hek14

 /test @Wwwzff @ygwpz @mag1c-h

.github/workflows/unifiedcache_test.yml

Lines changed: 3 additions & 1 deletion
@@ -49,7 +49,9 @@ jobs:
 set -euo pipefail
 pip install -v -e . --no-build-isolation
 cd \$(pip show vllm | grep Location | awk '{print \$2}') &&
-git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-pc.patch
+git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-aggre.patch
+git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch
 cd /workspace/unified-cache-management
 python3 -m unittest discover -s test
 "

benchmarks/trace_replay.py

Lines changed: 13 additions & 3 deletions
@@ -540,14 +540,16 @@ async def send_group(r=reqs, d=delay):
     flat_requests.extend(reqs)

     group_results = await asyncio.gather(*tasks)
+
+    if pbar is not None:
+        pbar.close()
+    await session.close()
+
     outputs = []
     for res in group_results:
         if isinstance(res, list):
             outputs.extend(res)

-    if pbar is not None:
-        pbar.close()
-
     benchmark_duration = time.perf_counter() - start_time
     metrics, actual_output_lens = calculate_metrics(
         input_requests=flat_requests,

@@ -678,6 +680,14 @@ def main(args: argparse.Namespace):


 if __name__ == "__main__":
+    # Check openpyxl for Excel export
+    try:
+        import openpyxl
+    except ImportError:
+        print("\nMissing package: openpyxl")
+        print("Please install openpyxl via pip install.\n")
+        sys.exit(1)
+
     parser = create_argument_trace()
     args = parser.parse_args()
     main(args)
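The two trace_replay.py hunks above share a common asyncio cleanup pattern: close the progress bar and the aiohttp session as soon as asyncio.gather returns, before post-processing the results, and verify the optional openpyxl dependency before any requests are sent so a missing Excel exporter fails fast rather than at export time. Below is a minimal, self-contained sketch of that pattern; the `replay` and `send_one` helpers, the request list, and the tqdm/aiohttp objects are hypothetical stand-ins, not the script's actual API.

```python
import asyncio
import sys

import aiohttp
from tqdm import tqdm


async def replay(requests):
    # Fail fast if the optional Excel dependency is missing,
    # mirroring the __main__ check added to trace_replay.py.
    try:
        import openpyxl  # noqa: F401
    except ImportError:
        print("\nMissing package: openpyxl")
        print("Please install openpyxl via pip install.\n")
        sys.exit(1)

    session = aiohttp.ClientSession()
    pbar = tqdm(total=len(requests))
    tasks = [asyncio.create_task(send_one(session, r, pbar)) for r in requests]

    group_results = await asyncio.gather(*tasks)

    # Release UI and network resources as soon as every task has finished,
    # before the results are flattened and analyzed.
    if pbar is not None:
        pbar.close()
    await session.close()

    outputs = []
    for res in group_results:
        if isinstance(res, list):
            outputs.extend(res)
    return outputs


async def send_one(session, request, pbar):
    # Placeholder sender; the real script posts each request to a vLLM endpoint.
    await asyncio.sleep(0)
    pbar.update(1)
    return [request]
```

A call such as `asyncio.run(replay(["req-1", "req-2"]))` exercises the whole flow; closing the session before post-processing keeps connections from lingering if the metrics step raises.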

docker/Dockerfile

Lines changed: 3 additions & 1 deletion
@@ -15,6 +15,8 @@ RUN export PLATFORM="cuda" && \

 # Apply patch for vLLM
 RUN cd $(pip show vllm | grep Location | awk '{print $2}') \
-    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-pc.patch \
+    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-aggre.patch \
+    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch

 ENTRYPOINT ["/bin/bash"]

docker/Dockerfile-NPU

Lines changed: 3 additions & 3 deletions
@@ -16,12 +16,12 @@ RUN export PLATFORM="ascend" && \

 # Apply patch for vLLM
 RUN cd /vllm-workspace/vllm \
-    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch \
+    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-pc.patch \
+    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-aggre.patch \
     && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch

 # Apply patch for vLLM-Ascend
 RUN cd /vllm-workspace/vllm-ascend \
-    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch \
-    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt-sparse.patch
+    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch

 CMD ["/bin/bash"]

docs/source/getting-started/installation_gpu.md

Lines changed: 2 additions & 1 deletion
@@ -48,7 +48,8 @@ After installation, please apply patch to ensure uc_connector can be used:

 ```bash
 cd $(pip show vllm | grep Location | awk '{print $2}')
-git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-pc.patch
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-aggre.patch
 git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch
 ```

docs/source/getting-started/installation_npu.md

Lines changed: 14 additions & 1 deletion
@@ -42,7 +42,10 @@ docker run --rm \
 Codes of vLLM and vLLM Ascend are placed in /vllm-workspace, you can refer to [vLLM-Ascend Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more information. After installation, please apply patches to ensure uc_connector can be used:
 ```bash
 cd /vllm-workspace/vllm
-git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-pc.patch
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-aggre.patch
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch
+
 cd /vllm-workspace/vllm-ascend
 git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch
 ```

@@ -59,6 +62,16 @@ pip install -v -e . --no-build-isolation
 cd ..
 ```

+Codes of vLLM and vLLM Ascend are placed in /vllm-workspace, you can refer to [vLLM-Ascend Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more information. After installation, please apply patches to ensure uc_connector can be used:
+```bash
+cd /vllm-workspace/vllm
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+cd /vllm-workspace/vllm-ascend
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch
+```
+
+Refer to these issues [vllm-issue](https://github.com/vllm-project/vllm/issues/21702) and [vllm-ascend-issue](https://github.com/vllm-project/vllm-ascend/issues/2057) to see details of patches' changes.
+
+
 ## Setup from docker
 Download the pre-built docker image provided or build unified-cache-management docker image by commands below:
 ```bash

docs/source/getting-started/quick_start.md

Lines changed: 5 additions & 3 deletions
@@ -21,14 +21,16 @@ Before you start with UCM, please make sure that you have installed UCM correctl

 ## Features Overview

-UCM supports two key features: **Prefix Cache** and **GSA Sparsity**.
+UCM supports two key features: **Prefix Cache** and **Sparse attention**.

 Each feature supports both **Offline Inference** and **Online API** modes.

 For quick start, just follow the [usage](./quick_start.md) guide below to launch your own inference experience;

-For further research, click on the links blow to see more details of each feature:
+For further research on Prefix Cache, more details are available via the link below:
 - [Prefix Cache](../user-guide/prefix-cache/index.md)
+
+Various Sparse Attention features are now available, try GSA Sparsity via the link below:
 - [GSA Sparsity](../user-guide/sparse-attention/gsa.md)

 ## Usage

@@ -47,7 +49,7 @@ python offline_inference.py

 </details>

-<details>
+<details open>
 <summary><b>OpenAI-Compatible Online API</b></summary>

 For online inference , vLLM with our connector can also be deployed as a server that implements the OpenAI API protocol.

docs/source/user-guide/pd-disaggregation/1p1d.md

Lines changed: 10 additions & 9 deletions
@@ -5,16 +5,17 @@ This example demonstrates how to run unified-cache-management with disaggregated

 ## Prerequisites
 - UCM: Installed with reference to the Installation documentation.
-- Hardware: At least 2 GPUs
+- Hardware: At least 2 GPUs or 2 NPUs

 ## Start disaggregated service
-For illustration purposes, let us assume that the model used is Qwen2.5-7B-Instruct.
+For illustration purposes, let us take GPU as an example and assume the model used is Qwen2.5-7B-Instruct.Using ASCEND_RT_VISIBLE_DEVICES instead of CUDA_VISIBLE_DEVICES to specify visible devices when starting service on Ascend platform.

 ### Run prefill server
 Prefiller Launch Command:
 ```bash
 export PYTHONHASHSEED=123456
-CUDA_VISIBLE_DEVICES=0 vllm serve /home/models/Qwen2.5-7B-Instruct \
+export CUDA_VISIBLE_DEVICES=0
+vllm serve /home/models/Qwen2.5-7B-Instruct \
 --max-model-len 20000 \
 --tensor-parallel-size 1 \
 --gpu_memory_utilization 0.87 \

@@ -41,8 +42,9 @@ CUDA_VISIBLE_DEVICES=0 vllm serve /home/models/Qwen2.5-7B-Instruct \
 ### Run decode server
 Decoder Launch Command:
 ```bash
-export PYTHONHASHSEED=123456
-CUDA_VISIBLE_DEVICES=1 vllm serve /home/models/Qwen2.5-7B-Instruct \
+export PYTHONHASHSEED=123456
+export CUDA_VISIBLE_DEVICES=0
+vllm serve /home/models/Qwen2.5-7B-Instruct \
 --max-model-len 20000 \
 --tensor-parallel-size 1 \
 --gpu_memory_utilization 0.87 \

@@ -68,8 +70,8 @@ CUDA_VISIBLE_DEVICES=1 vllm serve /home/models/Qwen2.5-7B-Instruct \
 ### Run proxy server
 Make sure prefill nodes and decode nodes can connect to each other.
 ```bash
-cd vllm-workspace/unified-cache-management/test/
-python3 toy_proxy_server.py --host localhost --port 7802 --prefiller-host <prefill-node-ip> --prefiller-port 7800 --decoder-host <decode-node-ip> --decoder-port 7801
+cd /vllm-workspace/unified-cache-management/ucm/pd
+python3 toy_proxy_server.py --pd-disaggregation --host localhost --port 7802 --prefiller-host <prefill-node-ip> --prefiller-port 7800 --decoder-host <decode-node-ip> --decoder-port 7801
 ```

 ## Testing and Benchmarking

@@ -88,8 +90,7 @@ curl http://localhost:7802/v1/completions \
 ### Benchmark Test
 Use the benchmark scripts provided by vLLM.
 ```bash
-cd /vllm-workspace/vllm/benchmarks
-python3 benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --dataset-name random \
 --random-input-len 4096 \
docs/source/user-guide/pd-disaggregation/npgd.md

Lines changed: 4 additions & 4 deletions
@@ -50,7 +50,8 @@ vllm serve /home/models/Qwen2.5-7B-Instruct \
 Decoder Launch Command:
 ```bash
 export PYTHONHASHSEED=123456
-CUDA_VISIBLE_DEVICES=0 vllm serve /home/models/Qwen2.5-7B-Instruct \
+export CUDA_VISIBLE_DEVICES=0
+vllm serve /home/models/Qwen2.5-7B-Instruct \
 --max-model-len 20000 \
 --tensor-parallel-size 1 \
 --gpu_memory_utilization 0.87 \

@@ -77,7 +78,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve /home/models/Qwen2.5-7B-Instruct \
 ### Run proxy server
 Make sure prefill nodes and decode nodes can connect to each other.
 ```bash
-cd vllm-workspace/unified-cache-management/test/
+cd /vllm-workspace/unified-cache-management/ucm/pd
 python3 toy_proxy_server.py --host localhost --port 7802 --prefiller-host <prefill-node-ip> --prefiller-port 7800 --decoder-host <decode-node-ip> --decoder-port 7801
 ```

@@ -97,8 +98,7 @@ curl http://localhost:7802/v1/completions \
 ### Benchmark Test
 Use the benchmark scripts provided by vLLM.
 ```bash
-cd /vllm-workspace/vllm/benchmarks
-python3 benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --dataset-name random \
 --random-input-len 4096 \
