Commit bdc07cc

Merge branch 'ModelEngine-Group:develop' into develop
2 parents 4b8b8de + a7d051e commit bdc07cc

85 files changed (+6127, -2553 lines)

.github/CODEOWNERS

Lines changed: 3 additions & 3 deletions
@@ -17,7 +17,7 @@
 /ucm/store/mooncakestore @chinesezyc @mag1c-h @ygwpz
 /ucm/store/nfsstore @mag1c-h @ygwpz

-/ucm/integration @qyh111 @harrisonyhq @ygwpz @mag1c-h
+/ucm/integration @qyh111 @harrisonyhq @ygwpz @mag1c-h @hek14

 /ucm/pd @flesher0813 @ygwpz @mag1c-h

@@ -27,12 +27,12 @@

 /docker @harrisonyhq @ygwpz @mag1c-h

-/docs @flesher0813 @ygwpz @mag1c-h @FangRun2 @Tarrei
+/docs @flesher0813 @ygwpz @mag1c-h @FangRun2 @Tarrei @hek14
 /docs/source/user-guide/sparse-attention/esa.md @wangwenxin0312 @hek14 @flesher0813 @ygwpz @mag1c-h @FangRun2 @Tarrei
 /docs/source/user-guide/sparse-attention/gsa.md @Zbm1996 @zbb200819 @yxkyong @HaoLi980405 @flesher0813 @ygwpz @mag1c-h @FangRun2 @Tarrei
 /docs/source/user-guide/sparse-attention/kvcomp.md @leideng @pengwwang @flesher0813 @ygwpz @mag1c-h @FangRun2 @Tarrei
 /docs/source/user-guide/sparse-attention/kvstar.md @saki-daisuki @summer-ai007 @flesher0813 @ygwpz @mag1c-h @FangRun2 @Tarrei

-/examples @harrisonyhq @ygwpz @mag1c-h
+/examples @harrisonyhq @ygwpz @mag1c-h @hek14

 /test @Wwwzff @ygwpz @mag1c-h

.github/workflows/unifiedcache_test.yml

Lines changed: 3 additions & 1 deletion
@@ -49,7 +49,9 @@ jobs:
 set -euo pipefail
 pip install -v -e . --no-build-isolation
 cd \$(pip show vllm | grep Location | awk '{print \$2}') &&
-git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-pc.patch
+git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-aggre.patch
+git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch
 cd /workspace/unified-cache-management
 python3 -m unittest discover -s test
 "

benchmarks/trace_replay.py

Lines changed: 13 additions & 3 deletions
@@ -540,14 +540,16 @@ async def send_group(r=reqs, d=delay):
     flat_requests.extend(reqs)

     group_results = await asyncio.gather(*tasks)
+
+    if pbar is not None:
+        pbar.close()
+    await session.close()
+
     outputs = []
     for res in group_results:
         if isinstance(res, list):
             outputs.extend(res)

-    if pbar is not None:
-        pbar.close()
-
     benchmark_duration = time.perf_counter() - start_time
     metrics, actual_output_lens = calculate_metrics(
         input_requests=flat_requests,

@@ -678,6 +680,14 @@ def main(args: argparse.Namespace):


 if __name__ == "__main__":
+    # Check openpyxl for Excel export
+    try:
+        import openpyxl
+    except ImportError:
+        print("\nMissing package: openpyxl")
+        print("Please install openpyxl via pip install.\n")
+        sys.exit(1)
+
     parser = create_argument_trace()
     args = parser.parse_args()
     main(args)
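The two trace_replay.py hunks above share a common asyncio cleanup pattern: close the progress bar and the aiohttp session as soon as asyncio.gather returns, before post-processing the results, and verify the optional openpyxl dependency before any requests are sent so a missing Excel exporter fails fast rather than at export time. Below is a minimal, self-contained sketch of that pattern; the `replay` and `send_one` helpers, the request list, and the tqdm/aiohttp objects are hypothetical stand-ins, not the script's actual API.

```python
import asyncio
import sys

import aiohttp
from tqdm import tqdm


async def replay(requests):
    # Fail fast if the optional Excel dependency is missing,
    # mirroring the __main__ check added to trace_replay.py.
    try:
        import openpyxl  # noqa: F401
    except ImportError:
        print("\nMissing package: openpyxl")
        print("Please install openpyxl via pip install.\n")
        sys.exit(1)

    session = aiohttp.ClientSession()
    pbar = tqdm(total=len(requests))
    tasks = [asyncio.create_task(send_one(session, r, pbar)) for r in requests]

    group_results = await asyncio.gather(*tasks)

    # Release UI and network resources as soon as every task has finished,
    # before the results are flattened and analyzed.
    if pbar is not None:
        pbar.close()
    await session.close()

    outputs = []
    for res in group_results:
        if isinstance(res, list):
            outputs.extend(res)
    return outputs


async def send_one(session, request, pbar):
    # Placeholder sender; the real script posts each request to a vLLM endpoint.
    await asyncio.sleep(0)
    pbar.update(1)
    return [request]
```

A call such as `asyncio.run(replay(["req-1", "req-2"]))` exercises the whole flow; closing the session before post-processing keeps connections from lingering if the metrics step raises.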

docker/Dockerfile

Lines changed: 3 additions & 1 deletion
@@ -15,6 +15,8 @@ RUN export PLATFORM="cuda" && \

 # Apply patch for vLLM
 RUN cd $(pip show vllm | grep Location | awk '{print $2}') \
-    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-pc.patch \
+    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-aggre.patch \
+    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch

 ENTRYPOINT ["/bin/bash"]

docker/Dockerfile-NPU

Lines changed: 3 additions & 3 deletions
@@ -16,12 +16,12 @@ RUN export PLATFORM="ascend" && \

 # Apply patch for vLLM
 RUN cd /vllm-workspace/vllm \
-    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch \
+    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-pc.patch \
+    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-aggre.patch \
     && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch

 # Apply patch for vLLM-Ascend
 RUN cd /vllm-workspace/vllm-ascend \
-    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch \
-    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt-sparse.patch
+    && git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch

 CMD ["/bin/bash"]

docs/source/getting-started/installation_gpu.md

Lines changed: 2 additions & 1 deletion
@@ -48,7 +48,8 @@ After installation, please apply patch to ensure uc_connector can be used:

 ```bash
 cd $(pip show vllm | grep Location | awk '{print $2}')
-git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-pc.patch
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-aggre.patch
 git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch
 ```

docs/source/getting-started/installation_npu.md

Lines changed: 14 additions & 1 deletion
@@ -42,7 +42,10 @@ docker run --rm \
 Codes of vLLM and vLLM Ascend are placed in /vllm-workspace, you can refer to [vLLM-Ascend Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more information. After installation, please apply patches to ensure uc_connector can be used:
 ```bash
 cd /vllm-workspace/vllm
-git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-pc.patch
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-aggre.patch
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch
+
 cd /vllm-workspace/vllm-ascend
 git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch
 ```

@@ -59,6 +62,16 @@ pip install -v -e . --no-build-isolation
 cd ..
 ```

+Codes of vLLM and vLLM Ascend are placed in /vllm-workspace, you can refer to [vLLM-Ascend Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more information. After installation, please apply patches to ensure uc_connector can be used:
+```bash
+cd /vllm-workspace/vllm
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+cd /vllm-workspace/vllm-ascend
+git apply /vllm-workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch
+```
+
+Refer to these issues [vllm-issue](https://github.com/vllm-project/vllm/issues/21702) and [vllm-ascend-issue](https://github.com/vllm-project/vllm-ascend/issues/2057) to see details of patches' changes.
+
+
 ## Setup from docker
 Download the pre-built docker image provided or build unified-cache-management docker image by commands below:
 ```bash

docs/source/getting-started/quick_start.md

Lines changed: 5 additions & 3 deletions
@@ -21,14 +21,16 @@ Before you start with UCM, please make sure that you have installed UCM correctl

 ## Features Overview

-UCM supports two key features: **Prefix Cache** and **GSA Sparsity**.
+UCM supports two key features: **Prefix Cache** and **Sparse attention**.

 Each feature supports both **Offline Inference** and **Online API** modes.

 For quick start, just follow the [usage](./quick_start.md) guide below to launch your own inference experience;

-For further research, click on the links blow to see more details of each feature:
+For further research on Prefix Cache, more details are available via the link below:
 - [Prefix Cache](../user-guide/prefix-cache/index.md)
+
+Various Sparse Attention features are now available, try GSA Sparsity via the link below:
 - [GSA Sparsity](../user-guide/sparse-attention/gsa.md)

 ## Usage

@@ -47,7 +49,7 @@ python offline_inference.py

 </details>

-<details>
+<details open>
 <summary><b>OpenAI-Compatible Online API</b></summary>

 For online inference , vLLM with our connector can also be deployed as a server that implements the OpenAI API protocol.

docs/source/user-guide/pd-disaggregation/1p1d.md

Lines changed: 10 additions & 9 deletions
@@ -5,16 +5,17 @@ This example demonstrates how to run unified-cache-management with disaggregated

 ## Prerequisites
 - UCM: Installed with reference to the Installation documentation.
-- Hardware: At least 2 GPUs
+- Hardware: At least 2 GPUs or 2 NPUs

 ## Start disaggregated service
-For illustration purposes, let us assume that the model used is Qwen2.5-7B-Instruct.
+For illustration purposes, let us take GPU as an example and assume the model used is Qwen2.5-7B-Instruct.Using ASCEND_RT_VISIBLE_DEVICES instead of CUDA_VISIBLE_DEVICES to specify visible devices when starting service on Ascend platform.

 ### Run prefill server
 Prefiller Launch Command:
 ```bash
 export PYTHONHASHSEED=123456
-CUDA_VISIBLE_DEVICES=0 vllm serve /home/models/Qwen2.5-7B-Instruct \
+export CUDA_VISIBLE_DEVICES=0
+vllm serve /home/models/Qwen2.5-7B-Instruct \
 --max-model-len 20000 \
 --tensor-parallel-size 1 \
 --gpu_memory_utilization 0.87 \

@@ -41,8 +42,9 @@ CUDA_VISIBLE_DEVICES=0 vllm serve /home/models/Qwen2.5-7B-Instruct \
 ### Run decode server
 Decoder Launch Command:
 ```bash
-export PYTHONHASHSEED=123456
-CUDA_VISIBLE_DEVICES=1 vllm serve /home/models/Qwen2.5-7B-Instruct \
+export PYTHONHASHSEED=123456
+export CUDA_VISIBLE_DEVICES=0
+vllm serve /home/models/Qwen2.5-7B-Instruct \
 --max-model-len 20000 \
 --tensor-parallel-size 1 \
 --gpu_memory_utilization 0.87 \

@@ -68,8 +70,8 @@ CUDA_VISIBLE_DEVICES=1 vllm serve /home/models/Qwen2.5-7B-Instruct \
 ### Run proxy server
 Make sure prefill nodes and decode nodes can connect to each other.
 ```bash
-cd vllm-workspace/unified-cache-management/test/
-python3 toy_proxy_server.py --host localhost --port 7802 --prefiller-host <prefill-node-ip> --prefiller-port 7800 --decoder-host <decode-node-ip> --decoder-port 7801
+cd /vllm-workspace/unified-cache-management/ucm/pd
+python3 toy_proxy_server.py --pd-disaggregation --host localhost --port 7802 --prefiller-host <prefill-node-ip> --prefiller-port 7800 --decoder-host <decode-node-ip> --decoder-port 7801
 ```

 ## Testing and Benchmarking

@@ -88,8 +90,7 @@ curl http://localhost:7802/v1/completions \
 ### Benchmark Test
 Use the benchmark scripts provided by vLLM.
 ```bash
-cd /vllm-workspace/vllm/benchmarks
-python3 benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --dataset-name random \
 --random-input-len 4096 \
docs/source/user-guide/pd-disaggregation/npgd.md

Lines changed: 4 additions & 4 deletions
@@ -50,7 +50,8 @@ vllm serve /home/models/Qwen2.5-7B-Instruct \
 Decoder Launch Command:
 ```bash
 export PYTHONHASHSEED=123456
-CUDA_VISIBLE_DEVICES=0 vllm serve /home/models/Qwen2.5-7B-Instruct \
+export CUDA_VISIBLE_DEVICES=0
+vllm serve /home/models/Qwen2.5-7B-Instruct \
 --max-model-len 20000 \
 --tensor-parallel-size 1 \
 --gpu_memory_utilization 0.87 \

@@ -77,7 +78,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve /home/models/Qwen2.5-7B-Instruct \
 ### Run proxy server
 Make sure prefill nodes and decode nodes can connect to each other.
 ```bash
-cd vllm-workspace/unified-cache-management/test/
+cd /vllm-workspace/unified-cache-management/ucm/pd
 python3 toy_proxy_server.py --host localhost --port 7802 --prefiller-host <prefill-node-ip> --prefiller-port 7800 --decoder-host <decode-node-ip> --decoder-port 7801
 ```

@@ -97,8 +98,7 @@ curl http://localhost:7802/v1/completions \
 ### Benchmark Test
 Use the benchmark scripts provided by vLLM.
 ```bash
-cd /vllm-workspace/vllm/benchmarks
-python3 benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --dataset-name random \
 --random-input-len 4096 \
