|
| 1 | +""" |
| 2 | +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. |
| 3 | +# |
| 4 | +# Licensed under the Apache License, Version 2.0 (the "License");
| 5 | +# you may not use this file except in compliance with the License. |
| 6 | +# You may obtain a copy of the License at |
| 7 | +# |
| 8 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +# |
| 10 | +# Unless required by applicable law or agreed to in writing, software |
| 11 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +# See the License for the specific language governing permissions and |
| 14 | +# limitations under the License. |
| 15 | +""" |
| 16 | + |
1 | 17 | import paddle |
2 | 18 |
|
3 | 19 | from fastdeploy.platforms import current_platform |
4 | 20 |
|
5 | | -if current_platform.is_cuda(): |
6 | | - from fastdeploy.model_executor.ops.gpu import ( |
7 | | - cuda_host_alloc, |
8 | | - cuda_host_free, |
9 | | - get_data_ptr_ipc, |
10 | | - get_output_kv_signal, |
11 | | - ipc_sent_key_value_cache_by_remote_ptr, |
12 | | - ipc_sent_key_value_cache_by_remote_ptr_block_sync, |
13 | | - set_data_ipc, |
14 | | - share_external_data, |
15 | | - swap_cache_all_layers, |
16 | | - unset_data_ipc, |
17 | | - ) |
18 | | - |
19 | | - memory_allocated = paddle.device.cuda.memory_allocated |
20 | | - |
21 | | - def get_peer_mem_addr(*args, **kwargs): |
22 | | - raise RuntimeError("CUDA no need of get_peer_mem_addr!") |
23 | | - |
24 | | -elif current_platform.is_xpu(): |
25 | | - from fastdeploy.model_executor.ops.xpu import ( |
26 | | - cuda_host_alloc, |
27 | | - cuda_host_free, |
28 | | - get_output_kv_signal, |
29 | | - get_peer_mem_addr, |
30 | | - set_data_ipc, |
31 | | - share_external_data, |
32 | | - swap_cache_all_layers, |
33 | | - ) |
34 | | - |
35 | | - unset_data_ipc = None |
36 | | - memory_allocated = paddle.device.xpu.memory_allocated |
37 | | - |
38 | | - def get_data_ptr_ipc(*args, **kwargs): |
39 | | - raise RuntimeError("XPU get_data_ptr_ipc UNIMPLENENTED!") |
40 | | - |
41 | | - def ipc_sent_key_value_cache_by_remote_ptr(*args, **kwargs): |
42 | | - raise RuntimeError("XPU ipc_sent_key_value_cache_by_remote_ptr UNIMPLENENTED") |
43 | | - |
44 | | - def ipc_sent_key_value_cache_by_remote_ptr_block_sync(*args, **kwargs): |
45 | | - raise RuntimeError("XPU No ipc_sent_key_value_cache_by_remote_ptr UNIMPLENENTED") |
46 | | - |
47 | | -else: |
48 | | - raise RuntimeError("Prefix cache ops only supported CUDA nor XPU platform ") |
49 | | - |
50 | | - |
51 | | -def set_device(device): |
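| | +# Import the platform-specific prefix-cache IPC ops. If the custom op
| | +# library is unavailable, every symbol falls back to None (see the
| | +# except branch below) so this module can still be imported.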
| 21 | +try: |
52 | 22 | if current_platform.is_cuda(): |
53 | | - paddle.set_device(f"gpu:{device}") |
| 23 | + from fastdeploy.model_executor.ops.gpu import ( |
| 24 | + cuda_host_alloc, |
| 25 | + cuda_host_free, |
| 26 | + get_data_ptr_ipc, |
| 27 | + get_output_kv_signal, |
| 28 | + ipc_sent_key_value_cache_by_remote_ptr, |
| 29 | + ipc_sent_key_value_cache_by_remote_ptr_block_sync, |
| 30 | + set_data_ipc, |
| 31 | + share_external_data, |
| 32 | + swap_cache_all_layers, |
| 33 | + unset_data_ipc, |
| 34 | + ) |
| 35 | + |
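| | + # Reuse Paddle's CUDA allocator statistics for memory accounting.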
| 36 | + memory_allocated = paddle.device.cuda.memory_allocated |
| 37 | + |
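| | + # get_peer_mem_addr is only provided by the XPU ops; fail loudly if called on CUDA.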
| 38 | + def get_peer_mem_addr(*args, **kwargs): |
| 39 | + raise RuntimeError("get_peer_mem_addr is not needed on CUDA!")
| 40 | + |
54 | 41 | elif current_platform.is_xpu(): |
55 | | - paddle.set_device(f"xpu:{device}") |
56 | | - else: |
57 | | - raise RuntimeError("No supported platform") |
| 42 | + from fastdeploy.model_executor.ops.xpu import ( |
| 43 | + cuda_host_alloc, |
| 44 | + cuda_host_free, |
| 45 | + get_output_kv_signal, |
| 46 | + get_peer_mem_addr, |
| 47 | + set_data_ipc, |
| 48 | + share_external_data, |
| 49 | + swap_cache_all_layers, |
| 50 | + ) |
58 | 51 |
|
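| | + # The XPU op library does not provide unset_data_ipc; export None so callers can feature-check.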
| 52 | + unset_data_ipc = None |
| 53 | + memory_allocated = paddle.device.xpu.memory_allocated |
59 | 54 |
|
60 | | -def share_external_data_(cache, cache_name, cache_shape, use_ipc): |
61 | | - if current_platform.is_cuda(): |
62 | | - cache = share_external_data(cache, cache_name, cache_shape) |
63 | | - elif current_platform.is_xpu(): |
64 | | - cache = share_external_data(cache, cache_name, cache_shape, use_ipc) |
65 | | - else: |
66 | | - raise RuntimeError("No supported platform") |
67 | | - return cache |
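| | + # The remaining IPC ops are CUDA-only; define stubs that raise if invoked on XPU.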
| 55 | + def get_data_ptr_ipc(*args, **kwargs): |
| 56 | + raise RuntimeError("XPU get_data_ptr_ipc UNIMPLEMENTED!")
68 | 57 |
|
| 58 | + def ipc_sent_key_value_cache_by_remote_ptr(*args, **kwargs): |
| 58 | + raise RuntimeError("XPU ipc_sent_key_value_cache_by_remote_ptr UNIMPLEMENTED")
| 60 | + |
| 61 | + def ipc_sent_key_value_cache_by_remote_ptr_block_sync(*args, **kwargs): |
| 62 | + raise RuntimeError("XPU ipc_sent_key_value_cache_by_remote_ptr_block_sync UNIMPLEMENTED")
69 | 63 |
|
70 | | -def get_all_visible_devices(): |
71 | | - if current_platform.is_xpu(): |
72 | | - return "XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7" |
73 | 64 | else: |
74 | | - return "CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7" |
| 65 | + raise RuntimeError("Prefix cache ops are only supported on CUDA and XPU platforms")
| 66 | + |
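| | + # Select the Paddle device for this process on the active platform.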
| 67 | + def set_device(device): |
| 68 | + if current_platform.is_cuda(): |
| 69 | + paddle.set_device(f"gpu:{device}") |
| 70 | + elif current_platform.is_xpu(): |
| 71 | + paddle.set_device(f"xpu:{device}") |
| 72 | + else: |
| 73 | + raise RuntimeError("Unsupported platform")
| 74 | + |
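| | + # Thin wrapper over share_external_data: the XPU variant takes an extra use_ipc flag.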
| 75 | + def share_external_data_(cache, cache_name, cache_shape, use_ipc): |
| 76 | + if current_platform.is_cuda(): |
| 77 | + cache = share_external_data(cache, cache_name, cache_shape) |
| 78 | + elif current_platform.is_xpu(): |
| 79 | + cache = share_external_data(cache, cache_name, cache_shape, use_ipc) |
| 80 | + else: |
| 81 | + raise RuntimeError("Unsupported platform")
| 82 | + return cache |
| 83 | + |
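| | + # Hard-coded device-visibility environment string for an 8-card host.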
| 84 | + def get_all_visible_devices(): |
| 85 | + if current_platform.is_xpu(): |
| 86 | + return "XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7" |
| 87 | + else: |
| 88 | + return "CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7" |
| 89 | + |
| 90 | +except Exception:
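| | + # Reached when the op import fails or the platform is unsupported; export None stubs.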
| 91 | + cuda_host_alloc = None |
| 92 | + cuda_host_free = None |
| 93 | + set_data_ipc = None |
| 94 | + share_external_data_ = None |
| 95 | + swap_cache_all_layers = None |
| 96 | + unset_data_ipc = None |
| 97 | + set_device = None |
| 98 | + memory_allocated = None |
| 99 | + get_output_kv_signal = None |
| 100 | + get_data_ptr_ipc = None |
| 101 | + ipc_sent_key_value_cache_by_remote_ptr = None |
| 102 | + ipc_sent_key_value_cache_by_remote_ptr_block_sync = None |
| 103 | + get_peer_mem_addr = None |
| 104 | + get_all_visible_devices = None |
75 | 105 |
|
76 | 106 |
|
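| | +# Hypothetical usage sketch (module path assumed), guarding against the
| | +# None fallbacks set in the except branch above:
| | +#
| | +#     from fastdeploy.cache_manager.ops import memory_allocated, set_device
| | +#
| | +#     if set_device is not None:
| | +#         set_device(0)
| | +#         print(memory_allocated())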
77 | 107 | __all__ = [ |
|