16 | 16 | #include "paddle/extension.h" |
17 | 17 |
18 | 18 | template <paddle::DataType D> |
19 | | -void SwapCacheImplAllLayers(const std::vector<paddle::Tensor>& cache_gpu_tensors, // gpu |
20 | | - const std::vector<int64_t>& cache_cpu_ptrs, // cpu |
21 | | - const int64_t& max_block_num_cpu, |
22 | | - const std::vector<int64_t>& swap_block_ids_gpu, |
23 | | - const std::vector<int64_t>& swap_block_ids_cpu, |
24 | | - int mode) { |
25 | | - typedef PDTraits<D> traits_; |
26 | | - typedef typename traits_::DataType DataType_; |
27 | | - typedef typename traits_::data_t data_t; |
28 | | - auto stream = cache_gpu_tensors[0].stream(); |
29 | | - for(int layer_idx=0; layer_idx < cache_gpu_tensors.size(); layer_idx++){ |
30 | | - const paddle::Tensor& cache_gpu = cache_gpu_tensors[layer_idx]; |
31 | | - const int64_t& cache_cpu_pointer = cache_cpu_ptrs[layer_idx]; |
32 | | - data_t* cache_gpu_ptr = const_cast<data_t*>(cache_gpu.data<data_t>()); |
33 | | - auto* cache_cpu_ptr = reinterpret_cast<data_t*>(cache_cpu_pointer); |
34 | | - auto cache_shape = cache_gpu.shape(); |
35 | | - const int64_t max_block_num_gpu = cache_shape[0]; |
36 | | - const int64_t num_heads = cache_shape[1]; |
37 | | - const int64_t block_size = cache_shape[2]; |
38 | | - const int64_t head_dim = cache_shape[3]; |
39 | | - const int64_t cache_stride = num_heads * block_size * head_dim; |
| 19 | +void SwapCacheImplAllLayers( |
| 20 | + const std::vector<paddle::Tensor>& cache_gpu_tensors, // gpu |
| 21 | + const std::vector<int64_t>& cache_cpu_ptrs, // cpu |
| 22 | + const int64_t& max_block_num_cpu, |
| 23 | + const std::vector<int64_t>& swap_block_ids_gpu, |
| 24 | + const std::vector<int64_t>& swap_block_ids_cpu, |
| 25 | + int mode) { |
| 26 | + typedef PDTraits<D> traits_; |
| 27 | + typedef typename traits_::DataType DataType_; |
| 28 | + typedef typename traits_::data_t data_t; |
| 29 | + auto stream = cache_gpu_tensors[0].stream(); |
| 30 | + for (int layer_idx = 0; layer_idx < cache_gpu_tensors.size(); layer_idx++) { |
| 31 | + const paddle::Tensor& cache_gpu = cache_gpu_tensors[layer_idx]; |
| 32 | + const int64_t& cache_cpu_pointer = cache_cpu_ptrs[layer_idx]; |
| 33 | + data_t* cache_gpu_ptr = const_cast<data_t*>(cache_gpu.data<data_t>()); |
| 34 | + auto* cache_cpu_ptr = reinterpret_cast<data_t*>(cache_cpu_pointer); |
| 35 | + auto cache_shape = cache_gpu.shape(); |
| 36 | + const int64_t max_block_num_gpu = cache_shape[0]; |
| 37 | + const int64_t num_heads = cache_shape[1]; |
| 38 | + const int64_t block_size = cache_shape[2]; |
| 39 | + int64_t head_dim = 1; |
| 40 | + if (cache_shape.size() == 4) { |
| 41 | + head_dim = cache_shape[3]; |
| 42 | + } |
| 43 | + const int64_t cache_stride = num_heads * block_size * head_dim; |
40 | 44 |
41 | | - auto stream = cache_gpu.stream(); |
42 | | - if (swap_block_ids_gpu.size() == 0) { |
43 | | - return; |
44 | | - } |
45 | | - int i = 0; |
46 | | - int64_t consecutive_block_count = 1; |
47 | | - int64_t last_gpu_block_id = swap_block_ids_gpu[i]; |
48 | | - int64_t last_cpu_block_id = swap_block_ids_cpu[i]; |
49 | | - int64_t first_gpu_block_id = last_gpu_block_id; // first block id in a consecutive block ids |
50 | | - int64_t first_cpu_block_id = last_cpu_block_id; |
51 | | - i += 1; |
52 | | - while(true){ |
53 | | - if (i >= swap_block_ids_gpu.size()) { |
54 | | - break; |
55 | | - } |
56 | | - int64_t gpu_block_id = swap_block_ids_gpu[i]; |
57 | | - int64_t cpu_block_id = swap_block_ids_cpu[i]; |
58 | | - assert(gpu_block_id >= 0 && gpu_block_id < max_block_num_gpu); |
59 | | - assert(cpu_block_id >= 0 && cpu_block_id < max_block_num_cpu); |
60 | | - if (gpu_block_id == last_gpu_block_id + 1 && cpu_block_id == last_cpu_block_id + 1){ // consecutive |
61 | | - consecutive_block_count += 1; |
62 | | - last_gpu_block_id = gpu_block_id; |
63 | | - last_cpu_block_id = cpu_block_id; |
64 | | - } else{ |
65 | | - // end of a consecutive block ids |
66 | | - auto *cache_gpu_ptr_now = cache_gpu_ptr + first_gpu_block_id * cache_stride; |
67 | | - auto *cache_cpu_ptr_now = cache_cpu_ptr + first_cpu_block_id * cache_stride; |
68 | | - if (mode == 0) { // copy from device to host |
69 | | - cudaMemcpyAsync(cache_cpu_ptr_now, cache_gpu_ptr_now, cache_stride * sizeof(DataType_) * consecutive_block_count, cudaMemcpyDeviceToHost, stream); |
70 | | - } else { // copy from host to device |
71 | | - cudaMemcpyAsync(cache_gpu_ptr_now, cache_cpu_ptr_now, cache_stride * sizeof(DataType_) * consecutive_block_count, cudaMemcpyHostToDevice, stream); |
72 | | - } |
73 | | - first_gpu_block_id = gpu_block_id; |
74 | | - first_cpu_block_id = cpu_block_id; |
75 | | - last_gpu_block_id = gpu_block_id; |
76 | | - last_cpu_block_id = cpu_block_id; |
77 | | - consecutive_block_count = 1; |
78 | | - } |
79 | | - i += 1; |
80 | | - } |
81 | | - // last batch |
82 | | - auto *cache_gpu_ptr_now = cache_gpu_ptr + first_gpu_block_id * cache_stride; |
83 | | - auto *cache_cpu_ptr_now = cache_cpu_ptr + first_cpu_block_id * cache_stride; |
84 | | - if (mode == 0) { // copy from device to host |
85 | | - cudaMemcpyAsync(cache_cpu_ptr_now, cache_gpu_ptr_now, cache_stride * sizeof(DataType_) * consecutive_block_count, cudaMemcpyDeviceToHost, stream); |
86 | | - } else { // copy from host to device |
87 | | - cudaMemcpyAsync(cache_gpu_ptr_now, cache_cpu_ptr_now, cache_stride * sizeof(DataType_) * consecutive_block_count, cudaMemcpyHostToDevice, stream); |
| 45 | + auto stream = cache_gpu.stream(); |
| 46 | + if (swap_block_ids_gpu.size() == 0) { |
| 47 | + return; |
| 48 | + } |
| 49 | + int i = 0; |
| 50 | + int64_t consecutive_block_count = 1; |
| 51 | + int64_t last_gpu_block_id = swap_block_ids_gpu[i]; |
| 52 | + int64_t last_cpu_block_id = swap_block_ids_cpu[i]; |
| 53 | + int64_t first_gpu_block_id = |
| 54 | + last_gpu_block_id; // first block id of a consecutive run
| 55 | + int64_t first_cpu_block_id = last_cpu_block_id; |
| 56 | + i += 1; |
| 57 | + while (true) { |
| 58 | + if (i >= swap_block_ids_gpu.size()) { |
| 59 | + break; |
| 60 | + } |
| 61 | + int64_t gpu_block_id = swap_block_ids_gpu[i]; |
| 62 | + int64_t cpu_block_id = swap_block_ids_cpu[i]; |
| 63 | + assert(gpu_block_id >= 0 && gpu_block_id < max_block_num_gpu); |
| 64 | + assert(cpu_block_id >= 0 && cpu_block_id < max_block_num_cpu); |
| 65 | + if (gpu_block_id == last_gpu_block_id + 1 && |
| 66 | + cpu_block_id == last_cpu_block_id + 1) { // consecutive |
| 67 | + consecutive_block_count += 1; |
| 68 | + last_gpu_block_id = gpu_block_id; |
| 69 | + last_cpu_block_id = cpu_block_id; |
| 70 | + } else { |
| 71 | + // end of a consecutive run of block ids
| 72 | + auto* cache_gpu_ptr_now = |
| 73 | + cache_gpu_ptr + first_gpu_block_id * cache_stride; |
| 74 | + auto* cache_cpu_ptr_now = |
| 75 | + cache_cpu_ptr + first_cpu_block_id * cache_stride; |
| 76 | + if (mode == 0) { // copy from device to host |
| 77 | + cudaMemcpyAsync( |
| 78 | + cache_cpu_ptr_now, |
| 79 | + cache_gpu_ptr_now, |
| 80 | + cache_stride * sizeof(DataType_) * consecutive_block_count, |
| 81 | + cudaMemcpyDeviceToHost, |
| 82 | + stream); |
| 83 | + } else { // copy from host to device |
| 84 | + cudaMemcpyAsync( |
| 85 | + cache_gpu_ptr_now, |
| 86 | + cache_cpu_ptr_now, |
| 87 | + cache_stride * sizeof(DataType_) * consecutive_block_count, |
| 88 | + cudaMemcpyHostToDevice, |
| 89 | + stream); |
88 | 90 | } |
| 91 | + first_gpu_block_id = gpu_block_id; |
| 92 | + first_cpu_block_id = cpu_block_id; |
| 93 | + last_gpu_block_id = gpu_block_id; |
| 94 | + last_cpu_block_id = cpu_block_id; |
| 95 | + consecutive_block_count = 1; |
| 96 | + } |
| 97 | + i += 1; |
| 98 | + } |
| 99 | + // last batch |
| 100 | + auto* cache_gpu_ptr_now = cache_gpu_ptr + first_gpu_block_id * cache_stride; |
| 101 | + auto* cache_cpu_ptr_now = cache_cpu_ptr + first_cpu_block_id * cache_stride; |
| 102 | + if (mode == 0) { // copy from device to host |
| 103 | + cudaMemcpyAsync( |
| 104 | + cache_cpu_ptr_now, |
| 105 | + cache_gpu_ptr_now, |
| 106 | + cache_stride * sizeof(DataType_) * consecutive_block_count, |
| 107 | + cudaMemcpyDeviceToHost, |
| 108 | + stream); |
| 109 | + } else { // copy from host to device |
| 110 | + cudaMemcpyAsync( |
| 111 | + cache_gpu_ptr_now, |
| 112 | + cache_cpu_ptr_now, |
| 113 | + cache_stride * sizeof(DataType_) * consecutive_block_count, |
| 114 | + cudaMemcpyHostToDevice, |
| 115 | + stream); |
89 | 116 | } |
90 | | - cudaStreamSynchronize(stream); |
| 117 | + } |
| 118 | + cudaStreamSynchronize(stream); |
91 | 119 | } |
92 | 120 |
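Reviewer note on the copies above: cudaMemcpyAsync only behaves asynchronously when the host side is page-locked; with ordinary pageable memory the transfer silently degrades to a blocking copy on the stream. Since cache_cpu_ptrs carries raw host addresses, a caller would typically allocate the per-layer CPU cache as pinned memory. A minimal sketch of such an allocation, assuming an illustrative helper name and sizing that are not part of this patch:

#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

// Hypothetical helper: one pinned (page-locked) host buffer per layer, so the
// cudaMemcpyAsync calls in SwapCacheImplAllLayers can overlap with GPU work.
std::vector<int64_t> AllocPinnedCpuCache(int num_layers,
                                         int64_t max_block_num_cpu,
                                         int64_t cache_stride,
                                         size_t elem_size) {
  std::vector<int64_t> cache_cpu_ptrs;
  cache_cpu_ptrs.reserve(num_layers);
  for (int i = 0; i < num_layers; ++i) {
    void* host_buf = nullptr;
    // cudaMallocHost returns page-locked memory; plain malloc/new would force
    // cudaMemcpyAsync to fall back to a synchronous copy path.
    cudaMallocHost(&host_buf, max_block_num_cpu * cache_stride * elem_size);
    cache_cpu_ptrs.push_back(reinterpret_cast<int64_t>(host_buf));
  }
  return cache_cpu_ptrs;
}
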
93 | | -void SwapCacheAllLayers(const std::vector<paddle::Tensor>& cache_gpu_tensors, // gpu |
94 | | - const std::vector<int64_t>& cache_cpu_ptrs, // cpu memory pointer |
95 | | - int64_t max_block_num_cpu, // cpu max block num |
96 | | - const std::vector<int64_t>& swap_block_ids_gpu, |
97 | | - const std::vector<int64_t>& swap_block_ids_cpu, |
98 | | - int rank, |
99 | | - int mode) { |
100 | | - cudaSetDevice(rank); // used for distributed launch |
101 | | - assert(cache_gpu_tensors.size() > 0 && cache_gpu_tensors.size() == cache_cpu_ptrs.size()); |
102 | | - switch (cache_gpu_tensors[0].dtype()) { |
103 | | - case paddle::DataType::BFLOAT16: |
104 | | - return SwapCacheImplAllLayers<paddle::DataType::BFLOAT16>( |
105 | | - cache_gpu_tensors, |
106 | | - cache_cpu_ptrs, |
107 | | - max_block_num_cpu, |
108 | | - swap_block_ids_gpu, |
109 | | - swap_block_ids_cpu, |
110 | | - mode); |
111 | | - case paddle::DataType::FLOAT16: |
112 | | - return SwapCacheImplAllLayers<paddle::DataType::FLOAT16>( |
113 | | - cache_gpu_tensors, |
114 | | - cache_cpu_ptrs, |
115 | | - max_block_num_cpu, |
116 | | - swap_block_ids_gpu, |
117 | | - swap_block_ids_cpu, |
118 | | - mode); |
119 | | - case paddle::DataType::UINT8: |
120 | | - return SwapCacheImplAllLayers<paddle::DataType::UINT8>( |
121 | | - cache_gpu_tensors, |
122 | | - cache_cpu_ptrs, |
123 | | - max_block_num_cpu, |
124 | | - swap_block_ids_gpu, |
125 | | - swap_block_ids_cpu, |
126 | | - mode); |
127 | | - default: |
128 | | - PD_THROW("Unsupported data type."); |
129 | | - } |
| 121 | +void SwapCacheAllLayers( |
| 122 | + const std::vector<paddle::Tensor>& cache_gpu_tensors, // gpu |
| 123 | + const std::vector<int64_t>& cache_cpu_ptrs, // cpu memory pointer |
| 124 | + int64_t max_block_num_cpu, // cpu max block num |
| 125 | + const std::vector<int64_t>& swap_block_ids_gpu, |
| 126 | + const std::vector<int64_t>& swap_block_ids_cpu, |
| 127 | + int rank, |
| 128 | + int mode) { |
| 129 | + cudaSetDevice(rank); // used for distributed launch |
| 130 | + assert(cache_gpu_tensors.size() > 0 && |
| 131 | + cache_gpu_tensors.size() == cache_cpu_ptrs.size()); |
| 132 | + switch (cache_gpu_tensors[0].dtype()) { |
| 133 | + case paddle::DataType::BFLOAT16: |
| 134 | + return SwapCacheImplAllLayers<paddle::DataType::BFLOAT16>( |
| 135 | + cache_gpu_tensors, |
| 136 | + cache_cpu_ptrs, |
| 137 | + max_block_num_cpu, |
| 138 | + swap_block_ids_gpu, |
| 139 | + swap_block_ids_cpu, |
| 140 | + mode); |
| 141 | + case paddle::DataType::FLOAT16: |
| 142 | + return SwapCacheImplAllLayers<paddle::DataType::FLOAT16>( |
| 143 | + cache_gpu_tensors, |
| 144 | + cache_cpu_ptrs, |
| 145 | + max_block_num_cpu, |
| 146 | + swap_block_ids_gpu, |
| 147 | + swap_block_ids_cpu, |
| 148 | + mode); |
| 149 | + case paddle::DataType::UINT8: |
| 150 | + return SwapCacheImplAllLayers<paddle::DataType::UINT8>(cache_gpu_tensors, |
| 151 | + cache_cpu_ptrs, |
| 152 | + max_block_num_cpu, |
| 153 | + swap_block_ids_gpu, |
| 154 | + swap_block_ids_cpu, |
| 155 | + mode); |
| 156 | + default: |
| 157 | + PD_THROW("Unsupported data type."); |
| 158 | + } |
130 | 159 | } |
131 | 160 |
132 | 161 | PD_BUILD_STATIC_OP(swap_cache_all_layers) |
133 | 162 | .Inputs({paddle::Vec("cache_gpu_tensors")}) |
134 | | - .Attrs({"cache_cpu_ptrs: std::vector<int64_t>", |
135 | | - "max_block_num_cpu: int64_t", |
136 | | - "swap_block_ids_gpu: std::vector<int64_t>", |
137 | | - "swap_block_ids_cpu: std::vector<int64_t>", |
138 | | - "rank: int", |
139 | | - "mode: int",}) |
| 163 | + .Attrs({ |
| 164 | + "cache_cpu_ptrs: std::vector<int64_t>", |
| 165 | + "max_block_num_cpu: int64_t", |
| 166 | + "swap_block_ids_gpu: std::vector<int64_t>", |
| 167 | + "swap_block_ids_cpu: std::vector<int64_t>", |
| 168 | + "rank: int", |
| 169 | + "mode: int", |
| 170 | + }) |
140 | 171 | .Outputs({paddle::Vec("cache_dst_outs")}) |
141 | | - .SetInplaceMap({{paddle::Vec("cache_gpu_tensors"), paddle::Vec("cache_dst_outs")}}) |
| 172 | + .SetInplaceMap({{paddle::Vec("cache_gpu_tensors"), |
| 173 | + paddle::Vec("cache_dst_outs")}}) |
142 | 174 | .SetKernelFn(PD_KERNEL(SwapCacheAllLayers)); |
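
The core of SwapCacheImplAllLayers is run-length coalescing: whenever both the GPU and CPU block ids advance by exactly one from the previous pair, the blocks are merged into a single contiguous span, so one cudaMemcpyAsync of consecutive_block_count * cache_stride elements replaces a copy per block. A standalone sketch of the same grouping, independent of Paddle (the struct and function names below are illustrative, not from this patch):

#include <cstdint>
#include <vector>

// A contiguous run: starting gpu/cpu block ids plus the run length in blocks.
struct BlockRun {
  int64_t first_gpu;
  int64_t first_cpu;
  int64_t count;
};

// Collapse paired block-id lists into runs where both ids advance by one,
// mirroring the grouping done before each cudaMemcpyAsync above.
std::vector<BlockRun> CoalesceRuns(const std::vector<int64_t>& gpu_ids,
                                   const std::vector<int64_t>& cpu_ids) {
  std::vector<BlockRun> runs;
  if (gpu_ids.empty()) return runs;
  BlockRun cur{gpu_ids[0], cpu_ids[0], 1};
  for (size_t i = 1; i < gpu_ids.size(); ++i) {
    if (gpu_ids[i] == cur.first_gpu + cur.count &&
        cpu_ids[i] == cur.first_cpu + cur.count) {
      ++cur.count;  // still consecutive on both sides: extend the run
    } else {
      runs.push_back(cur);  // flush the finished run
      cur = BlockRun{gpu_ids[i], cpu_ids[i], 1};
    }
  }
  runs.push_back(cur);  // flush the trailing run
  return runs;
}

For example, gpu ids {3, 4, 5, 9} paired with cpu ids {0, 1, 2, 7} coalesce into runs {3, 0, 3} and {9, 7, 1}: two transfers instead of four.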