
Commit c068a4f

[Feature] dyc8 support prefixcache (#5125)
* dyc8 support prefixcache
* fix cache_trans test case
* update code
1 parent ab3a2e4 commit c068a4f


5 files changed: +270 -119 lines changed


custom_ops/gpu_ops/swap_cache_batch.cu

Lines changed: 145 additions & 113 deletions
@@ -16,127 +16,159 @@
 #include "paddle/extension.h"
 
 template <paddle::DataType D>
-void SwapCacheImplAllLayers(const std::vector<paddle::Tensor>& cache_gpu_tensors, // gpu
-                            const std::vector<int64_t>& cache_cpu_ptrs, // cpu
-                            const int64_t& max_block_num_cpu,
-                            const std::vector<int64_t>& swap_block_ids_gpu,
-                            const std::vector<int64_t>& swap_block_ids_cpu,
-                            int mode) {
-    typedef PDTraits<D> traits_;
-    typedef typename traits_::DataType DataType_;
-    typedef typename traits_::data_t data_t;
-    auto stream = cache_gpu_tensors[0].stream();
-    for(int layer_idx=0; layer_idx < cache_gpu_tensors.size(); layer_idx++){
-        const paddle::Tensor& cache_gpu = cache_gpu_tensors[layer_idx];
-        const int64_t& cache_cpu_pointer = cache_cpu_ptrs[layer_idx];
-        data_t* cache_gpu_ptr = const_cast<data_t*>(cache_gpu.data<data_t>());
-        auto* cache_cpu_ptr = reinterpret_cast<data_t*>(cache_cpu_pointer);
-        auto cache_shape = cache_gpu.shape();
-        const int64_t max_block_num_gpu = cache_shape[0];
-        const int64_t num_heads = cache_shape[1];
-        const int64_t block_size = cache_shape[2];
-        const int64_t head_dim = cache_shape[3];
-        const int64_t cache_stride = num_heads * block_size * head_dim;
+void SwapCacheImplAllLayers(
+    const std::vector<paddle::Tensor>& cache_gpu_tensors,  // gpu
+    const std::vector<int64_t>& cache_cpu_ptrs,            // cpu
+    const int64_t& max_block_num_cpu,
+    const std::vector<int64_t>& swap_block_ids_gpu,
+    const std::vector<int64_t>& swap_block_ids_cpu,
+    int mode) {
+  typedef PDTraits<D> traits_;
+  typedef typename traits_::DataType DataType_;
+  typedef typename traits_::data_t data_t;
+  auto stream = cache_gpu_tensors[0].stream();
+  for (int layer_idx = 0; layer_idx < cache_gpu_tensors.size(); layer_idx++) {
+    const paddle::Tensor& cache_gpu = cache_gpu_tensors[layer_idx];
+    const int64_t& cache_cpu_pointer = cache_cpu_ptrs[layer_idx];
+    data_t* cache_gpu_ptr = const_cast<data_t*>(cache_gpu.data<data_t>());
+    auto* cache_cpu_ptr = reinterpret_cast<data_t*>(cache_cpu_pointer);
+    auto cache_shape = cache_gpu.shape();
+    const int64_t max_block_num_gpu = cache_shape[0];
+    const int64_t num_heads = cache_shape[1];
+    const int64_t block_size = cache_shape[2];
+    int64_t head_dim = 1;
+    if (cache_shape.size() == 4) {
+      head_dim = cache_shape[3];
+    }
+    const int64_t cache_stride = num_heads * block_size * head_dim;
 
-        auto stream = cache_gpu.stream();
-        if (swap_block_ids_gpu.size() == 0) {
-            return;
-        }
-        int i = 0;
-        int64_t consecutive_block_count = 1;
-        int64_t last_gpu_block_id = swap_block_ids_gpu[i];
-        int64_t last_cpu_block_id = swap_block_ids_cpu[i];
-        int64_t first_gpu_block_id = last_gpu_block_id; // first block id in a consecutive block ids
-        int64_t first_cpu_block_id = last_cpu_block_id;
-        i += 1;
-        while(true){
-            if (i >= swap_block_ids_gpu.size()) {
-                break;
-            }
-            int64_t gpu_block_id = swap_block_ids_gpu[i];
-            int64_t cpu_block_id = swap_block_ids_cpu[i];
-            assert(gpu_block_id >= 0 && gpu_block_id < max_block_num_gpu);
-            assert(cpu_block_id >= 0 && cpu_block_id < max_block_num_cpu);
-            if (gpu_block_id == last_gpu_block_id + 1 && cpu_block_id == last_cpu_block_id + 1){ // consecutive
-                consecutive_block_count += 1;
-                last_gpu_block_id = gpu_block_id;
-                last_cpu_block_id = cpu_block_id;
-            } else{
-                // end of a consecutive block ids
-                auto *cache_gpu_ptr_now = cache_gpu_ptr + first_gpu_block_id * cache_stride;
-                auto *cache_cpu_ptr_now = cache_cpu_ptr + first_cpu_block_id * cache_stride;
-                if (mode == 0) { // copy from device to host
-                    cudaMemcpyAsync(cache_cpu_ptr_now, cache_gpu_ptr_now, cache_stride * sizeof(DataType_) * consecutive_block_count, cudaMemcpyDeviceToHost, stream);
-                } else { // copy from host to device
-                    cudaMemcpyAsync(cache_gpu_ptr_now, cache_cpu_ptr_now, cache_stride * sizeof(DataType_) * consecutive_block_count, cudaMemcpyHostToDevice, stream);
-                }
-                first_gpu_block_id = gpu_block_id;
-                first_cpu_block_id = cpu_block_id;
-                last_gpu_block_id = gpu_block_id;
-                last_cpu_block_id = cpu_block_id;
-                consecutive_block_count = 1;
-            }
-            i += 1;
-        }
-        // last batch
-        auto *cache_gpu_ptr_now = cache_gpu_ptr + first_gpu_block_id * cache_stride;
-        auto *cache_cpu_ptr_now = cache_cpu_ptr + first_cpu_block_id * cache_stride;
-        if (mode == 0) { // copy from device to host
-            cudaMemcpyAsync(cache_cpu_ptr_now, cache_gpu_ptr_now, cache_stride * sizeof(DataType_) * consecutive_block_count, cudaMemcpyDeviceToHost, stream);
-        } else { // copy from host to device
-            cudaMemcpyAsync(cache_gpu_ptr_now, cache_cpu_ptr_now, cache_stride * sizeof(DataType_) * consecutive_block_count, cudaMemcpyHostToDevice, stream);
+    auto stream = cache_gpu.stream();
+    if (swap_block_ids_gpu.size() == 0) {
+      return;
+    }
+    int i = 0;
+    int64_t consecutive_block_count = 1;
+    int64_t last_gpu_block_id = swap_block_ids_gpu[i];
+    int64_t last_cpu_block_id = swap_block_ids_cpu[i];
+    int64_t first_gpu_block_id =
+        last_gpu_block_id;  // first block id in a consecutive block ids
+    int64_t first_cpu_block_id = last_cpu_block_id;
+    i += 1;
+    while (true) {
+      if (i >= swap_block_ids_gpu.size()) {
+        break;
+      }
+      int64_t gpu_block_id = swap_block_ids_gpu[i];
+      int64_t cpu_block_id = swap_block_ids_cpu[i];
+      assert(gpu_block_id >= 0 && gpu_block_id < max_block_num_gpu);
+      assert(cpu_block_id >= 0 && cpu_block_id < max_block_num_cpu);
+      if (gpu_block_id == last_gpu_block_id + 1 &&
+          cpu_block_id == last_cpu_block_id + 1) {  // consecutive
+        consecutive_block_count += 1;
+        last_gpu_block_id = gpu_block_id;
+        last_cpu_block_id = cpu_block_id;
+      } else {
+        // end of a consecutive block ids
+        auto* cache_gpu_ptr_now =
+            cache_gpu_ptr + first_gpu_block_id * cache_stride;
+        auto* cache_cpu_ptr_now =
+            cache_cpu_ptr + first_cpu_block_id * cache_stride;
+        if (mode == 0) {  // copy from device to host
+          cudaMemcpyAsync(
+              cache_cpu_ptr_now,
+              cache_gpu_ptr_now,
+              cache_stride * sizeof(DataType_) * consecutive_block_count,
+              cudaMemcpyDeviceToHost,
+              stream);
+        } else {  // copy from host to device
+          cudaMemcpyAsync(
+              cache_gpu_ptr_now,
+              cache_cpu_ptr_now,
+              cache_stride * sizeof(DataType_) * consecutive_block_count,
+              cudaMemcpyHostToDevice,
+              stream);
         }
+        first_gpu_block_id = gpu_block_id;
+        first_cpu_block_id = cpu_block_id;
+        last_gpu_block_id = gpu_block_id;
+        last_cpu_block_id = cpu_block_id;
+        consecutive_block_count = 1;
+      }
+      i += 1;
+    }
+    // last batch
+    auto* cache_gpu_ptr_now = cache_gpu_ptr + first_gpu_block_id * cache_stride;
+    auto* cache_cpu_ptr_now = cache_cpu_ptr + first_cpu_block_id * cache_stride;
+    if (mode == 0) {  // copy from device to host
+      cudaMemcpyAsync(
+          cache_cpu_ptr_now,
+          cache_gpu_ptr_now,
+          cache_stride * sizeof(DataType_) * consecutive_block_count,
+          cudaMemcpyDeviceToHost,
+          stream);
+    } else {  // copy from host to device
+      cudaMemcpyAsync(
+          cache_gpu_ptr_now,
+          cache_cpu_ptr_now,
+          cache_stride * sizeof(DataType_) * consecutive_block_count,
+          cudaMemcpyHostToDevice,
+          stream);
     }
-    cudaStreamSynchronize(stream);
+  }
+  cudaStreamSynchronize(stream);
 }
 
-void SwapCacheAllLayers(const std::vector<paddle::Tensor>& cache_gpu_tensors, // gpu
-                        const std::vector<int64_t>& cache_cpu_ptrs, // cpu memory pointer
-                        int64_t max_block_num_cpu, // cpu max block num
-                        const std::vector<int64_t>& swap_block_ids_gpu,
-                        const std::vector<int64_t>& swap_block_ids_cpu,
-                        int rank,
-                        int mode) {
-    cudaSetDevice(rank); // used for distributed launch
-    assert(cache_gpu_tensors.size() > 0 && cache_gpu_tensors.size() == cache_cpu_ptrs.size());
-    switch (cache_gpu_tensors[0].dtype()) {
-        case paddle::DataType::BFLOAT16:
-            return SwapCacheImplAllLayers<paddle::DataType::BFLOAT16>(
-                cache_gpu_tensors,
-                cache_cpu_ptrs,
-                max_block_num_cpu,
-                swap_block_ids_gpu,
-                swap_block_ids_cpu,
-                mode);
-        case paddle::DataType::FLOAT16:
-            return SwapCacheImplAllLayers<paddle::DataType::FLOAT16>(
-                cache_gpu_tensors,
-                cache_cpu_ptrs,
-                max_block_num_cpu,
-                swap_block_ids_gpu,
-                swap_block_ids_cpu,
-                mode);
-        case paddle::DataType::UINT8:
-            return SwapCacheImplAllLayers<paddle::DataType::UINT8>(
-                cache_gpu_tensors,
-                cache_cpu_ptrs,
-                max_block_num_cpu,
-                swap_block_ids_gpu,
-                swap_block_ids_cpu,
-                mode);
-        default:
-            PD_THROW("Unsupported data type.");
-    }
+void SwapCacheAllLayers(
+    const std::vector<paddle::Tensor>& cache_gpu_tensors,  // gpu
+    const std::vector<int64_t>& cache_cpu_ptrs,  // cpu memory pointer
+    int64_t max_block_num_cpu,                   // cpu max block num
+    const std::vector<int64_t>& swap_block_ids_gpu,
+    const std::vector<int64_t>& swap_block_ids_cpu,
+    int rank,
+    int mode) {
+  cudaSetDevice(rank);  // used for distributed launch
+  assert(cache_gpu_tensors.size() > 0 &&
+         cache_gpu_tensors.size() == cache_cpu_ptrs.size());
+  switch (cache_gpu_tensors[0].dtype()) {
+    case paddle::DataType::BFLOAT16:
+      return SwapCacheImplAllLayers<paddle::DataType::BFLOAT16>(
+          cache_gpu_tensors,
+          cache_cpu_ptrs,
+          max_block_num_cpu,
+          swap_block_ids_gpu,
+          swap_block_ids_cpu,
+          mode);
+    case paddle::DataType::FLOAT16:
+      return SwapCacheImplAllLayers<paddle::DataType::FLOAT16>(
+          cache_gpu_tensors,
+          cache_cpu_ptrs,
+          max_block_num_cpu,
+          swap_block_ids_gpu,
+          swap_block_ids_cpu,
+          mode);
+    case paddle::DataType::UINT8:
+      return SwapCacheImplAllLayers<paddle::DataType::UINT8>(cache_gpu_tensors,
+                                                             cache_cpu_ptrs,
+                                                             max_block_num_cpu,
+                                                             swap_block_ids_gpu,
+                                                             swap_block_ids_cpu,
+                                                             mode);
+    default:
+      PD_THROW("Unsupported data type.");
+  }
 }
 
 PD_BUILD_STATIC_OP(swap_cache_all_layers)
     .Inputs({paddle::Vec("cache_gpu_tensors")})
-    .Attrs({"cache_cpu_ptrs: std::vector<int64_t>",
-            "max_block_num_cpu: int64_t",
-            "swap_block_ids_gpu: std::vector<int64_t>",
-            "swap_block_ids_cpu: std::vector<int64_t>",
-            "rank: int",
-            "mode: int",})
+    .Attrs({
+        "cache_cpu_ptrs: std::vector<int64_t>",
+        "max_block_num_cpu: int64_t",
+        "swap_block_ids_gpu: std::vector<int64_t>",
+        "swap_block_ids_cpu: std::vector<int64_t>",
+        "rank: int",
+        "mode: int",
+    })
     .Outputs({paddle::Vec("cache_dst_outs")})
-    .SetInplaceMap({{paddle::Vec("cache_gpu_tensors"), paddle::Vec("cache_dst_outs")}})
+    .SetInplaceMap({{paddle::Vec("cache_gpu_tensors"),
+                     paddle::Vec("cache_dst_outs")}})
     .SetKernelFn(PD_KERNEL(SwapCacheAllLayers));
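
Beyond the clang-format reflow, the main functional change in this hunk is that head_dim now falls back to 1 when a cache tensor is 3-D, so cache_stride stays correct for cache layouts whose head dimension is folded into the block dimension. Below is a minimal standalone sketch of that stride rule; the BlockStride helper and the shapes are illustrative assumptions, not code from this commit.

#include <cassert>
#include <cstdint>
#include <vector>

// Per-block stride in elements, mirroring the updated logic in
// SwapCacheImplAllLayers: a 4-D cache is
// [max_block_num, num_heads, block_size, head_dim]; a 3-D cache omits
// head_dim, which is then treated as 1.
int64_t BlockStride(const std::vector<int64_t>& cache_shape) {
  const int64_t num_heads = cache_shape[1];
  const int64_t block_size = cache_shape[2];
  int64_t head_dim = 1;
  if (cache_shape.size() == 4) {
    head_dim = cache_shape[3];
  }
  return num_heads * block_size * head_dim;
}

int main() {
  // Illustrative shapes only: an explicit head_dim versus a 3-D layout whose
  // head dimension is folded into block_size. Both give the same stride.
  assert(BlockStride({1024, 8, 64, 128}) == 8 * 64 * 128);
  assert(BlockStride({1024, 8, 64 * 128}) == 8 * 64 * 128);
  return 0;
}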

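A note on the copy batching that both the old and the new SwapCacheImplAllLayers perform: whenever consecutive entries of swap_block_ids_gpu and swap_block_ids_cpu both advance by 1, the blocks are merged into a single cudaMemcpyAsync covering the whole run, with one final copy issued for the trailing run before cudaStreamSynchronize. The sketch below isolates that run-coalescing step without any CUDA calls so it can be compiled on its own; the CoalesceRuns helper and the block ids are made up for illustration.

#include <cstdint>
#include <cstdio>
#include <tuple>
#include <vector>

// Collapse paired block-id lists into (first_gpu, first_cpu, count) runs,
// mirroring how the op batches blocks before each memcpy.
std::vector<std::tuple<int64_t, int64_t, int64_t>> CoalesceRuns(
    const std::vector<int64_t>& gpu_ids, const std::vector<int64_t>& cpu_ids) {
  std::vector<std::tuple<int64_t, int64_t, int64_t>> runs;
  if (gpu_ids.empty()) return runs;
  int64_t first_gpu = gpu_ids[0];
  int64_t first_cpu = cpu_ids[0];
  int64_t count = 1;
  for (size_t i = 1; i < gpu_ids.size(); ++i) {
    if (gpu_ids[i] == first_gpu + count && cpu_ids[i] == first_cpu + count) {
      ++count;  // contiguous on both sides: extend the current run
    } else {
      runs.emplace_back(first_gpu, first_cpu, count);  // flush finished run
      first_gpu = gpu_ids[i];
      first_cpu = cpu_ids[i];
      count = 1;
    }
  }
  runs.emplace_back(first_gpu, first_cpu, count);  // trailing ("last batch") run
  return runs;
}

int main() {
  // Blocks 3,4,5 are contiguous on both sides -> one copy; block 9 -> another.
  for (const auto& [gpu, cpu, n] : CoalesceRuns({3, 4, 5, 9}, {10, 11, 12, 40})) {
    std::printf("gpu block %lld <-> cpu block %lld, %lld block(s)\n",
                static_cast<long long>(gpu),
                static_cast<long long>(cpu),
                static_cast<long long>(n));
  }
  return 0;
}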