Skip to content

Commit d5e1b02

Browse files
authored
Merge pull request #58 from InfiniTensor/dev
perf(kernel): 使用 __ldg 优化性能
2 parents de3b474 + 59939c6 commit d5e1b02

File tree

20 files changed

+273
-283
lines changed

src/04kernel/cuda/include/kernel/cuda/bench.cuh

Lines changed: 0 additions & 10 deletions
This file was deleted.

src/04kernel/cuda/src/bench.cu

Lines changed: 0 additions & 34 deletions
This file was deleted.

src/04kernel/cuda/src/concat.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ namespace refactor::kernel::cuda {
3737
concatKernel<<<
3838
params.gridSize,
3939
params.blockSize,
40-
inputCount *(sizeof(unsigned int) + sizeof(void *)),
40+
inputCount * (sizeof(unsigned int) + sizeof(void *)),
4141
reinterpret_cast<cudaStream_t>(params.stream)>>>(
4242
params.n,
4343
reinterpret_cast<uint8_t const **>(inputs),

src/04kernel/cuda/src/expand.cu

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,21 @@ namespace refactor::kernel::cuda {
66

77
__global__ static void expandKernel(
88
unsigned long long n,
9-
uint8_t const *data, expand::DimStride const *strides, uint8_t *output,
9+
uint8_t const *__restrict__ data,
10+
expand::DimStride const *__restrict__ strides,
11+
uint8_t *__restrict__ output,
1012
unsigned int rank,
1113
unsigned int eleSize) {
12-
extern __shared__ expand::DimStride shared[];
13-
for (auto i = threadIdx.x; i < rank; i += blockDim.x) {
14-
shared[i] = strides[i];
15-
}
16-
__syncthreads();
1714
for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
1815
step = blockDim.x * gridDim.x;
1916
tid < n;
2017
tid += step) {
2118
long rem = tid, i = 0;
2219
for (auto j = 0; j < rank; ++j) {
23-
auto s = shared[j];
24-
i += rem / s.o * s.i;
25-
rem %= s.o;
20+
auto o_ = __ldg(&(strides[j].o));
21+
auto i_ = __ldg(&(strides[j].i));
22+
i += rem / o_ * i_;
23+
rem %= o_;
2624
}
2725
optimizedMemcpy(output + tid * eleSize, data + i * eleSize, eleSize);
2826
}
@@ -37,7 +35,7 @@ namespace refactor::kernel::cuda {
3735
expandKernel<<<
3836
params.gridSize,
3937
params.blockSize,
40-
rank * sizeof(expand::DimStride),
38+
0,
4139
reinterpret_cast<cudaStream_t>(params.stream)>>>(
4240
params.n,
4341
reinterpret_cast<uint8_t const *>(data),

src/04kernel/cuda/src/gather.cu

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,30 @@
55
namespace refactor::kernel::cuda {
66

77
template<class index_t>
8-
__global__ void gatherKernel(
8+
__global__ static void gatherKernel(
99
unsigned long long n,
10-
uint8_t const *data,
11-
index_t const *indices,
12-
uint8_t *output,
10+
uint8_t const *__restrict__ data,
11+
index_t const *__restrict__ indices,
12+
uint8_t *__restrict__ output,
1313
unsigned int batch,
1414
unsigned int unit,
1515
unsigned int midSizeI,
1616
unsigned int midSizeO) {
17-
extern __shared__ uint32_t shared[];
18-
for (auto i = threadIdx.x; i < midSizeO; i += blockDim.x) {
19-
shared[i] = indices[i];
20-
}
21-
__syncthreads();
2217
for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
2318
step = blockDim.x * gridDim.x;
2419
tid < n;
2520
tid += step) {
2621
auto i = tid / batch,
2722
j = tid % batch;
23+
auto index = __ldg(indices + i % midSizeO);
2824
optimizedMemcpy(unit * tid + output,
29-
unit * (batch * (i / midSizeO * midSizeI + shared[i % midSizeO]) + j) + data,
25+
unit * (batch * (i / midSizeO * midSizeI + index) + j) + data,
3026
unit);
3127
}
3228
}
3329

3430
template<class index_t>
35-
void launchGather(
31+
void static launchGather(
3632
KernelLaunchParameters const &params,
3733
void const *data, void const *indices, void *output,
3834
unsigned int batch,
@@ -42,7 +38,7 @@ namespace refactor::kernel::cuda {
4238
gatherKernel<<<
4339
params.gridSize,
4440
params.blockSize,
45-
midSizeO * sizeof(uint32_t),
41+
0,
4642
reinterpret_cast<cudaStream_t>(params.stream)>>>(
4743
params.n,
4844
reinterpret_cast<uint8_t const *>(data),

src/04kernel/cuda/src/scatter_nd.cu

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ namespace refactor::kernel::cuda {
66

77
__global__ void scatterNDKernel(
88
size_t n,
9-
uint8_t *out,
10-
uint8_t const *in,
11-
int64_t const *indices,
12-
unsigned int const *strides,
9+
uint8_t *__restrict__ out,
10+
uint8_t const *__restrict__ in,
11+
int64_t const *__restrict__ indices,
12+
unsigned int const *__restrict__ strides,
1313
size_t rank,
1414
size_t blockSize) {
1515
for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
@@ -19,7 +19,7 @@ namespace refactor::kernel::cuda {
1919
unsigned int j = 0;
2020
auto i = indices + tid * rank;
2121
for (auto k = 0; k < rank; ++k) {
22-
j += i[k] * strides[k];
22+
j += i[k] * __ldg(strides + k);
2323
}
2424
optimizedMemcpy(out + j * blockSize,
2525
in + tid * blockSize,

src/04kernel/cuda/src/slice.cu

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,22 @@ namespace refactor::kernel::cuda {
66

77
__global__ static void sliceKernel(
88
unsigned long long n,
9-
uint8_t const *src, DimInfo const *dims, uint8_t *dst,
9+
uint8_t const *__restrict__ src,
10+
DimInfo const *__restrict__ dims,
11+
uint8_t *__restrict__ dst,
1012
unsigned int rank,
1113
unsigned int blockSize) {
12-
extern __shared__ DimInfo dimInfo[];
13-
for (auto i = threadIdx.x; i < rank; i += blockDim.x) {
14-
dimInfo[i] = dims[i];
15-
}
16-
__syncthreads();
1714
for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
1815
step = blockDim.x * gridDim.x;
1916
tid < n;
2017
tid += step) {
2118
long rem = tid, j = 0;
2219
for (auto i = 0; i < rank; ++i) {
23-
auto const &dim = dimInfo[i];
24-
j += rem / dim.strideO * dim.strideI + dim.skip;
25-
rem %= dim.strideO;
20+
auto strideO = __ldg(&(dims[i].strideO));
21+
auto strideI = __ldg(&(dims[i].strideI));
22+
auto skip = __ldg(&(dims[i].skip));
23+
j += rem / strideO * strideI + skip;
24+
rem %= strideO;
2625
}
2726
optimizedMemcpy(dst + tid * blockSize, src + j * blockSize, blockSize);
2827
}
@@ -36,7 +35,7 @@ namespace refactor::kernel::cuda {
3635
sliceKernel<<<
3736
params.gridSize,
3837
params.blockSize,
39-
rank * sizeof(DimInfo),
38+
0,
4039
reinterpret_cast<cudaStream_t>(params.stream)>>>(
4140
params.n,
4241
reinterpret_cast<uint8_t const *>(src),

src/04kernel/cuda/src/split.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ namespace refactor::kernel::cuda {
3737
splitKernel<<<
3838
params.gridSize,
3939
params.blockSize,
40-
outputCount *(sizeof(unsigned int) + sizeof(void *)),
40+
outputCount * (sizeof(unsigned int) + sizeof(void *)),
4141
reinterpret_cast<cudaStream_t>(params.stream)>>>(
4242
params.n,
4343
reinterpret_cast<uint8_t const *>(data),

src/04kernel/cuda/src/transpose.cu

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,21 @@ namespace refactor::kernel::cuda {
66

77
__global__ static void transposeKernel(
88
unsigned long long n,
9-
uint8_t const *data, transpose::DimStride const *strides, uint8_t *output,
9+
uint8_t const *__restrict__ data,
10+
transpose::DimStride const *__restrict__ strides,
11+
uint8_t *__restrict__ output,
1012
unsigned int rank,
1113
unsigned int eleSize) {
12-
extern __shared__ transpose::DimStride shared[];
13-
for (auto i = threadIdx.x; i < rank; i += blockDim.x) {
14-
shared[i] = strides[i];
15-
}
16-
__syncthreads();
1714
for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
1815
step = blockDim.x * gridDim.x;
1916
tid < n;
2017
tid += step) {
2118
auto j = 0u, rem = tid;
2219
for (auto k = 0u; k < rank; ++k) {
23-
auto d = shared[k];
24-
j += rem / d.o * d.i;
25-
rem %= d.o;
20+
auto o_ = __ldg(&(strides[k].o));
21+
auto i_ = __ldg(&(strides[k].i));
22+
j += rem / o_ * i_;
23+
rem %= o_;
2624
}
2725

2826
optimizedMemcpy(output + tid * eleSize, data + j * eleSize, eleSize);
@@ -37,7 +35,7 @@ namespace refactor::kernel::cuda {
3735
transposeKernel<<<
3836
params.gridSize,
3937
params.blockSize,
40-
rank * sizeof(transpose::DimStride),
38+
0,
4139
reinterpret_cast<cudaStream_t>(params.stream)>>>(
4240
params.n,
4341
reinterpret_cast<uint8_t const *>(data),

src/04kernel/cuda/src/where.cu

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,30 +6,25 @@ namespace refactor::kernel::cuda {
66

77
__global__ static void whereKernel(
88
unsigned long long n,
9-
unsigned int const *strides,
10-
bool const *c,
11-
uint8_t const *x,
12-
uint8_t const *y,
13-
uint8_t *output,
9+
unsigned int const *__restrict__ strides,
10+
bool const *__restrict__ c,
11+
uint8_t const *__restrict__ x,
12+
uint8_t const *__restrict__ y,
13+
uint8_t *__restrict__ output,
1414
unsigned int rank,
1515
unsigned int eleSize) {
16-
extern __shared__ unsigned int shared[];
17-
for (auto i = threadIdx.x; i < rank * 4; i += blockDim.x) {
18-
shared[i] = strides[i];
19-
}
20-
__syncthreads();
2116
for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
2217
step = blockDim.x * gridDim.x;
2318
tid < n;
2419
tid += step) {
2520
auto ic = 0u, ix = 0u, iy = 0u, rem = tid;
2621
for (auto j = 0u; j < rank; ++j) {
27-
auto dim = shared + 4 * j;
28-
auto quot = rem / dim[3];
29-
rem %= dim[3];
30-
ic += quot * dim[0];
31-
ix += quot * dim[1];
32-
iy += quot * dim[2];
22+
auto dim = strides + 4 * j;
23+
auto quot = rem / __ldg(dim + 3);
24+
rem %= __ldg(dim + 3);
25+
ic += quot * __ldg(dim + 0);
26+
ix += quot * __ldg(dim + 1);
27+
iy += quot * __ldg(dim + 2);
3328
}
3429

3530
optimizedMemcpy(output + tid * eleSize,
@@ -52,7 +47,7 @@ namespace refactor::kernel::cuda {
5247
whereKernel<<<
5348
params.gridSize,
5449
params.blockSize,
55-
rank * sizeof(unsigned int) * 4,
50+
0,
5651
reinterpret_cast<cudaStream_t>(params.stream)>>>(
5752
params.n,
5853
strides,

0 commit comments

Comments (0)