4 changes: 3 additions & 1 deletion paddle/phi/kernels/funcs/broadcast_function.h
@@ -212,7 +212,9 @@ struct BroadcastDataLoader<Index, VecSize, false, kElementwise> {
using VecType = phi::kps::details::VectorType<Type, VecSize>;
VecType vec_temp;

int thread_offset = threadIdx.x + blockIdx.x * blockDim.x;
int64_t thread_offset =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
const VecType *__restrict__ vec_input =
reinterpret_cast<const VecType *__restrict__>(ins[Index]);
vec_temp = vec_input[thread_offset];
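Every hunk in this file (and in most of the files below) applies the same fix: the flattened thread offset is accumulated in `int64_t` rather than `int`, so kernels that cover more than 2^31 - 1 elements no longer overflow the index. A minimal, self-contained sketch of the pattern — the kernel name, launch shape, and driver code are illustrative, not taken from this PR:

```cuda
#include <cstdint>
#include <cuda_runtime.h>

// Illustrative only: a copy kernel whose flattened offset is accumulated in
// 64-bit. With a plain int offset, blockIdx.x * blockDim.x exceeds INT_MAX
// once the launch covers more than ~2.1 billion threads (e.g. ~16.8M blocks
// of 128 threads), and the 32-bit index goes wrong.
template <typename T>
__global__ void Copy64BitIndex(const T* in, T* out, int64_t n) {
  int64_t offset =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (offset < n) {
    out[offset] = in[offset];
  }
}

int main() {
  const int64_t n = int64_t{1} << 20;  // small demo size; the cast matters past 2^31
  float *in = nullptr, *out = nullptr;
  cudaMalloc(&in, n * sizeof(float));
  cudaMalloc(&out, n * sizeof(float));
  const int threads = 256;
  const int64_t blocks = (n + threads - 1) / threads;
  Copy64BitIndex<float><<<static_cast<unsigned int>(blocks), threads>>>(in, out, n);
  cudaDeviceSynchronize();
  cudaFree(in);
  cudaFree(out);
  return 0;
}
```

Casting the first operand is enough: once `blockIdx.x` is widened, the multiplication and addition are carried out in 64-bit. The PR casts every operand, which is more verbose but makes the intent explicit.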
8 changes: 6 additions & 2 deletions paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h
@@ -128,7 +128,9 @@ __global__ void KeFastCollectiveGruGate(T *gate_value,
T c0 = 0.0f;
T b0[Tiled_size];

int COL = blockIdx.x * blockDim.x + threadIdx.x;
int64_t COL =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int Tiled_mask = ((1 << Tiled_size) - 1);
// Tiled matrix multiply using register shift, faster than sm.
if (prev_output_value) {
@@ -185,7 +187,9 @@ __global__ void KeFastCollectiveGruOut(const T *gate_weight,
int frame_size,
ActivationType act_node,
bool origin_mode) {
int COL = blockIdx.x * blockDim.x + threadIdx.x;
int64_t COL =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);

T a0 = 0.0f;
T b0[Tiled_size];
12 changes: 9 additions & 3 deletions paddle/phi/kernels/funcs/fake_quantize_functor.cu
@@ -29,7 +29,9 @@ struct QuantizeDataType<phi::float16> {

template <typename T>
__global__ void FindAbsMaxKernel(const T *in, const int64_t n, T *out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t bid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
int tid = threadIdx.x;

extern __shared__ char *shared_max_data_tmp[];
@@ -70,7 +72,9 @@ __global__ void ClipAndQuantKernel(const T *in,
const int round_type,
const int64_t n,
T *out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t bid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
int tid = threadIdx.x;

using ComputeDataType = typename QuantizeDataType<T>::type;
@@ -155,7 +159,9 @@ __global__ void ClipAndQuantDequantKernel(const T *in,
const int round_type,
const int64_t n,
T *out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t bid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
int tid = threadIdx.x;

using ComputeDataType = typename QuantizeDataType<T>::type;
4 changes: 3 additions & 1 deletion paddle/phi/kernels/funcs/fc_functor.cu
@@ -63,7 +63,9 @@ struct FcTypeTraits<float16> {

template <typename T, bool DoRelu>
__global__ void bias_relu_v4(const int num, const T* bias, T* data, int K) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t tid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
if (tid < num) {
int bias_idx = tid % K;
const T bias_ptr = bias[bias_idx];
5 changes: 4 additions & 1 deletion paddle/phi/kernels/funcs/math_function.cu
@@ -209,7 +209,10 @@ DEFINE_GPU_TRANS(6);

template <typename T>
__global__ void FillConstantKernel(const int N, T* a, const T val) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
for (int64_t i =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
i < N;
i += blockDim.x * gridDim.x) {
a[i] = val;
}
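`FillConstantKernel` now starts its grid-stride loop from a 64-bit index (the element count `N` itself stays an `int` in this kernel). A hedged sketch of the same grid-stride idiom written against an `int64_t` count — the names are illustrative, not Paddle's API:

```cuda
#include <cstdint>

// Illustrative grid-stride fill: the induction variable (and, for good
// measure, the stride) are 64-bit, so a single launch can cover more than
// INT_MAX elements without the index wrapping.
template <typename T>
__global__ void FillGridStride(T* data, int64_t n, T value) {
  const int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  for (int64_t i =
           static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       i < n; i += stride) {
    data[i] = value;
  }
}
```

Widening the stride is optional as long as `blockDim.x * gridDim.x` fits in 32 bits; the PR leaves the increment expression untouched and relies on the 64-bit induction variable for the comparison and the addition.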
8 changes: 6 additions & 2 deletions paddle/phi/kernels/funcs/norm_utils.cu.h
@@ -370,7 +370,9 @@ __global__ void DoubleGradComputeDXWithGlobal(const T *dy,
const int sample_size,
const int64_t num,
T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int stride = blockDim.x * gridDim.x;
if (ddscale != nullptr) {
for (int64_t i = gid; i < num; i += stride) {
@@ -397,7 +399,9 @@ __global__ void DoubleGradComputeDDYWithGlobal(const T *ddx,
const int sample_size,
const int64_t num,
T *ddy) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int stride = blockDim.x * gridDim.x;

if (ddx != nullptr) {
54 changes: 42 additions & 12 deletions paddle/phi/kernels/funcs/quant_dequant.h
@@ -91,8 +91,13 @@ __global__ void QuantKernel(const T* input,
const int round_type,
const float max_bound,
const float min_bound) {
int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
int m_id = blockIdx.y * blockDim.y + threadIdx.y;
int64_t n_id =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x))
<< 2;
int64_t m_id =
static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
static_cast<int64_t>(threadIdx.y);

bool check = ((m_id < m) && (n_id < n));
if (check) {
@@ -118,8 +123,13 @@ __global__ void QuantKernelWithVecSize(const T* input,
const int round_type,
const float max_bound,
const float min_bound) {
int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
int m_id = blockIdx.y * blockDim.y + threadIdx.y;
int64_t n_id =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x))
<< 2;
int64_t m_id =
static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
static_cast<int64_t>(threadIdx.y);

bool check = ((m_id < m) && (n_id < n));
if (check) {
@@ -145,8 +155,13 @@ __global__ void QuantKernelWithVecSize(const T* input,
const int round_type,
const float max_bound,
const float min_bound) {
int n_id = (blockIdx.x * blockDim.x + threadIdx.x) * 3;
int m_id = blockIdx.y * blockDim.y + threadIdx.y;
int64_t n_id =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x)) *
3;
int64_t m_id =
static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
static_cast<int64_t>(threadIdx.y);

bool check = ((m_id < m) && (n_id < n));
if (check) {
@@ -170,8 +185,13 @@ __global__ void QuantKernelWithVecSize(const T* input,
const int round_type,
const float max_bound,
const float min_bound) {
int n_id = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
int m_id = blockIdx.y * blockDim.y + threadIdx.y;
int64_t n_id =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x)) *
2;
int64_t m_id =
static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
static_cast<int64_t>(threadIdx.y);

bool check = ((m_id < m) && (n_id < n));
if (check) {
@@ -193,8 +213,12 @@ __global__ void QuantKernelWithVecSize(const T* input,
const int round_type,
const float max_bound,
const float min_bound) {
int n_id = (blockIdx.x * blockDim.x + threadIdx.x);
int m_id = blockIdx.y * blockDim.y + threadIdx.y;
int64_t n_id =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x));
int64_t m_id =
static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
static_cast<int64_t>(threadIdx.y);

bool check = ((m_id < m) && (n_id < n));
if (check) {
@@ -320,7 +344,10 @@ __global__ void DequantKernel(T* output,
const float* dequant_out_scale_data) {
int numel = m * n;
int stride = blockDim.x * gridDim.x * VecSize;
int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize;
int64_t idx =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x)) *
VecSize;
int col_id = idx % n;

phi::AlignedVector<int32_t, VecSize> in_vec;
@@ -366,7 +393,10 @@ __global__ void DequantKernelWithScaleOfInputAndWeight(
float quant_max_bound) {
int numel = m * n;
int stride = blockDim.x * gridDim.x * VecSize;
int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize;
int64_t idx =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x)) *
VecSize;
int col_id = idx % n;

phi::AlignedVector<int32_t, VecSize> in_vec;
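In these quant/dequant kernels the linear thread id is scaled — shifted by 2, multiplied by 3 or 2, or multiplied by `VecSize` — before it is used as an element offset, so the widening has to happen before the scaling: casting only the final product would be too late, since the 32-bit multiply would already have wrapped. A minimal sketch of that ordering, with illustrative names:

```cuda
#include <cstdint>

// Illustrative vectorized access: each thread owns VecSize consecutive
// elements. The base offset is computed entirely in 64-bit so that
// tid * VecSize cannot wrap even when tid on its own still fits in 32 bits.
template <typename T, int VecSize>
__global__ void ScaleVectorized(T* data, int64_t numel, T scale) {
  const int64_t tid =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int64_t idx = tid * VecSize;  // 64-bit multiply
  if (idx + VecSize <= numel) {
#pragma unroll
    for (int v = 0; v < VecSize; ++v) {
      data[idx + v] = data[idx + v] * scale;
    }
  }
}
```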
3 changes: 2 additions & 1 deletion paddle/phi/kernels/funcs/scatter.cu.h
@@ -402,7 +402,8 @@ inline DenseTensor restride_dim(const phi::DenseTensor& src,
template <int nt, int vt, typename func_t>
__global__ void scatter_gather_elementwise_kernel(int N, func_t f) {
constexpr int nv = nt * vt;
int idx = nv * blockIdx.x + threadIdx.x;
int64_t idx =
nv * static_cast<int64_t>(blockIdx.x) + static_cast<int64_t>(threadIdx.x);

#pragma unroll
for (int i = 0; i < vt; ++i) {
8 changes: 6 additions & 2 deletions paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h
@@ -26,7 +26,9 @@ __global__ void FlattenIndicesKernel(const IntT* indices,
const int64_t non_zero_num,
const int64_t sparse_dim,
IntT* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t tid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
phi::funcs::sparse::FlattenIndices<IntT>(indices,
sparse_offsets,
non_zero_num,
@@ -42,7 +44,9 @@ __global__ void IndexToCoordinateKernel(const IntT* index,
const int64_t non_zero_num,
const int64_t sparse_dim,
IntT* indices) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t tid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
IndexToCoordinate(index,
dims,
non_zero_num,
8 changes: 6 additions & 2 deletions paddle/phi/kernels/funcs/sparse/scatter.cu.h
@@ -41,7 +41,9 @@ __global__ void ScatterKernel(const T* input,
const int rulebook_len,
const int channels,
T* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t tid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
const int vec_channels = channels / VecSize;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
@@ -82,7 +84,9 @@ __global__ void ScatterKernelV2(const T* input,
const int channels,
const int buffer_counts,
T* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t tid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
const int vec_channels = channels / VecSize;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
31 changes: 24 additions & 7 deletions paddle/phi/kernels/funcs/sync_batch_norm_utils.h
@@ -87,7 +87,9 @@ __global__ void KeSyncAndMovingStats(BatchNormParamType<T> *means,
BatchNormParamType<T> *moving_means,
BatchNormParamType<T> *moving_variances) {
// sync stats across multi-devices
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < C; i += stride) {
auto mean = means[i] / (*num_dev);
@@ -117,7 +119,9 @@ static __global__ void KeNormAffine(const T *x,
const int M,
const int64_t num,
T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int stride = blockDim.x * gridDim.x;
for (int64_t i = gid; i < num; i += stride) {
const int c = layout == DataLayout::kNCHW ? (i / M) % C : i % C;
@@ -180,12 +184,18 @@ __global__ void KeBackwardLocalStats2D(const T *dy,
BatchNormParamType<T> *sum_dy_prod) {
__shared__ BatchNormParamType<T> smem_sum[BlockDim];
__shared__ BatchNormParamType<T> smem_square_sum[BlockDim];
for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < C;
for (int64_t k =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
k < C;
k += gridDim.x * blockDim.x) {
BatchNormParamType<T> sum1 = 0.;
BatchNormParamType<T> sum2 = 0.;
auto mean = means[k];
for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < N * M;
for (int64_t i = static_cast<int64_t>(blockIdx.y) *
static_cast<int64_t>(blockDim.y) +
static_cast<int64_t>(threadIdx.y);
i < N * M;
i += gridDim.y * blockDim.y) {
int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M
: i * C + k;
@@ -287,7 +297,10 @@ static __global__ void KeBNBackwardScaleBias2D(
__shared__ BatchNormParamType<T> smem_sum[BlockDim];
__shared__ BatchNormParamType<T> smem_square_sum[BlockDim];

for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size;
for (int64_t i =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
i < outer_size;
i += gridDim.x * blockDim.x) {
BatchNormParamType<T> ds_sum = 0.;
BatchNormParamType<T> db_sum = 0.;
@@ -341,7 +354,9 @@ static __global__ void KeBNRestoreData(T *x,
int M,
int64_t num,
const T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int stride = blockDim.x * gridDim.x;
for (int64_t i = gid; i < num; i += stride) {
const int64_t c = layout == DataLayout::kNCHW ? (i / M) % C : i % C;
@@ -366,7 +381,9 @@ static __global__ void KeBNBackwardData(
const int64_t HxW,
const int64_t num,
T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int stride = blockDim.x * gridDim.x;
auto scale = static_cast<BatchNormParamType<T>>(C) / num;
auto dev_num = num_dev[0];
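The 2D statistics kernels widen both grid dimensions. The y-derived index is the one that walks over `N * M` elements per channel and can therefore exceed `INT_MAX`; widening the x-derived channel index as well keeps the kernels uniform. A hedged sketch of the 2D pattern — an illustrative kernel, not the sync-BN implementation:

```cuda
#include <cstdint>

// Illustrative 2D grid-stride loops: x walks channels, y walks the (much
// larger) per-channel element range.
__global__ void PerChannelSum(const float* x, float* out, int C,
                              int64_t elems_per_c) {
  for (int64_t c = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       c < C; c += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    float partial = 0.f;
    for (int64_t i =
             static_cast<int64_t>(blockIdx.y) * blockDim.y + threadIdx.y;
         i < elems_per_c;
         i += static_cast<int64_t>(gridDim.y) * blockDim.y) {
      partial += x[c * elems_per_c + i];
    }
    // Partial sums from different y-blocks are combined atomically here;
    // the kernels above accumulate through shared memory (the smem_sum
    // buffers) instead.
    atomicAdd(&out[c], partial);
  }
}
```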
16 changes: 12 additions & 4 deletions paddle/phi/kernels/funcs/weight_dequant_functor.h
@@ -128,7 +128,9 @@ __global__ void int8_weight_only_dequant(const uint8_t* weight,
AlignedVector<T, 16> vec_out;

int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;
int tile_id = blockIdx.x * blockDim.x / 32 + warp_id;
int64_t tile_id =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) / 32 +
warp_id;
// Every two rows of the original weights are interleaved into a row with
// stride of 64, so if each thread processes 16 elements(for int8, we can use
// ldg.128 to load weights), then every group of four adjacent threads will
@@ -184,7 +186,9 @@ __global__ void int4_weight_only_dequant(const uint8_t* weight,
AlignedVector<T, 32> vec_out;

int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;
int tile_id = blockIdx.x * blockDim.x / 32 + warp_id;
int64_t tile_id =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) / 32 +
warp_id;
// Every 4 rows of the original weights are interleaved into a row with
// stride of 32, so if each thread processes 16 elements(for int8, we can use
// ldg.128 to load weights), then every group of two adjacent threads will
@@ -242,7 +246,9 @@ __global__ void int8_weight_only_dequant(const uint8_t* weight,
AlignedVector<T, 16> vec_out;

int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;
int tile_id = blockIdx.x * blockDim.x / 32 + warp_id;
int64_t tile_id =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) / 32 +
warp_id;
// Every two rows of the original weights are interleaved into a row with
// stride of 64, so if each thread processes 16 elements(for int8, we can use
// ldg.128 to load weights), then every group of four adjacent threads will
@@ -302,7 +308,9 @@ __global__ void int4_weight_only_dequant(const uint8_t* weight,
AlignedVector<T, 32> vec_out;

int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;
int tile_id = blockIdx.x * blockDim.x / 32 + warp_id;
int64_t tile_id =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) / 32 +
warp_id;
// Every two rows of the original weights are interleaved into a row with
// stride of 64, so if each thread processes 16 elements(for int8, we can use
// ldg.128 to load weights), then every group of four adjacent threads will
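The weight-only dequant kernels index whole tiles per warp: the widened product of `blockIdx.x` and `blockDim.x` is divided by the warp size before the warp id is added. A hedged sketch of that tile-id computation — an illustrative kernel, not the Paddle implementation:

```cuda
#include <cstdint>

// Illustrative per-warp tiling: each warp owns one tile of tile_elems
// elements, and the tile id comes from a 64-bit product so launches with
// more than INT_MAX threads still address the right tile.
template <typename T>
__global__ void PerWarpTileCopy(const T* in, T* out, int64_t num_tiles,
                                int tile_elems) {
  const int warp_id = threadIdx.x / 32;
  const int lane_id = threadIdx.x % 32;
  const int64_t tile_id =
      static_cast<int64_t>(blockIdx.x) * blockDim.x / 32 + warp_id;
  if (tile_id >= num_tiles) return;
  // Lanes stride across the tile their warp owns.
  for (int e = lane_id; e < tile_elems; e += 32) {
    out[tile_id * tile_elems + e] = in[tile_id * tile_elems + e];
  }
}
```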