4 changes: 3 additions & 1 deletion paddle/phi/kernels/funcs/broadcast_function.h
@@ -212,7 +212,9 @@ struct BroadcastDataLoader<Index, VecSize, false, kElementwise> {
using VecType = phi::kps::details::VectorType<Type, VecSize>;
VecType vec_temp;

int thread_offset = threadIdx.x + blockIdx.x * blockDim.x;
int64_t thread_offset =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
const VecType *__restrict__ vec_input =
reinterpret_cast<const VecType *__restrict__>(ins[Index]);
vec_temp = vec_input[thread_offset];
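Every hunk in this file (and in most of the files below) applies the same fix: the flattened thread offset is accumulated in `int64_t` rather than `int`, so kernels that cover more than 2^31 - 1 elements no longer overflow the index. A minimal, self-contained sketch of the pattern — the kernel name, launch shape, and driver code are illustrative, not taken from this PR:

```cuda
#include <cstdint>
#include <cuda_runtime.h>

// Illustrative only: a copy kernel whose flattened offset is accumulated in
// 64-bit. With a plain int offset, blockIdx.x * blockDim.x exceeds INT_MAX
// once the launch covers more than ~2.1 billion threads (e.g. ~16.8M blocks
// of 128 threads), and the 32-bit index goes wrong.
template <typename T>
__global__ void Copy64BitIndex(const T* in, T* out, int64_t n) {
  int64_t offset =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (offset < n) {
    out[offset] = in[offset];
  }
}

int main() {
  const int64_t n = int64_t{1} << 20;  // small demo size; the cast matters past 2^31
  float *in = nullptr, *out = nullptr;
  cudaMalloc(&in, n * sizeof(float));
  cudaMalloc(&out, n * sizeof(float));
  const int threads = 256;
  const int64_t blocks = (n + threads - 1) / threads;
  Copy64BitIndex<float><<<static_cast<unsigned int>(blocks), threads>>>(in, out, n);
  cudaDeviceSynchronize();
  cudaFree(in);
  cudaFree(out);
  return 0;
}
```

Casting the first operand is enough: once `blockIdx.x` is widened, the multiplication and addition are carried out in 64-bit. The PR casts every operand, which is more verbose but makes the intent explicit.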
8 changes: 6 additions & 2 deletions paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h
@@ -128,7 +128,9 @@ __global__ void KeFastCollectiveGruGate(T *gate_value,
T c0 = 0.0f;
T b0[Tiled_size];

int COL = blockIdx.x * blockDim.x + threadIdx.x;
int64_t COL =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int Tiled_mask = ((1 << Tiled_size) - 1);
// Tiled matrix multiply using register shift, faster than sm.
if (prev_output_value) {
@@ -185,7 +187,9 @@ __global__ void KeFastCollectiveGruOut(const T *gate_weight,
int frame_size,
ActivationType act_node,
bool origin_mode) {
int COL = blockIdx.x * blockDim.x + threadIdx.x;
int64_t COL =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);

T a0 = 0.0f;
T b0[Tiled_size];
12 changes: 9 additions & 3 deletions paddle/phi/kernels/funcs/fake_quantize_functor.cu
@@ -29,7 +29,9 @@ struct QuantizeDataType<phi::float16> {

template <typename T>
__global__ void FindAbsMaxKernel(const T *in, const int64_t n, T *out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t bid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
int tid = threadIdx.x;

extern __shared__ char *shared_max_data_tmp[];
@@ -70,7 +72,9 @@ __global__ void ClipAndQuantKernel(const T *in,
const int round_type,
const int64_t n,
T *out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t bid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
int tid = threadIdx.x;

using ComputeDataType = typename QuantizeDataType<T>::type;
@@ -155,7 +159,9 @@ __global__ void ClipAndQuantDequantKernel(const T *in,
const int round_type,
const int64_t n,
T *out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t bid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
int tid = threadIdx.x;

using ComputeDataType = typename QuantizeDataType<T>::type;
4 changes: 3 additions & 1 deletion paddle/phi/kernels/funcs/fc_functor.cu
@@ -63,7 +63,9 @@ struct FcTypeTraits<float16> {

template <typename T, bool DoRelu>
__global__ void bias_relu_v4(const int num, const T* bias, T* data, int K) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t tid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
if (tid < num) {
int bias_idx = tid % K;
const T bias_ptr = bias[bias_idx];
5 changes: 4 additions & 1 deletion paddle/phi/kernels/funcs/math_function.cu
@@ -209,7 +209,10 @@ DEFINE_GPU_TRANS(6);

template <typename T>
__global__ void FillConstantKernel(const int N, T* a, const T val) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
for (int64_t i =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
i < N;
i += blockDim.x * gridDim.x) {
a[i] = val;
}
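`FillConstantKernel` now starts its grid-stride loop from a 64-bit index (the element count `N` itself stays an `int` in this kernel). A hedged sketch of the same grid-stride idiom written against an `int64_t` count — the names are illustrative, not Paddle's API:

```cuda
#include <cstdint>

// Illustrative grid-stride fill: the induction variable (and, for good
// measure, the stride) are 64-bit, so a single launch can cover more than
// INT_MAX elements without the index wrapping.
template <typename T>
__global__ void FillGridStride(T* data, int64_t n, T value) {
  const int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  for (int64_t i =
           static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       i < n; i += stride) {
    data[i] = value;
  }
}
```

Widening the stride is optional as long as `blockDim.x * gridDim.x` fits in 32 bits; the PR leaves the increment expression untouched and relies on the 64-bit induction variable for the comparison and the addition.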
8 changes: 6 additions & 2 deletions paddle/phi/kernels/funcs/norm_utils.cu.h
@@ -370,7 +370,9 @@ __global__ void DoubleGradComputeDXWithGlobal(const T *dy,
const int sample_size,
const int64_t num,
T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int stride = blockDim.x * gridDim.x;
if (ddscale != nullptr) {
for (int64_t i = gid; i < num; i += stride) {
@@ -397,7 +399,9 @@ __global__ void DoubleGradComputeDDYWithGlobal(const T *ddx,
const int sample_size,
const int64_t num,
T *ddy) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int stride = blockDim.x * gridDim.x;

if (ddx != nullptr) {
54 changes: 42 additions & 12 deletions paddle/phi/kernels/funcs/quant_dequant.h
@@ -91,8 +91,13 @@ __global__ void QuantKernel(const T* input,
const int round_type,
const float max_bound,
const float min_bound) {
int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
int m_id = blockIdx.y * blockDim.y + threadIdx.y;
int64_t n_id =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x))
<< 2;
int64_t m_id =
static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
static_cast<int64_t>(threadIdx.y);

bool check = ((m_id < m) && (n_id < n));
if (check) {
@@ -118,8 +123,13 @@ __global__ void QuantKernelWithVecSize(const T* input,
const int round_type,
const float max_bound,
const float min_bound) {
int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
int m_id = blockIdx.y * blockDim.y + threadIdx.y;
int64_t n_id =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x))
<< 2;
int64_t m_id =
static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
static_cast<int64_t>(threadIdx.y);

bool check = ((m_id < m) && (n_id < n));
if (check) {
@@ -145,8 +155,13 @@ __global__ void QuantKernelWithVecSize(const T* input,
const int round_type,
const float max_bound,
const float min_bound) {
int n_id = (blockIdx.x * blockDim.x + threadIdx.x) * 3;
int m_id = blockIdx.y * blockDim.y + threadIdx.y;
int64_t n_id =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x)) *
3;
int64_t m_id =
static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
static_cast<int64_t>(threadIdx.y);

bool check = ((m_id < m) && (n_id < n));
if (check) {
@@ -170,8 +185,13 @@ __global__ void QuantKernelWithVecSize(const T* input,
const int round_type,
const float max_bound,
const float min_bound) {
int n_id = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
int m_id = blockIdx.y * blockDim.y + threadIdx.y;
int64_t n_id =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x)) *
2;
int64_t m_id =
static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
static_cast<int64_t>(threadIdx.y);

bool check = ((m_id < m) && (n_id < n));
if (check) {
@@ -193,8 +213,12 @@ __global__ void QuantKernelWithVecSize(const T* input,
const int round_type,
const float max_bound,
const float min_bound) {
int n_id = (blockIdx.x * blockDim.x + threadIdx.x);
int m_id = blockIdx.y * blockDim.y + threadIdx.y;
int64_t n_id =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x));
int64_t m_id =
static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
static_cast<int64_t>(threadIdx.y);

bool check = ((m_id < m) && (n_id < n));
if (check) {
@@ -320,7 +344,10 @@ __global__ void DequantKernel(T* output,
const float* dequant_out_scale_data) {
int numel = m * n;
int stride = blockDim.x * gridDim.x * VecSize;
int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize;
int64_t idx =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x)) *
VecSize;
int col_id = idx % n;

phi::AlignedVector<int32_t, VecSize> in_vec;
@@ -366,7 +393,10 @@ __global__ void DequantKernelWithScaleOfInputAndWeight(
float quant_max_bound) {
int numel = m * n;
int stride = blockDim.x * gridDim.x * VecSize;
int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize;
int64_t idx =
(static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x)) *
VecSize;
int col_id = idx % n;

phi::AlignedVector<int32_t, VecSize> in_vec;
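In these quant/dequant kernels the linear thread id is scaled — shifted by 2, multiplied by 3 or 2, or multiplied by `VecSize` — before it is used as an element offset, so the widening has to happen before the scaling: casting only the final product would be too late, since the 32-bit multiply would already have wrapped. A minimal sketch of that ordering, with illustrative names:

```cuda
#include <cstdint>

// Illustrative vectorized access: each thread owns VecSize consecutive
// elements. The base offset is computed entirely in 64-bit so that
// tid * VecSize cannot wrap even when tid on its own still fits in 32 bits.
template <typename T, int VecSize>
__global__ void ScaleVectorized(T* data, int64_t numel, T scale) {
  const int64_t tid =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int64_t idx = tid * VecSize;  // 64-bit multiply
  if (idx + VecSize <= numel) {
#pragma unroll
    for (int v = 0; v < VecSize; ++v) {
      data[idx + v] = data[idx + v] * scale;
    }
  }
}
```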
3 changes: 2 additions & 1 deletion paddle/phi/kernels/funcs/scatter.cu.h
@@ -402,7 +402,8 @@ inline DenseTensor restride_dim(const phi::DenseTensor& src,
template <int nt, int vt, typename func_t>
__global__ void scatter_gather_elementwise_kernel(int N, func_t f) {
constexpr int nv = nt * vt;
int idx = nv * blockIdx.x + threadIdx.x;
int64_t idx =
nv * static_cast<int64_t>(blockIdx.x) + static_cast<int64_t>(threadIdx.x);

#pragma unroll
for (int i = 0; i < vt; ++i) {
8 changes: 6 additions & 2 deletions paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h
@@ -26,7 +26,9 @@ __global__ void FlattenIndicesKernel(const IntT* indices,
const int64_t non_zero_num,
const int64_t sparse_dim,
IntT* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t tid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
phi::funcs::sparse::FlattenIndices<IntT>(indices,
sparse_offsets,
non_zero_num,
@@ -42,7 +44,9 @@ __global__ void IndexToCoordinateKernel(const IntT* index,
const int64_t non_zero_num,
const int64_t sparse_dim,
IntT* indices) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t tid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
IndexToCoordinate(index,
dims,
non_zero_num,
8 changes: 6 additions & 2 deletions paddle/phi/kernels/funcs/sparse/scatter.cu.h
@@ -41,7 +41,9 @@ __global__ void ScatterKernel(const T* input,
const int rulebook_len,
const int channels,
T* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t tid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
const int vec_channels = channels / VecSize;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
@@ -82,7 +84,9 @@ __global__ void ScatterKernelV2(const T* input,
const int channels,
const int buffer_counts,
T* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int64_t tid =
static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
const int vec_channels = channels / VecSize;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
31 changes: 24 additions & 7 deletions paddle/phi/kernels/funcs/sync_batch_norm_utils.h
@@ -87,7 +87,9 @@ __global__ void KeSyncAndMovingStats(BatchNormParamType<T> *means,
BatchNormParamType<T> *moving_means,
BatchNormParamType<T> *moving_variances) {
// sync stats across multi-devices
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < C; i += stride) {
auto mean = means[i] / (*num_dev);
@@ -117,7 +119,9 @@ static __global__ void KeNormAffine(const T *x,
const int M,
const int64_t num,
T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int stride = blockDim.x * gridDim.x;
for (int64_t i = gid; i < num; i += stride) {
const int c = layout == DataLayout::kNCHW ? (i / M) % C : i % C;
@@ -180,12 +184,18 @@ __global__ void KeBackwardLocalStats2D(const T *dy,
BatchNormParamType<T> *sum_dy_prod) {
__shared__ BatchNormParamType<T> smem_sum[BlockDim];
__shared__ BatchNormParamType<T> smem_square_sum[BlockDim];
for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < C;
for (int64_t k =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
k < C;
k += gridDim.x * blockDim.x) {
BatchNormParamType<T> sum1 = 0.;
BatchNormParamType<T> sum2 = 0.;
auto mean = means[k];
for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < N * M;
for (int64_t i = static_cast<int64_t>(blockIdx.y) *
static_cast<int64_t>(blockDim.y) +
static_cast<int64_t>(threadIdx.y);
i < N * M;
i += gridDim.y * blockDim.y) {
int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M
: i * C + k;
@@ -287,7 +297,10 @@ static __global__ void KeBNBackwardScaleBias2D(
__shared__ BatchNormParamType<T> smem_sum[BlockDim];
__shared__ BatchNormParamType<T> smem_square_sum[BlockDim];

for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size;
for (int64_t i =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
i < outer_size;
i += gridDim.x * blockDim.x) {
BatchNormParamType<T> ds_sum = 0.;
BatchNormParamType<T> db_sum = 0.;
@@ -341,7 +354,9 @@ static __global__ void KeBNRestoreData(T *x,
int M,
int64_t num,
const T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int stride = blockDim.x * gridDim.x;
for (int64_t i = gid; i < num; i += stride) {
const int64_t c = layout == DataLayout::kNCHW ? (i / M) % C : i % C;
@@ -366,7 +381,9 @@ static __global__ void KeBNBackwardData(
const int64_t HxW,
const int64_t num,
T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gid =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
static_cast<int64_t>(threadIdx.x);
int stride = blockDim.x * gridDim.x;
auto scale = static_cast<BatchNormParamType<T>>(C) / num;
auto dev_num = num_dev[0];
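The 2D statistics kernels widen both grid dimensions. The y-derived index is the one that walks over `N * M` elements per channel and can therefore exceed `INT_MAX`; widening the x-derived channel index as well keeps the kernels uniform. A hedged sketch of the 2D pattern — an illustrative kernel, not the sync-BN implementation:

```cuda
#include <cstdint>

// Illustrative 2D grid-stride loops: x walks channels, y walks the (much
// larger) per-channel element range.
__global__ void PerChannelSum(const float* x, float* out, int C,
                              int64_t elems_per_c) {
  for (int64_t c = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       c < C; c += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    float partial = 0.f;
    for (int64_t i =
             static_cast<int64_t>(blockIdx.y) * blockDim.y + threadIdx.y;
         i < elems_per_c;
         i += static_cast<int64_t>(gridDim.y) * blockDim.y) {
      partial += x[c * elems_per_c + i];
    }
    // Partial sums from different y-blocks are combined atomically here;
    // the kernels above accumulate through shared memory (the smem_sum
    // buffers) instead.
    atomicAdd(&out[c], partial);
  }
}
```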
16 changes: 12 additions & 4 deletions paddle/phi/kernels/funcs/weight_dequant_functor.h
@@ -128,7 +128,9 @@ __global__ void int8_weight_only_dequant(const uint8_t* weight,
AlignedVector<T, 16> vec_out;

int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;
int tile_id = blockIdx.x * blockDim.x / 32 + warp_id;
int64_t tile_id =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) / 32 +
warp_id;
// Every two rows of the original weights are interleaved into a row with
// stride of 64, so if each thread processes 16 elements(for int8, we can use
// ldg.128 to load weights), then every group of four adjacent threads will
@@ -184,7 +186,9 @@ __global__ void int4_weight_only_dequant(const uint8_t* weight,
AlignedVector<T, 32> vec_out;

int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;
int tile_id = blockIdx.x * blockDim.x / 32 + warp_id;
int64_t tile_id =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) / 32 +
warp_id;
// Every 4 rows of the original weights are interleaved into a row with
// stride of 32, so if each thread processes 16 elements(for int8, we can use
// ldg.128 to load weights), then every group of two adjacent threads will
@@ -242,7 +246,9 @@ __global__ void int8_weight_only_dequant(const uint8_t* weight,
AlignedVector<T, 16> vec_out;

int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;
int tile_id = blockIdx.x * blockDim.x / 32 + warp_id;
int64_t tile_id =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) / 32 +
warp_id;
// Every two rows of the original weights are interleaved into a row with
// stride of 64, so if each thread processes 16 elements(for int8, we can use
// ldg.128 to load weights), then every group of four adjacent threads will
@@ -302,7 +308,9 @@ __global__ void int4_weight_only_dequant(const uint8_t* weight,
AlignedVector<T, 32> vec_out;

int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;
int tile_id = blockIdx.x * blockDim.x / 32 + warp_id;
int64_t tile_id =
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) / 32 +
warp_id;
// Every two rows of the original weights are interleaved into a row with
// stride of 64, so if each thread processes 16 elements(for int8, we can use
// ldg.128 to load weights), then every group of four adjacent threads will
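The weight-only dequant kernels index whole tiles per warp: the widened product of `blockIdx.x` and `blockDim.x` is divided by the warp size before the warp id is added. A hedged sketch of that tile-id computation — an illustrative kernel, not the Paddle implementation:

```cuda
#include <cstdint>

// Illustrative per-warp tiling: each warp owns one tile of tile_elems
// elements, and the tile id comes from a 64-bit product so launches with
// more than INT_MAX threads still address the right tile.
template <typename T>
__global__ void PerWarpTileCopy(const T* in, T* out, int64_t num_tiles,
                                int tile_elems) {
  const int warp_id = threadIdx.x / 32;
  const int lane_id = threadIdx.x % 32;
  const int64_t tile_id =
      static_cast<int64_t>(blockIdx.x) * blockDim.x / 32 + warp_id;
  if (tile_id >= num_tiles) return;
  // Lanes stride across the tile their warp owns.
  for (int e = lane_id; e < tile_elems; e += 32) {
    out[tile_id * tile_elems + e] = in[tile_id * tile_elems + e];
  }
}
```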