From f6375158b9a1f2d3b0f2373414a49e18dafaec8e Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Thu, 31 Oct 2024 14:18:18 +0800 Subject: [PATCH 01/10] random_sample_workspace --- src/ops/random_sample/cuda/random_sample.cu | 34 ++++++++----------- src/ops/random_sample/cuda/random_sample.cuh | 5 +-- .../random_sample/cuda/random_sample_cuda.cc | 8 +++-- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu index 40761e89..7d00e4a1 100644 --- a/src/ops/random_sample/cuda/random_sample.cu +++ b/src/ops/random_sample/cuda/random_sample.cu @@ -84,26 +84,26 @@ void inclusive_sum( data, data, voc, stream); } -template + void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, - int voc, cudaStream_t stream) { + int voc) { - sort_pairs_descending(nullptr, size_radix_sort, - nullptr, nullptr, - nullptr, nullptr, - voc, stream); + sort_pairs_descending(nullptr, size_radix_sort, + nullptr, nullptr, + nullptr, nullptr, + voc, nullptr); - inclusive_sum( + inclusive_sum( nullptr, size_scan, nullptr, voc, - stream); + nullptr); } __global__ void random_sample_kernel(uint64_t *result, uint64_t *key_out) { result[0] = key_out[0]; } -void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace, void *result, +void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace, uint64_t workspace_size, void *result, void const *probs, float random_val, float topp, @@ -121,14 +121,11 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc); //下面开始计算workspace空间 - size_t size_radix_sort; - size_t size_scan; - random_sample_workspace(size_radix_sort, size_scan, - voc, (cudaStream_t) stream); - void *workspace_extra; - cudaMalloc(&workspace_extra, size_radix_sort + size_scan); + + void *workspace_extra = reinterpret_cast(workspace) + 2 * voc * sizeof(half) + voc * sizeof(uint64_t); + uint64_t workspace_len = workspace_size - 2 * voc * sizeof(half) - voc * sizeof(uint64_t); sort_pairs_descending( - workspace_extra, size_radix_sort, + workspace_extra, workspace_len, (half *) probs, val_out, key_in, key_out, voc, (cudaStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上 @@ -141,7 +138,7 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace inclusive_sum( - workspace_extra, size_scan, + workspace_extra, workspace_len, val_out, voc, (cudaStream_t) stream);//该函数会实现scan功能不断累加结果 random_sample_kernel<<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result, @@ -155,7 +152,6 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace random_sample_kernel<<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result, key_out); } - cudaFree(workspace_extra); } infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc, @@ -172,7 +168,7 @@ infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc, return STATUS_BAD_DEVICE; } if (dtype_eq(desc->dtype, F16)) { - random_sample_nv_gpu_f16(desc, workspace, result, probs, random_val, topp, topk, temperature, stream); + random_sample_nv_gpu_f16(desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); return STATUS_SUCCESS; } diff --git a/src/ops/random_sample/cuda/random_sample.cuh b/src/ops/random_sample/cuda/random_sample.cuh index d3fff76d..235bfa24 100644 --- a/src/ops/random_sample/cuda/random_sample.cuh +++ b/src/ops/random_sample/cuda/random_sample.cuh @@ -18,8 +18,9 @@ typedef struct RandomSampleCudaDescriptor *RandomSampleCudaDescriptor_t; infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs); - -infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size); +void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, + int voc); +infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size); infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc, void *workspace, diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc index 022a113b..15bd69b9 100644 --- a/src/ops/random_sample/cuda/random_sample_cuda.cc +++ b/src/ops/random_sample/cuda/random_sample_cuda.cc @@ -26,8 +26,12 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, return STATUS_SUCCESS; } -infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size) { - *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)); +infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size) { + size_t size_radix_sort; + size_t size_scan; + random_sample_workspace(size_radix_sort, size_scan, + desc->voc); + *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)) + std::max(size_radix_sort, size_scan); return STATUS_SUCCESS; } From d1583b1e428c78c9481bdc0358970e06836cc27a Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Thu, 31 Oct 2024 14:30:18 +0800 Subject: [PATCH 02/10] modified workspace_size --- src/ops/random_sample/cuda/random_sample.cu | 42 ++++++++++++++----- src/ops/random_sample/cuda/random_sample.cuh | 2 +- .../random_sample/cuda/random_sample_cuda.cc | 2 +- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu index 7d00e4a1..81bb43b6 100644 --- a/src/ops/random_sample/cuda/random_sample.cu +++ b/src/ops/random_sample/cuda/random_sample.cu @@ -86,18 +86,38 @@ void inclusive_sum( } void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, - int voc) { + int voc, DT dtype) { + if (dtype_eq(dtype, F16)) { + sort_pairs_descending(nullptr, size_radix_sort, + nullptr, nullptr, + nullptr, nullptr, + voc, nullptr); - - sort_pairs_descending(nullptr, size_radix_sort, - nullptr, nullptr, - nullptr, nullptr, - voc, nullptr); - - inclusive_sum( - nullptr, size_scan, - nullptr, voc, - nullptr); + inclusive_sum( + nullptr, size_scan, + nullptr, voc, + nullptr); + } else if (dtype_eq(dtype, F32)) { + sort_pairs_descending(nullptr, size_radix_sort, + nullptr, nullptr, + nullptr, nullptr, + voc, nullptr); + + inclusive_sum( + nullptr, size_scan, + nullptr, voc, + nullptr); + } else if (dtype_eq(dtype, F64)) { + sort_pairs_descending(nullptr, size_radix_sort, + nullptr, nullptr, + nullptr, nullptr, + voc, nullptr); + + inclusive_sum( + nullptr, size_scan, + nullptr, voc, + nullptr); + } } __global__ void random_sample_kernel(uint64_t *result, uint64_t *key_out) { diff --git a/src/ops/random_sample/cuda/random_sample.cuh b/src/ops/random_sample/cuda/random_sample.cuh index 235bfa24..9d464bcc 100644 --- a/src/ops/random_sample/cuda/random_sample.cuh +++ b/src/ops/random_sample/cuda/random_sample.cuh @@ -19,7 +19,7 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs); void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, - int voc); + int voc, DT dtype); infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size); infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc, diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc index 15bd69b9..dd0b4751 100644 --- a/src/ops/random_sample/cuda/random_sample_cuda.cc +++ b/src/ops/random_sample/cuda/random_sample_cuda.cc @@ -30,7 +30,7 @@ infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t d size_t size_radix_sort; size_t size_scan; random_sample_workspace(size_radix_sort, size_scan, - desc->voc); + desc->voc, desc->dtype); *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)) + std::max(size_radix_sort, size_scan); return STATUS_SUCCESS; } From 71603495d9b73bd419da4b5d4a005f50b5c9aaf1 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Thu, 14 Nov 2024 13:50:52 +0800 Subject: [PATCH 03/10] add step --- src/ops/random_sample/cuda/random_sample.cu | 4 ++-- src/ops/random_sample/cuda/random_sample.cuh | 1 + src/ops/random_sample/cuda/random_sample_cuda.cc | 10 +++++++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu index 81bb43b6..117fbdf8 100644 --- a/src/ops/random_sample/cuda/random_sample.cu +++ b/src/ops/random_sample/cuda/random_sample.cu @@ -142,8 +142,8 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc); //下面开始计算workspace空间 - void *workspace_extra = reinterpret_cast(workspace) + 2 * voc * sizeof(half) + voc * sizeof(uint64_t); - uint64_t workspace_len = workspace_size - 2 * voc * sizeof(half) - voc * sizeof(uint64_t); + void *workspace_extra = reinterpret_cast(workspace) + desc->step; + uint64_t workspace_len = workspace_size - desc->step; sort_pairs_descending( workspace_extra, workspace_len, (half *) probs, val_out, diff --git a/src/ops/random_sample/cuda/random_sample.cuh b/src/ops/random_sample/cuda/random_sample.cuh index 9d464bcc..35581466 100644 --- a/src/ops/random_sample/cuda/random_sample.cuh +++ b/src/ops/random_sample/cuda/random_sample.cuh @@ -11,6 +11,7 @@ struct RandomSampleCudaDescriptor { int voc; DT rDtype; int rLength; + int step; }; typedef struct RandomSampleCudaDescriptor *RandomSampleCudaDescriptor_t; diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc index dd0b4751..6a13e0a2 100644 --- a/src/ops/random_sample/cuda/random_sample_cuda.cc +++ b/src/ops/random_sample/cuda/random_sample_cuda.cc @@ -8,20 +8,24 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, if (probs->ndim != 1) { return STATUS_BAD_TENSOR_SHAPE; } - if (!dtype_eq(result->dt, U64)) + if (!dtype_eq(probs->dt, F16) && !dtype_eq(result->dt, U64)) { return STATUS_BAD_TENSOR_DTYPE; + } + int voc = probs->shape[0]; int rLength = result->shape[0]; if (result->ndim != 1 && rLength != 1) { return STATUS_BAD_TENSOR_SHAPE; } + int step = 2 * voc * sizeof(uint64_t) + voc * sizeof(probs->dt); *desc_ptr = new RandomSampleCudaDescriptor{ handle->device, handle->device_id, probs->dt, voc, result->dt, - rLength}; + rLength, + step}; return STATUS_SUCCESS; } @@ -31,7 +35,7 @@ infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t d size_t size_scan; random_sample_workspace(size_radix_sort, size_scan, desc->voc, desc->dtype); - *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)) + std::max(size_radix_sort, size_scan); + *size = desc->voc * desc->step + std::max(size_radix_sort, size_scan); return STATUS_SUCCESS; } From 2563904acfb66edb10b1af5fbd409fd85854d23e Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Fri, 15 Nov 2024 10:07:48 +0800 Subject: [PATCH 04/10] modified cpu and cuda argmax --- src/ops/random_sample/cpu/random_sample.cc | 11 +--- src/ops/random_sample/cuda/random_sample.cu | 64 +++++++++++++++------ 2 files changed, 48 insertions(+), 27 deletions(-) diff --git a/src/ops/random_sample/cpu/random_sample.cc b/src/ops/random_sample/cpu/random_sample.cc index 28de5b93..4a58de8f 100644 --- a/src/ops/random_sample/cpu/random_sample.cc +++ b/src/ops/random_sample/cpu/random_sample.cc @@ -135,16 +135,11 @@ void random_sample_cpu_f16(RandomSampleCpuDescriptor_t desc, auto index_ = reinterpret_cast(result); auto source = reinterpret_cast(probs); - char *origin = reinterpret_cast(workspace); - uint16_t *logits_ = (uint16_t *) origin; - - std::copy(source, source + voc, logits_); - - float M = f16_to_f32(logits_[0]); + float M = f16_to_f32(source[0]); int index = 0; for (int j = 1; j < voc; j++) { - if (M < f16_to_f32(logits_[j])) { - M = f16_to_f32(logits_[j]); + if (M < f16_to_f32(source[j])) { + M = f16_to_f32(source[j]); index = j; } } diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu index 117fbdf8..ffcde6e3 100644 --- a/src/ops/random_sample/cuda/random_sample.cu +++ b/src/ops/random_sample/cuda/random_sample.cu @@ -3,7 +3,31 @@ #include "random_sample.cuh" #include #include - +template +__global__ void argmaxKernel(T *val_out, int voc, uint64_t *result) { + float localM = -__FLT_MAX__; + uint64_t index = threadIdx.x; + for (int i = threadIdx.x; i < voc; i += BLOCK_DIM) { + if (localM < static_cast(val_out[i])) { + localM = static_cast(val_out[i]); + index = i; + } + } + __shared__ uint64_t globalInd[BLOCK_DIM]; + __shared__ float globalM[BLOCK_DIM]; + globalInd[threadIdx.x] = index; + globalM[threadIdx.x] = localM; + for (int strip = BLOCK_DIM / 2; strip > 0; strip /= 2) { + if (threadIdx.x < strip) { + if (globalM[threadIdx.x] < globalM[threadIdx.x + strip]) { + globalM[threadIdx.x] = globalM[threadIdx.x + strip]; + globalInd[threadIdx.x] = globalInd[threadIdx.x + strip]; + } + } + __syncthreads(); + } + result[0] = globalInd[0]; +} template __global__ void softmax( T *val_out, @@ -132,25 +156,26 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace void *stream) { int voc = desc->voc; //下面这段代码在排序 - char *origin = reinterpret_cast(workspace); - char *keyTmp = origin + voc * sizeof(half); - half *val_out = (half *) origin; - uint64_t *key_in = (uint64_t *) keyTmp; - uint64_t *key_out = key_in + voc; + if (topp > 0 && topk > 1) { + char *origin = reinterpret_cast(workspace); + char *keyTmp = origin + voc * sizeof(half); + half *val_out = (half *) origin; - index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc); - //下面开始计算workspace空间 + uint64_t *key_in = (uint64_t *) keyTmp; + uint64_t *key_out = key_in + voc; - void *workspace_extra = reinterpret_cast(workspace) + desc->step; - uint64_t workspace_len = workspace_size - desc->step; - sort_pairs_descending( - workspace_extra, workspace_len, - (half *) probs, val_out, - key_in, key_out, - voc, (cudaStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上 - //排序结束,然后开始做softmax变换 - if (topp > 0 && topk > 1) { + index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc); + //下面开始计算workspace空间 + + void *workspace_extra = reinterpret_cast(workspace) + desc->step; + uint64_t workspace_len = workspace_size - desc->step; + sort_pairs_descending( + workspace_extra, workspace_len, + (half *) probs, val_out, + key_in, key_out, + voc, (cudaStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上 + //排序结束,然后开始做softmax变换 int BLOCK_DIM = 1024; int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM; softmax<<>>(val_out, topk, @@ -169,8 +194,9 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace key_out); } else { - random_sample_kernel<<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result, - key_out); + int BLOCK_DIM = 1024; + int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM; + argmaxKernel<<>>((half *) probs, voc, (uint64_t *) result); } } From c65cad300f92cd98f446bff0df9c13d1147a89eb Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Wed, 20 Nov 2024 10:42:37 +0800 Subject: [PATCH 05/10] modified workspace --- src/ops/random_sample/cuda/random_sample_cuda.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc index 6a13e0a2..df995d54 100644 --- a/src/ops/random_sample/cuda/random_sample_cuda.cc +++ b/src/ops/random_sample/cuda/random_sample_cuda.cc @@ -35,7 +35,7 @@ infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t d size_t size_scan; random_sample_workspace(size_radix_sort, size_scan, desc->voc, desc->dtype); - *size = desc->voc * desc->step + std::max(size_radix_sort, size_scan); + *size = desc->step + std::max(size_radix_sort, size_scan); return STATUS_SUCCESS; } From eca6ff802b3a0559f9ed3af86445253185dd605f Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Wed, 20 Nov 2024 14:09:26 +0800 Subject: [PATCH 06/10] add checkout dtype --- src/ops/random_sample/cuda/random_sample.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu index ffcde6e3..1793e507 100644 --- a/src/ops/random_sample/cuda/random_sample.cu +++ b/src/ops/random_sample/cuda/random_sample.cu @@ -141,6 +141,8 @@ void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, nullptr, size_scan, nullptr, voc, nullptr); + } else { + throw std::invalid_argument("Unsupported dtype provided."); } } __global__ void random_sample_kernel(uint64_t *result, From 54b2a2f51f7ba4cb273aa2c9062df8049fa39707 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Fri, 22 Nov 2024 13:59:19 +0800 Subject: [PATCH 07/10] modified workspace return --- src/ops/random_sample/cuda/random_sample.cu | 9 ++++++--- src/ops/random_sample/cuda/random_sample.cuh | 4 ++-- src/ops/random_sample/cuda/random_sample_cuda.cc | 7 +++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu index 1793e507..014dea35 100644 --- a/src/ops/random_sample/cuda/random_sample.cu +++ b/src/ops/random_sample/cuda/random_sample.cu @@ -109,8 +109,8 @@ void inclusive_sum( stream); } -void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, - int voc, DT dtype) { +infiniopStatus_t random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, + int voc, DT dtype) { if (dtype_eq(dtype, F16)) { sort_pairs_descending(nullptr, size_radix_sort, nullptr, nullptr, @@ -121,6 +121,7 @@ void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, nullptr, size_scan, nullptr, voc, nullptr); + return STATUS_SUCCESS; } else if (dtype_eq(dtype, F32)) { sort_pairs_descending(nullptr, size_radix_sort, nullptr, nullptr, @@ -131,6 +132,7 @@ void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, nullptr, size_scan, nullptr, voc, nullptr); + return STATUS_SUCCESS; } else if (dtype_eq(dtype, F64)) { sort_pairs_descending(nullptr, size_radix_sort, nullptr, nullptr, @@ -141,8 +143,9 @@ void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, nullptr, size_scan, nullptr, voc, nullptr); + return STATUS_SUCCESS; } else { - throw std::invalid_argument("Unsupported dtype provided."); + return STATUS_BAD_TENSOR_DTYPE; } } __global__ void random_sample_kernel(uint64_t *result, diff --git a/src/ops/random_sample/cuda/random_sample.cuh b/src/ops/random_sample/cuda/random_sample.cuh index 35581466..d99b034a 100644 --- a/src/ops/random_sample/cuda/random_sample.cuh +++ b/src/ops/random_sample/cuda/random_sample.cuh @@ -19,8 +19,8 @@ typedef struct RandomSampleCudaDescriptor *RandomSampleCudaDescriptor_t; infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs); -void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, - int voc, DT dtype); +infiniopStatus_t random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, + int voc, DT dtype); infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size); infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc, diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc index df995d54..283808ba 100644 --- a/src/ops/random_sample/cuda/random_sample_cuda.cc +++ b/src/ops/random_sample/cuda/random_sample_cuda.cc @@ -33,8 +33,11 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size) { size_t size_radix_sort; size_t size_scan; - random_sample_workspace(size_radix_sort, size_scan, - desc->voc, desc->dtype); + infiniopStatus_t status = random_sample_workspace(size_radix_sort, size_scan, + desc->voc, desc->dtype); + if (status != STATUS_SUCCESS) { + return status; + } *size = desc->step + std::max(size_radix_sort, size_scan); return STATUS_SUCCESS; } From f8f62141cf8506f1bdc68728facb36c00a1fb0e2 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Tue, 3 Dec 2024 11:21:08 +0800 Subject: [PATCH 08/10] modified random sample softmax --- src/ops/random_sample/cuda/random_sample.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu index 014dea35..b0e8d2e2 100644 --- a/src/ops/random_sample/cuda/random_sample.cu +++ b/src/ops/random_sample/cuda/random_sample.cu @@ -34,7 +34,7 @@ __global__ void softmax( int topk, float temperature, int voc) { float sum_s = 0.0f; - for (int i = threadIdx.x; i < topk; i += BLOCK_DIM) { + for (int i = threadIdx.x; i < voc; i += BLOCK_DIM) { sum_s += __expf(static_cast(val_out[i] - val_out[0]) / temperature); } __shared__ float sum_inverse_total; From 80689708d3044efb78745b37e8694eab1f20f0da Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Tue, 10 Dec 2024 10:50:48 +0800 Subject: [PATCH 09/10] modified uint64 bang --- operatorspy/tests/random_sample.py | 11 +++++++---- src/ops/utils.h | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py index ea680c57..e0735ca3 100644 --- a/operatorspy/tests/random_sample.py +++ b/operatorspy/tests/random_sample.py @@ -30,7 +30,10 @@ class RandomSampleDescriptor(Structure): def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): - indices = torch.zeros([topk], dtype = torch.int64) + if(torch_device == "cuda"): + indices = torch.zeros([topk], dtype = torch.uint64) + else: + indices = torch.zeros([topk], dtype = torch.int64) dataNp = data.clone().detach() sorted_indices = torch.arange(voc) @@ -52,7 +55,7 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): globalM = dataNp[0] dataNp = (dataNp - globalM) / temperature - dataNp = torch.softmax(dataNp.float(), dim = 0) + dataNp = torch.softmax(dataNp, dim = 0) sum_s = 0 for end in range(topk): sum_s += dataNp[end] @@ -88,7 +91,7 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_ ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu") else: ans = random_sample_0(data) - if(torch_device == 'mlu' or torch_device == 'npu'): + if(torch_device != "cuda"): indices = torch.zeros([1], dtype = torch.int64).to(torch_device) else: @@ -96,7 +99,7 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_ indices = torch.zeros([1], dtype = torch.uint64).to(torch_device) x_tensor = to_tensor(data, lib) indices_tensor = to_tensor(indices, lib) - if(torch_device == 'mlu' or torch_device == 'npu'): + if(torch_device == 'mlu'): indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64 diff --git a/src/ops/utils.h b/src/ops/utils.h index b48cf419..8e0286a1 100644 --- a/src/ops/utils.h +++ b/src/ops/utils.h @@ -224,7 +224,7 @@ inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, uin // split the dimension dim of a tensor descriptor into multiple dimensions inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, uint64_t dim, const std::vector &dims) { uint64_t ndim = desc->ndim; - if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (uint64_t)1, std::multiplies{})) { + if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (uint64_t) 1, std::multiplies{})) { return nullptr; } uint64_t new_ndim = ndim + dims.size() - 1; From c4c33549cbdfb831126592b55032cf67d3ee8c87 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Wed, 18 Dec 2024 15:05:07 +0800 Subject: [PATCH 10/10] modified cpu --- operatorspy/tests/random_sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py index e0735ca3..d09f293f 100644 --- a/operatorspy/tests/random_sample.py +++ b/operatorspy/tests/random_sample.py @@ -99,7 +99,7 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_ indices = torch.zeros([1], dtype = torch.uint64).to(torch_device) x_tensor = to_tensor(data, lib) indices_tensor = to_tensor(indices, lib) - if(torch_device == 'mlu'): + if(torch_device != 'cuda'): indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64