From f6375158b9a1f2d3b0f2373414a49e18dafaec8e Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Thu, 31 Oct 2024 14:18:18 +0800
Subject: [PATCH 01/10] random_sample_workspace

---
 src/ops/random_sample/cuda/random_sample.cu   | 34 ++++++++-----------
 src/ops/random_sample/cuda/random_sample.cuh  |  5 +--
 .../random_sample/cuda/random_sample_cuda.cc  |  8 +++--
 3 files changed, 24 insertions(+), 23 deletions(-)
diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu
index 40761e89..7d00e4a1 100644
--- a/src/ops/random_sample/cuda/random_sample.cu
+++ b/src/ops/random_sample/cuda/random_sample.cu
@@ -84,26 +84,26 @@ void inclusive_sum(
         data, data, voc,
         stream);
 }
-template<class T, class I>
+
 void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
-                             int voc, cudaStream_t stream) {
+                             int voc) {
 
 
-    sort_pairs_descending<T, I>(nullptr, size_radix_sort,
-                                nullptr, nullptr,
-                                nullptr, nullptr,
-                                voc, stream);
+    sort_pairs_descending<half, uint64_t>(nullptr, size_radix_sort,
+                                          nullptr, nullptr,
+                                          nullptr, nullptr,
+                                          voc, nullptr);
 
-    inclusive_sum<T>(
+    inclusive_sum<half>(
         nullptr, size_scan,
         nullptr, voc,
-        stream);
+        nullptr);
 }
 __global__ void random_sample_kernel(uint64_t *result,
                                      uint64_t *key_out) {
     result[0] = key_out[0];
 }
-void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace, void *result,
+void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace, uint64_t workspace_size, void *result,
                               void const *probs,
                               float random_val,
                               float topp,
@@ -121,14 +121,11 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace
 
     index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc);
     //下面开始计算workspace空间
-    size_t size_radix_sort;
-    size_t size_scan;
-    random_sample_workspace<half, uint64_t>(size_radix_sort, size_scan,
-                                            voc, (cudaStream_t) stream);
-    void *workspace_extra;
-    cudaMalloc(&workspace_extra, size_radix_sort + size_scan);
+
+    void *workspace_extra = reinterpret_cast<char *>(workspace) + 2 * voc * sizeof(half) + voc * sizeof(uint64_t);
+    uint64_t workspace_len = workspace_size - 2 * voc * sizeof(half) - voc * sizeof(uint64_t);
     sort_pairs_descending<half, uint64_t>(
-        workspace_extra, size_radix_sort,
+        workspace_extra, workspace_len,
         (half *) probs, val_out,
         key_in, key_out,
         voc, (cudaStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上
@@ -141,7 +138,7 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace
 
 
         inclusive_sum<half>(
-            workspace_extra, size_scan,
+            workspace_extra, workspace_len,
             val_out, voc,
             (cudaStream_t) stream);//该函数会实现scan功能不断累加结果
         random_sample_kernel<half><<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result,
@@ -155,7 +152,6 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace
         random_sample_kernel<<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result,
                                                                  key_out);
     }
-    cudaFree(workspace_extra);
 }
 
 infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc,
@@ -172,7 +168,7 @@ infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc,
         return STATUS_BAD_DEVICE;
     }
     if (dtype_eq(desc->dtype, F16)) {
-        random_sample_nv_gpu_f16(desc, workspace, result, probs, random_val, topp, topk, temperature, stream);
+        random_sample_nv_gpu_f16(desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
         return STATUS_SUCCESS;
     }
 
diff --git a/src/ops/random_sample/cuda/random_sample.cuh b/src/ops/random_sample/cuda/random_sample.cuh
index d3fff76d..235bfa24 100644
--- a/src/ops/random_sample/cuda/random_sample.cuh
+++ b/src/ops/random_sample/cuda/random_sample.cuh
@@ -18,8 +18,9 @@ typedef struct RandomSampleCudaDescriptor *RandomSampleCudaDescriptor_t;
 infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle,
                                                   RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result,
                                                   infiniopTensorDescriptor_t probs);
-
-infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size);
+void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
+                             int voc);
+infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size);
 
 infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc,
                                   void *workspace,
diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc
index 022a113b..15bd69b9 100644
--- a/src/ops/random_sample/cuda/random_sample_cuda.cc
+++ b/src/ops/random_sample/cuda/random_sample_cuda.cc
@@ -26,8 +26,12 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle,
     return STATUS_SUCCESS;
 }
 
-infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size) {
-    *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype));
+infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size) {
+    size_t size_radix_sort;
+    size_t size_scan;
+    random_sample_workspace(size_radix_sort, size_scan,
+                            desc->voc);
+    *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)) + std::max(size_radix_sort, size_scan);
     return STATUS_SUCCESS;
 }
 

From d1583b1e428c78c9481bdc0358970e06836cc27a Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Thu, 31 Oct 2024 14:30:18 +0800
Subject: [PATCH 02/10] modified workspace_size

---
 src/ops/random_sample/cuda/random_sample.cu   | 42 ++++++++++++++-----
 src/ops/random_sample/cuda/random_sample.cuh  |  2 +-
 .../random_sample/cuda/random_sample_cuda.cc  |  2 +-
 3 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu
index 7d00e4a1..81bb43b6 100644
--- a/src/ops/random_sample/cuda/random_sample.cu
+++ b/src/ops/random_sample/cuda/random_sample.cu
@@ -86,18 +86,38 @@ void inclusive_sum(
 }
 
 void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
-                             int voc) {
+                             int voc, DT dtype) {
+    if (dtype_eq(dtype, F16)) {
+        sort_pairs_descending<half, uint64_t>(nullptr, size_radix_sort,
+                                              nullptr, nullptr,
+                                              nullptr, nullptr,
+                                              voc, nullptr);
 
-
-    sort_pairs_descending<half, uint64_t>(nullptr, size_radix_sort,
-                                          nullptr, nullptr,
-                                          nullptr, nullptr,
-                                          voc, nullptr);
-
-    inclusive_sum<half>(
-        nullptr, size_scan,
-        nullptr, voc,
-        nullptr);
+        inclusive_sum<half>(
+            nullptr, size_scan,
+            nullptr, voc,
+            nullptr);
+    } else if (dtype_eq(dtype, F32)) {
+        sort_pairs_descending<float, uint64_t>(nullptr, size_radix_sort,
+                                               nullptr, nullptr,
+                                               nullptr, nullptr,
+                                               voc, nullptr);
+
+        inclusive_sum<float>(
+            nullptr, size_scan,
+            nullptr, voc,
+            nullptr);
+    } else if (dtype_eq(dtype, F64)) {
+        sort_pairs_descending<double, uint64_t>(nullptr, size_radix_sort,
+                                                nullptr, nullptr,
+                                                nullptr, nullptr,
+                                                voc, nullptr);
+
+        inclusive_sum<double>(
+            nullptr, size_scan,
+            nullptr, voc,
+            nullptr);
+    }
 }
 __global__ void random_sample_kernel(uint64_t *result,
                                      uint64_t *key_out) {
diff --git a/src/ops/random_sample/cuda/random_sample.cuh b/src/ops/random_sample/cuda/random_sample.cuh
index 235bfa24..9d464bcc 100644
--- a/src/ops/random_sample/cuda/random_sample.cuh
+++ b/src/ops/random_sample/cuda/random_sample.cuh
@@ -19,7 +19,7 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle,
                                                   RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result,
                                                   infiniopTensorDescriptor_t probs);
 void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
-                             int voc);
+                             int voc, DT dtype);
 infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size);
 
 infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc,
diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc
index 15bd69b9..dd0b4751 100644
--- a/src/ops/random_sample/cuda/random_sample_cuda.cc
+++ b/src/ops/random_sample/cuda/random_sample_cuda.cc
@@ -30,7 +30,7 @@ infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t d
     size_t size_radix_sort;
     size_t size_scan;
     random_sample_workspace(size_radix_sort, size_scan,
-                            desc->voc);
+                            desc->voc, desc->dtype);
     *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)) + std::max(size_radix_sort, size_scan);
     return STATUS_SUCCESS;
 }

From 71603495d9b73bd419da4b5d4a005f50b5c9aaf1 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Thu, 14 Nov 2024 13:50:52 +0800
Subject: [PATCH 03/10] add step

---
 src/ops/random_sample/cuda/random_sample.cu      |  4 ++--
 src/ops/random_sample/cuda/random_sample.cuh     |  1 +
 src/ops/random_sample/cuda/random_sample_cuda.cc | 10 +++++++---
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu
index 81bb43b6..117fbdf8 100644
--- a/src/ops/random_sample/cuda/random_sample.cu
+++ b/src/ops/random_sample/cuda/random_sample.cu
@@ -142,8 +142,8 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace
     index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc);
     //下面开始计算workspace空间
 
-    void *workspace_extra = reinterpret_cast<char *>(workspace) + 2 * voc * sizeof(half) + voc * sizeof(uint64_t);
-    uint64_t workspace_len = workspace_size - 2 * voc * sizeof(half) - voc * sizeof(uint64_t);
+    void *workspace_extra = reinterpret_cast<char *>(workspace) + desc->step;
+    uint64_t workspace_len = workspace_size - desc->step;
     sort_pairs_descending<half, uint64_t>(
         workspace_extra, workspace_len,
         (half *) probs, val_out,
diff --git a/src/ops/random_sample/cuda/random_sample.cuh b/src/ops/random_sample/cuda/random_sample.cuh
index 9d464bcc..35581466 100644
--- a/src/ops/random_sample/cuda/random_sample.cuh
+++ b/src/ops/random_sample/cuda/random_sample.cuh
@@ -11,6 +11,7 @@ struct RandomSampleCudaDescriptor {
     int voc;
     DT rDtype;
     int rLength;
+    int step;
 };
 
 typedef struct RandomSampleCudaDescriptor *RandomSampleCudaDescriptor_t;
diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc
index dd0b4751..6a13e0a2 100644
--- a/src/ops/random_sample/cuda/random_sample_cuda.cc
+++ b/src/ops/random_sample/cuda/random_sample_cuda.cc
@@ -8,20 +8,24 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle,
     if (probs->ndim != 1) {
         return STATUS_BAD_TENSOR_SHAPE;
     }
-    if (!dtype_eq(result->dt, U64))
+    if (!dtype_eq(probs->dt, F16) && !dtype_eq(result->dt, U64)) {
         return STATUS_BAD_TENSOR_DTYPE;
+    }
+
     int voc = probs->shape[0];
     int rLength = result->shape[0];
     if (result->ndim != 1 && rLength != 1) {
         return STATUS_BAD_TENSOR_SHAPE;
     }
+    int step = 2 * voc * sizeof(uint64_t) + voc * sizeof(probs->dt);
     *desc_ptr = new RandomSampleCudaDescriptor{
         handle->device,
         handle->device_id,
         probs->dt,
         voc,
         result->dt,
-        rLength};
+        rLength,
+        step};
 
     return STATUS_SUCCESS;
 }
@@ -31,7 +35,7 @@ infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t d
     size_t size_scan;
     random_sample_workspace(size_radix_sort, size_scan,
                             desc->voc, desc->dtype);
-    *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)) + std::max(size_radix_sort, size_scan);
+    *size = desc->voc * desc->step + std::max(size_radix_sort, size_scan);
     return STATUS_SUCCESS;
 }
 

From 2563904acfb66edb10b1af5fbd409fd85854d23e Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Fri, 15 Nov 2024 10:07:48 +0800
Subject: [PATCH 04/10] modified cpu and cuda argmax

---
 src/ops/random_sample/cpu/random_sample.cc  | 11 +---
 src/ops/random_sample/cuda/random_sample.cu | 64 +++++++++++++++------
 2 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/src/ops/random_sample/cpu/random_sample.cc b/src/ops/random_sample/cpu/random_sample.cc
index 28de5b93..4a58de8f 100644
--- a/src/ops/random_sample/cpu/random_sample.cc
+++ b/src/ops/random_sample/cpu/random_sample.cc
@@ -135,16 +135,11 @@ void random_sample_cpu_f16(RandomSampleCpuDescriptor_t desc,
     auto index_ = reinterpret_cast<uint64_t *>(result);
     auto source = reinterpret_cast<const uint16_t *>(probs);
 
-    char *origin = reinterpret_cast<char *>(workspace);
-    uint16_t *logits_ = (uint16_t *) origin;
-
-    std::copy(source, source + voc, logits_);
-
-    float M = f16_to_f32(logits_[0]);
+    float M = f16_to_f32(source[0]);
     int index = 0;
     for (int j = 1; j < voc; j++) {
-        if (M < f16_to_f32(logits_[j])) {
-            M = f16_to_f32(logits_[j]);
+        if (M < f16_to_f32(source[j])) {
+            M = f16_to_f32(source[j]);
             index = j;
         }
     }
diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu
index 117fbdf8..ffcde6e3 100644
--- a/src/ops/random_sample/cuda/random_sample.cu
+++ b/src/ops/random_sample/cuda/random_sample.cu
@@ -3,7 +3,31 @@
 #include "random_sample.cuh"
 #include <cub/block/block_reduce.cuh>
 #include <cub/cub.cuh>
-
+template<class T, int BLOCK_DIM>
+__global__ void argmaxKernel(T *val_out, int voc, uint64_t *result) {
+    float localM = -__FLT_MAX__;
+    uint64_t index = threadIdx.x;
+    for (int i = threadIdx.x; i < voc; i += BLOCK_DIM) {
+        if (localM < static_cast<float>(val_out[i])) {
+            localM = static_cast<float>(val_out[i]);
+            index = i;
+        }
+    }
+    __shared__ uint64_t globalInd[BLOCK_DIM];
+    __shared__ float globalM[BLOCK_DIM];
+    globalInd[threadIdx.x] = index;
+    globalM[threadIdx.x] = localM;
+    for (int strip = BLOCK_DIM / 2; strip > 0; strip /= 2) {
+        if (threadIdx.x < strip) {
+            if (globalM[threadIdx.x] < globalM[threadIdx.x + strip]) {
+                globalM[threadIdx.x] = globalM[threadIdx.x + strip];
+                globalInd[threadIdx.x] = globalInd[threadIdx.x + strip];
+            }
+        }
+        __syncthreads();
+    }
+    result[0] = globalInd[0];
+}
 template<class T, int BLOCK_DIM>
 __global__ void softmax(
     T *val_out,
@@ -132,25 +156,26 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace
                               void *stream) {
     int voc = desc->voc;
     //下面这段代码在排序
-    char *origin = reinterpret_cast<char *>(workspace);
-    char *keyTmp = origin + voc * sizeof(half);
-    half *val_out = (half *) origin;
 
-    uint64_t *key_in = (uint64_t *) keyTmp;
-    uint64_t *key_out = key_in + voc;
+    if (topp > 0 && topk > 1) {
+        char *origin = reinterpret_cast<char *>(workspace);
+        char *keyTmp = origin + voc * sizeof(half);
+        half *val_out = (half *) origin;
 
-    index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc);
-    //下面开始计算workspace空间
+        uint64_t *key_in = (uint64_t *) keyTmp;
+        uint64_t *key_out = key_in + voc;
 
-    void *workspace_extra = reinterpret_cast<char *>(workspace) + desc->step;
-    uint64_t workspace_len = workspace_size - desc->step;
-    sort_pairs_descending<half, uint64_t>(
-        workspace_extra, workspace_len,
-        (half *) probs, val_out,
-        key_in, key_out,
-        voc, (cudaStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上
-    //排序结束，然后开始做softmax变换
-    if (topp > 0 && topk > 1) {
+        index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc);
+        //下面开始计算workspace空间
+
+        void *workspace_extra = reinterpret_cast<char *>(workspace) + desc->step;
+        uint64_t workspace_len = workspace_size - desc->step;
+        sort_pairs_descending<half, uint64_t>(
+            workspace_extra, workspace_len,
+            (half *) probs, val_out,
+            key_in, key_out,
+            voc, (cudaStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上
+                                        //排序结束，然后开始做softmax变换
         int BLOCK_DIM = 1024;
         int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM;
         softmax<half, 1024><<<num_blocks, BLOCK_DIM, 0, (cudaStream_t) stream>>>(val_out, topk,
@@ -169,8 +194,9 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace
                                                                        key_out);
 
     } else {
-        random_sample_kernel<<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result,
-                                                                 key_out);
+        int BLOCK_DIM = 1024;
+        int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM;
+        argmaxKernel<half, 1024><<<num_blocks, BLOCK_DIM, 0, (cudaStream_t) stream>>>((half *) probs, voc, (uint64_t *) result);
     }
 }
 

From c65cad300f92cd98f446bff0df9c13d1147a89eb Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Wed, 20 Nov 2024 10:42:37 +0800
Subject: [PATCH 05/10] modified workspace

---
 src/ops/random_sample/cuda/random_sample_cuda.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc
index 6a13e0a2..df995d54 100644
--- a/src/ops/random_sample/cuda/random_sample_cuda.cc
+++ b/src/ops/random_sample/cuda/random_sample_cuda.cc
@@ -35,7 +35,7 @@ infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t d
     size_t size_scan;
     random_sample_workspace(size_radix_sort, size_scan,
                             desc->voc, desc->dtype);
-    *size = desc->voc * desc->step + std::max(size_radix_sort, size_scan);
+    *size = desc->step + std::max(size_radix_sort, size_scan);
     return STATUS_SUCCESS;
 }
 

From eca6ff802b3a0559f9ed3af86445253185dd605f Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Wed, 20 Nov 2024 14:09:26 +0800
Subject: [PATCH 06/10] add checkout dtype

---
 src/ops/random_sample/cuda/random_sample.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu
index ffcde6e3..1793e507 100644
--- a/src/ops/random_sample/cuda/random_sample.cu
+++ b/src/ops/random_sample/cuda/random_sample.cu
@@ -141,6 +141,8 @@ void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
             nullptr, size_scan,
             nullptr, voc,
             nullptr);
+    } else {
+        throw std::invalid_argument("Unsupported dtype provided.");
     }
 }
 __global__ void random_sample_kernel(uint64_t *result,

From 54b2a2f51f7ba4cb273aa2c9062df8049fa39707 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Fri, 22 Nov 2024 13:59:19 +0800
Subject: [PATCH 07/10] modified workspace return

---
 src/ops/random_sample/cuda/random_sample.cu      | 9 ++++++---
 src/ops/random_sample/cuda/random_sample.cuh     | 4 ++--
 src/ops/random_sample/cuda/random_sample_cuda.cc | 7 +++++--
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu
index 1793e507..014dea35 100644
--- a/src/ops/random_sample/cuda/random_sample.cu
+++ b/src/ops/random_sample/cuda/random_sample.cu
@@ -109,8 +109,8 @@ void inclusive_sum(
         stream);
 }
 
-void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
-                             int voc, DT dtype) {
+infiniopStatus_t random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
+                                         int voc, DT dtype) {
     if (dtype_eq(dtype, F16)) {
         sort_pairs_descending<half, uint64_t>(nullptr, size_radix_sort,
                                               nullptr, nullptr,
@@ -121,6 +121,7 @@ void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
             nullptr, size_scan,
             nullptr, voc,
             nullptr);
+        return STATUS_SUCCESS;
     } else if (dtype_eq(dtype, F32)) {
         sort_pairs_descending<float, uint64_t>(nullptr, size_radix_sort,
                                                nullptr, nullptr,
@@ -131,6 +132,7 @@ void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
             nullptr, size_scan,
             nullptr, voc,
             nullptr);
+        return STATUS_SUCCESS;
     } else if (dtype_eq(dtype, F64)) {
         sort_pairs_descending<double, uint64_t>(nullptr, size_radix_sort,
                                                 nullptr, nullptr,
@@ -141,8 +143,9 @@ void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
             nullptr, size_scan,
             nullptr, voc,
             nullptr);
+        return STATUS_SUCCESS;
     } else {
-        throw std::invalid_argument("Unsupported dtype provided.");
+        return STATUS_BAD_TENSOR_DTYPE;
     }
 }
 __global__ void random_sample_kernel(uint64_t *result,
diff --git a/src/ops/random_sample/cuda/random_sample.cuh b/src/ops/random_sample/cuda/random_sample.cuh
index 35581466..d99b034a 100644
--- a/src/ops/random_sample/cuda/random_sample.cuh
+++ b/src/ops/random_sample/cuda/random_sample.cuh
@@ -19,8 +19,8 @@ typedef struct RandomSampleCudaDescriptor *RandomSampleCudaDescriptor_t;
 infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle,
                                                   RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result,
                                                   infiniopTensorDescriptor_t probs);
-void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
-                             int voc, DT dtype);
+infiniopStatus_t random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
+                                         int voc, DT dtype);
 infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size);
 
 infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc,
diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc
index df995d54..283808ba 100644
--- a/src/ops/random_sample/cuda/random_sample_cuda.cc
+++ b/src/ops/random_sample/cuda/random_sample_cuda.cc
@@ -33,8 +33,11 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle,
 infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size) {
     size_t size_radix_sort;
     size_t size_scan;
-    random_sample_workspace(size_radix_sort, size_scan,
-                            desc->voc, desc->dtype);
+    infiniopStatus_t status = random_sample_workspace(size_radix_sort, size_scan,
+                                                      desc->voc, desc->dtype);
+    if (status != STATUS_SUCCESS) {
+        return status;
+    }
     *size = desc->step + std::max(size_radix_sort, size_scan);
     return STATUS_SUCCESS;
 }

From f8f62141cf8506f1bdc68728facb36c00a1fb0e2 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Tue, 3 Dec 2024 11:21:08 +0800
Subject: [PATCH 08/10] modified random sample softmax

---
 src/ops/random_sample/cuda/random_sample.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu
index 014dea35..b0e8d2e2 100644
--- a/src/ops/random_sample/cuda/random_sample.cu
+++ b/src/ops/random_sample/cuda/random_sample.cu
@@ -34,7 +34,7 @@ __global__ void softmax(
     int topk,
     float temperature, int voc) {
     float sum_s = 0.0f;
-    for (int i = threadIdx.x; i < topk; i += BLOCK_DIM) {
+    for (int i = threadIdx.x; i < voc; i += BLOCK_DIM) {
         sum_s += __expf(static_cast<float>(val_out[i] - val_out[0]) / temperature);
     }
     __shared__ float sum_inverse_total;

From 80689708d3044efb78745b37e8694eab1f20f0da Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Tue, 10 Dec 2024 10:50:48 +0800
Subject: [PATCH 09/10] modified uint64 bang

---
 operatorspy/tests/random_sample.py | 11 +++++++----
 src/ops/utils.h                    |  2 +-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py
index ea680c57..e0735ca3 100644
--- a/operatorspy/tests/random_sample.py
+++ b/operatorspy/tests/random_sample.py
@@ -30,7 +30,10 @@ class RandomSampleDescriptor(Structure):
 
 
 def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
-    indices = torch.zeros([topk], dtype = torch.int64)
+    if(torch_device == "cuda"):
+        indices = torch.zeros([topk], dtype = torch.uint64)
+    else:
+        indices = torch.zeros([topk], dtype = torch.int64)
     dataNp = data.clone().detach()
     sorted_indices = torch.arange(voc)
     
@@ -52,7 +55,7 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
     
     globalM = dataNp[0]
     dataNp = (dataNp - globalM) / temperature
-    dataNp = torch.softmax(dataNp.float(), dim = 0)
+    dataNp = torch.softmax(dataNp, dim = 0)
     sum_s = 0
     for end in range(topk):
         sum_s += dataNp[end]
@@ -88,7 +91,7 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
         ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu")
     else:
         ans = random_sample_0(data)
-    if(torch_device == 'mlu' or torch_device == 'npu'):
+    if(torch_device != "cuda"):
         
         indices = torch.zeros([1], dtype = torch.int64).to(torch_device)
     else:
@@ -96,7 +99,7 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
         indices = torch.zeros([1], dtype = torch.uint64).to(torch_device)
     x_tensor = to_tensor(data, lib)
     indices_tensor = to_tensor(indices, lib)
-    if(torch_device == 'mlu' or torch_device == 'npu'):
+    if(torch_device == 'mlu'):
         indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64
     
     
diff --git a/src/ops/utils.h b/src/ops/utils.h
index b48cf419..8e0286a1 100644
--- a/src/ops/utils.h
+++ b/src/ops/utils.h
@@ -224,7 +224,7 @@ inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, uin
 // split the dimension dim of a tensor descriptor into multiple dimensions
 inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, uint64_t dim, const std::vector<uint64_t> &dims) {
     uint64_t ndim = desc->ndim;
-    if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (uint64_t)1, std::multiplies{})) {
+    if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (uint64_t) 1, std::multiplies{})) {
         return nullptr;
     }
     uint64_t new_ndim = ndim + dims.size() - 1;

From c4c33549cbdfb831126592b55032cf67d3ee8c87 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Wed, 18 Dec 2024 15:05:07 +0800
Subject: [PATCH 10/10] modified cpu

---
 operatorspy/tests/random_sample.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py
index e0735ca3..d09f293f 100644
--- a/operatorspy/tests/random_sample.py
+++ b/operatorspy/tests/random_sample.py
@@ -99,7 +99,7 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
         indices = torch.zeros([1], dtype = torch.uint64).to(torch_device)
     x_tensor = to_tensor(data, lib)
     indices_tensor = to_tensor(indices, lib)
-    if(torch_device == 'mlu'):
+    if(torch_device != 'cuda'):
         indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64