InfiniTensor
diff --git a/‎include/ops/reducemax/reducemax.h‎
Lines changed: 1 addition & 1 deletion b/‎include/ops/reducemax/reducemax.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/ops/reducemean/reducemean.h‎
Lines changed: 1 addition & 1 deletion b/‎include/ops/reducemean/reducemean.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/ops/reducemin/reducemin.h‎
Lines changed: 1 addition & 1 deletion b/‎include/ops/reducemin/reducemin.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎operatorspy/tests/reducemax.py‎
Lines changed: 31 additions & 19 deletions b/‎operatorspy/tests/reducemax.py‎
Lines changed: 31 additions & 19 deletions
diff --git a/‎operatorspy/tests/reducemean.py‎
Lines changed: 32 additions & 18 deletions b/‎operatorspy/tests/reducemean.py‎
Lines changed: 32 additions & 18 deletions
diff --git a/‎src/ops/reduce/cpu/reduce_cpu.cc‎
Lines changed: 2 additions & 2 deletions b/‎src/ops/reduce/cpu/reduce_cpu.cc‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/ops/reduce/cpu/reduce_cpu.h‎
Lines changed: 2 additions & 2 deletions b/‎src/ops/reduce/cpu/reduce_cpu.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/ops/reduce/cuda/reduce_cuda.cc‎
Lines changed: 122 additions & 0 deletions b/‎src/ops/reduce/cuda/reduce_cuda.cc‎
Lines changed: 122 additions & 0 deletions
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemaxDescriptor(infiniopHandle_t
                                                                 bool noop_with_empty_axes
                                                                 );
 
-__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
+__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
 
 __C __export infiniopStatus_t infiniopDestroyReducemaxDescriptor(infiniopReducemaxDescriptor_t desc);
 #endif
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemeanDescriptor(infiniopHandle_
                                                                 bool noop_with_empty_axes
                                                                 );
 
-__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, void const *src, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
+__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
 
 __C __export infiniopStatus_t infiniopDestroyReducemeanDescriptor(infiniopReducemeanDescriptor_t desc);
 #endif
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReduceminDescriptor(infiniopHandle_t
                                                                 bool noop_with_empty_axes
                                                                 );
 
-__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, void const *src, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
+__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
 
 __C __export infiniopStatus_t infiniopDestroyReduceminDescriptor(infiniopReduceminDescriptor_t desc);
 #endif
@@ -113,7 +113,6 @@ def test(
             c_bool(noop_with_empty_axes),
         )
     )
-    print(f"op desctiptor created")
     x_tensor.descriptor.contents.invalidate()
     y_tensor.descriptor.contents.invalidate()
     for i in range(NUM_PRERUN if PROFILE else 1):
@@ -151,32 +150,42 @@ def test(
 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes in test_cases:
-        print(dynamic_axes)
-        test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=torch.float16)
+    for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases:
+        test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype)
         print("\n")
         #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32)
     destroy_handle(lib, handle)
 
+def test_cuda(lib, test_cases):
+    device = DeviceEnum.DEVICE_CUDA
+    handle = create_handle(lib, device)
+    for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases:
+        test(lib, handle, "cuda", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype)
+        print("\n")
+    destroy_handle(lib, handle)
 
 if __name__ == "__main__":
     test_cases = [
         # dynamic calc test eg
-        ((2, 3, 4, 5), [0, 2], False, True, None),
-        ((2, 3, 4, 5), [0, 2], False, True, None),
-        #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes)
-        ((2, 10, 24, 10), [0, 2], False, True, None),
-        # stride = 
-        ((2, 10, 24, 10), [0, 1], False, True, None),
-        ((2, 10, 24, 10), [2, 3], False , True, None),
-        ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None),
-        # validate attribute noop_with_empty_axes and keepdims
-        ((2, 10, 24, 10), None, True, True, None),
-        ((2, 10, 24, 10), None, True, False, None),
-        ((2, 10, 24, 10), None, False, True, None),
-        ((2, 10, 24, 10), None, False, False, None),
-        ((2, 3, 4), [0, 1], False, False, None),
+        # ((2, 3, 4, 5), [0, 2], False, True, None),
+        # ((2, 3, 4, 5), [0, 2], False, True, None),
+        # #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes)
+        # ((2, 10, 24, 10), [0, 2], False, True, None),
+        # # stride = 
+        # ((2, 10, 24, 10), [0, 1], False, True, None),
+        # ((2, 10, 24, 10), [2, 3], False , True, None),
+        # ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None),
+        # # validate attribute noop_with_empty_axes and keepdims
+        # ((2, 10, 24, 10), None, True, True, None),
+        # ((2, 10, 24, 10), None, True, False, None),
+        # ((2, 10, 24, 10), None, False, True, None),
+        # ((2, 10, 24, 10), None, False, False, None),
+        # ((2, 3, 4), [0, 1], False, False, None),
         #((2, 10, 24, 10), [], True),
+        ((4,), [0], False, False, None, torch.float32),
+        ((1000, 3), [0, 1], False, False, None, torch.float16),
+        ((50, 3), [0, 1], False, False, None, torch.float32),
+        ((1000, 3), [0, 1], False, False, None, torch.float32),
     ]
     args = get_args()
     lib = open_lib()
@@ -202,5 +211,8 @@ def test_cpu(lib, test_cases):
     ]
     lib.infiniopDestroyReducemaxDescriptor.restype = c_int32
     lib.infiniopDestroyReducemaxDescriptor.argtypes = [infiniopReducemaxDescriptor_t]
-    test_cpu(lib, test_cases)
+    if args.cpu:
+        test_cpu(lib, test_cases)
+    if args.cuda:
+        test_cuda(lib, test_cases)
     print("All tests passed!")
@@ -47,6 +47,7 @@ def inferShape(x_shape, axis, noop_with_empty_axes, keepdims=False):
                 return tuple([1] * len(x_shape))
             else:
                 return tuple([])
+    
     assert len(axis) <= len(x_shape), "axis out of range"
     output_shape = []
     axis = [a if a >= 0 else a + len(x_shape) for a in axis]  # 更新 axis 列表中的值
@@ -82,9 +83,9 @@ def test(
     print(
         f"Testing reducemean on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}"
     )
-    x = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device)
+    x = torch.randint(0, 10, x_shape, dtype=tensor_dtype, device=torch_device)
     print(f"y_shape = {inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims)}")
-    y = torch.full(inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims), float('-inf'), dtype=tensor_dtype, device=torch_device)
+    y = torch.full(inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims), float(0), dtype=tensor_dtype, device=torch_device)
     print(f"y_shape = {y.shape}")
     for i in range(NUM_PRERUN if PROFILE else 1):
         ans = reduce_mean(x, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims)
@@ -141,6 +142,7 @@ def test(
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"lib time: {elapsed :10f}")
+    #print(f"input_data = {x}")
     print(f"custom op output:{y}")
     print(f"pytorch output:{ans}")
     assert torch.allclose(y, ans, atol=0, rtol=1e-3)
@@ -150,30 +152,39 @@ def test(
 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes in test_cases:
-        test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=torch.float16)
+    for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases:
+        test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype)
         print("\n")
         #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32)
     destroy_handle(lib, handle)
 
+def test_cuda(lib, test_cases):
+    device = DeviceEnum.DEVICE_CUDA
+    handle = create_handle(lib, device)
+    for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases:
+        test(lib, handle, "cuda", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype)
+        print("\n")
+    destroy_handle(lib, handle)
+
 
 if __name__ == "__main__":
     test_cases = [
         # dynamic calc test eg
-        ((2, 3, 4, 5), [0, 2], False, True, None),
-        ((2, 3, 4, 5), [0, 2], False, True, None),
-        #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes)
-        ((2, 10, 24, 10), [0, 2], False, True, None),
-        # stride = 
-        ((2, 10, 24, 10), [0, 1], False, True, None),
-        ((2, 10, 24, 10), [2, 3], False , True, None),
-        ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None),
+        # ((2, 3, 4, 5), [0, 2], False, True, None),
+        # ((2, 3, 4, 5), [0, 2], False, True, None),
+        # #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes)
+        # ((2, 10, 24, 10), [0, 2], False, True, None),
+        # # stride = 
+        # ((2, 10, 24, 10), [0, 1], False, True, None),
+        # ((2, 10, 24, 10), [2, 3], False , True, None),
+        ((50, 3), [0, 1], False, False, None, torch.float16),
+        ((1000, 3), [0, 1], False, False, None, torch.float16),
         # validate attribute noop_with_empty_axes and keepdims
-        ((2, 10, 24, 10), None, True, True, None),
-        ((2, 10, 24, 10), None, True, False, None),
-        ((2, 10, 24, 10), None, False, True, None),
-        ((2, 10, 24, 10), None, False, False, None),
-        ((2, 3, 4), [0, 1], False, False, None),
+        # ((2, 10, 24, 10), None, True, True, None),
+        # ((2, 10, 24, 10), None, True, False, None),
+        # ((2, 10, 24, 10), None, False, True, None),
+        # ((2, 10, 24, 10), None, False, False, None),
+        # ((2, 3, 4), [0, 1], False, False, None),
         #((2, 10, 24, 10), [], True),
     ]
     args = get_args()
@@ -200,5 +211,8 @@ def test_cpu(lib, test_cases):
     ]
     lib.infiniopDestroyReducemeanDescriptor.restype = c_int32
     lib.infiniopDestroyReducemeanDescriptor.argtypes = [infiniopReducemeanDescriptor_t]
-    test_cpu(lib, test_cases)
+    if args.cpu:
+        test_cpu(lib, test_cases)
+    if args.cuda:
+        test_cuda(lib, test_cases)
     print("All tests passed!")
@@ -214,8 +214,8 @@ infiniopStatus_t reduce_cpu(ReduceCpuDescriptor_t desc,
 
 infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc,
                             void *y,
-                            void const *x,
-                            void const *dynamic_axes,
+                            void *x,
+                            void *dynamic_axes,
                             uint64_t dynamic_axes_size,
                             void *stream){
     if (desc->is_axes_static == true && dynamic_axes_size > 0){
 
@@ -44,8 +44,8 @@ infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle,
 
 infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc,
                             void *y,
-                            void const *x,
-                            void const *dynamic_axes,
+                            void *x,
+                            void *dynamic_axes,
                             uint64_t dynamic_axes_size,
                             void *stream);
 
 
@@ -0,0 +1,122 @@
+#include "reduce_cuda.h"
+#include "../../../devices/cuda/common_cuda.h"
+#include "../../utils.h"
+// Allocates `bytes` of device memory at `*device_ptr` and fills it from the
+// host buffer `host_src`. `*device_ptr` must be nullptr on entry so that a
+// failed allocation leaves a pointer that is safe to pass to cudaFree.
+static infiniopStatus_t uploadToDevice(void **device_ptr, void const *host_src, size_t bytes) {
+    checkCudaErrorWithCode(cudaMalloc(device_ptr, bytes), STATUS_MEMORY_NOT_ALLOCATED);
+    checkCudaErrorWithCode(cudaMemcpy(*device_ptr, host_src, bytes, cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED);
+    return STATUS_SUCCESS;
+}
+
+/**
+ * Creates the CUDA descriptor used to reduce tensor `x` into tensor `y`.
+ *
+ * Validates both tensor descriptors, marks the reduced axes in a mask with one
+ * entry per input axis, selects a `reduce_mode`, and uploads the mask together
+ * with the shape/stride arrays of both tensors to device memory. On success
+ * the device buffers are owned by the descriptor written to `*desc_ptr`; on
+ * failure everything allocated here is released and `*desc_ptr` is untouched.
+ *
+ * reduce_mode values (notes in quotes are the original author's):
+ *   0 - axes are consecutive and cover every input axis
+ *   1 - consecutive axes, reduce_size > 1024 and output_size < 128
+ *       ("multi-thread for each output element")
+ *   2 - consecutive axes otherwise ("one thread for each output element")
+ *   3 - non-consecutive axes, reduce_size > 1024 and output_size < 128
+ *   4 - non-consecutive axes otherwise
+ *
+ * @param handle         CUDA handle (not read by this function).
+ * @param desc_ptr       Receives the newly allocated descriptor on success.
+ * @param y              Output tensor descriptor; same dtype as `x`, contiguous.
+ * @param x              Input tensor descriptor; F16 or F32, contiguous.
+ * @param axes           Host array of axes to reduce. Negative entries are
+ *                       wrapped by adding x->ndim; entries still out of range
+ *                       are rejected.
+ * @param axes_size      Number of entries in `axes`.
+ * @param reduce_op_type Reduction operator id, stored as-is in the descriptor.
+ * @param keepdims       When true, `y` must have the same rank as `x`.
+ * @return STATUS_SUCCESS, STATUS_BAD_TENSOR_DTYPE, STATUS_BAD_TENSOR_SHAPE,
+ *         STATUS_BAD_TENSOR_STRIDES, STATUS_MEMORY_NOT_ALLOCATED or
+ *         STATUS_EXECUTION_FAILED.
+ */
+infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle,
+                                            ReduceCudaDescriptor_t *desc_ptr,
+                                            infiniopTensorDescriptor_t y,
+                                            infiniopTensorDescriptor_t x,
+                                            int64_t const *axes,
+                                            uint64_t axes_size,
+                                            int reduce_op_type,
+                                            bool keepdims
+                                            ) {
+    if (x->dt != F16 && x->dt != F32) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (keepdims && x->ndim != y->ndim) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    if (x->dt != y->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (!is_contiguous(x) || !is_contiguous(y)) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+
+    uint64_t element_num = 1;
+    for (uint64_t i = 0; i < x->ndim; i++) {
+        element_num *= x->shape[i];
+    }
+    uint64_t output_size = 1;
+    for (uint64_t i = 0; i < y->ndim; i++) {
+        output_size *= y->shape[i];
+    }
+    // A zero-sized output dimension would make the division below trap.
+    if (output_size == 0) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    uint64_t reduce_size = element_num / output_size;
+    uint64_t ndim = y->ndim;
+
+    // Value-initialise the mask so every axis not listed in `axes` is false;
+    // a plain `new bool[n]` would leave those entries indeterminate.
+    bool *reduce_mask = new bool[x->ndim]();
+    bool if_reduce_axes_contiguous = true;
+    int64_t const input_rank = static_cast<int64_t>(x->ndim);
+    int64_t previous_axis = 0;
+    for (uint64_t i = 0; i < axes_size; i++) {
+        int64_t const axis = axes[i] < 0 ? axes[i] + input_rank : axes[i];
+        if (axis < 0 || axis >= input_rank) {
+            // Reject instead of writing outside the mask.
+            delete[] reduce_mask;
+            return STATUS_BAD_TENSOR_SHAPE;
+        }
+        reduce_mask[axis] = true;
+        // Axes count as contiguous only when each one follows its predecessor by exactly 1.
+        if (i > 0 && previous_axis != axis - 1) {
+            if_reduce_axes_contiguous = false;
+        }
+        previous_axis = axis;
+    }
+
+    // Large reductions feeding few outputs get the multi-thread-per-output modes.
+    bool const many_threads_per_output = reduce_size > 1024 && output_size < 128;
+    int reduce_mode = 0;
+    if (if_reduce_axes_contiguous) {
+        if (axes_size == x->ndim) {
+            reduce_mode = 0; // all axes are reduced
+        } else {
+            reduce_mode = many_threads_per_output ? 1 : 2;
+        }
+    } else {
+        reduce_mode = many_threads_per_output ? 3 : 4;
+    }
+
+    // Device-side copies. They start as nullptr so the failure path can call
+    // cudaFree on all of them unconditionally (cudaFree(nullptr) is a no-op).
+    bool *d_reduce_mask = nullptr;
+    int64_t *d_input_strides = nullptr;
+    int64_t *d_output_strides = nullptr;
+    uint64_t *d_input_shape = nullptr;
+    uint64_t *d_output_shape = nullptr;
+
+    // Shapes and strides are uploaded straight from the tensor descriptors;
+    // no intermediate host copies are needed.
+    infiniopStatus_t status = uploadToDevice((void **) &d_reduce_mask, reduce_mask, x->ndim * sizeof(bool));
+    if (status == STATUS_SUCCESS) {
+        status = uploadToDevice((void **) &d_input_strides, x->strides, x->ndim * sizeof(int64_t));
+    }
+    if (status == STATUS_SUCCESS) {
+        status = uploadToDevice((void **) &d_output_strides, y->strides, y->ndim * sizeof(int64_t));
+    }
+    if (status == STATUS_SUCCESS) {
+        status = uploadToDevice((void **) &d_input_shape, x->shape, x->ndim * sizeof(uint64_t));
+    }
+    if (status == STATUS_SUCCESS) {
+        status = uploadToDevice((void **) &d_output_shape, y->shape, y->ndim * sizeof(uint64_t));
+    }
+
+    // The host mask is only a staging buffer; release it on every path.
+    delete[] reduce_mask;
+
+    if (status != STATUS_SUCCESS) {
+        // Best-effort cleanup; the upload failure is the error that gets reported.
+        cudaFree(d_reduce_mask);
+        cudaFree(d_input_strides);
+        cudaFree(d_output_strides);
+        cudaFree(d_input_shape);
+        cudaFree(d_output_shape);
+        return status;
+    }
+
+    *desc_ptr = new ReduceCudaDescriptor{
+        DevNvGpu,
+        x->dt,
+        ndim,
+        d_reduce_mask,
+        d_input_strides,
+        d_output_strides,
+        d_input_shape,
+        d_output_shape,
+        reduce_size,
+        element_num,
+        output_size,
+        static_cast<int>(reduce_op_type),
+        reduce_mode
+    };
+    return STATUS_SUCCESS;
+}
+
+// Frees one device buffer and maps a CUDA failure to STATUS_EXECUTION_FAILED.
+static infiniopStatus_t freeReduceDeviceBuffer(void *device_buffer) {
+    checkCudaErrorWithCode(cudaFree(device_buffer), STATUS_EXECUTION_FAILED);
+    return STATUS_SUCCESS;
+}
+
+/**
+ * Releases a descriptor and the five device buffers it owns.
+ *
+ * Every buffer is freed even if an earlier cudaFree fails, and the descriptor
+ * object itself is always deleted, so a single failure no longer leaks the
+ * remaining allocations.
+ *
+ * @param desc Descriptor to destroy; a null descriptor is treated as a no-op.
+ * @return STATUS_SUCCESS, or STATUS_EXECUTION_FAILED if any cudaFree failed.
+ */
+infiniopStatus_t cudaDestroyReduceDescriptor(ReduceCudaDescriptor_t desc) {
+    if (desc == nullptr) {
+        return STATUS_SUCCESS;
+    }
+    void *device_buffers[] = {
+        (void *) desc->reduce_mask,
+        (void *) desc->input_strides,
+        (void *) desc->output_strides,
+        (void *) desc->input_shape,
+        (void *) desc->output_shape,
+    };
+    infiniopStatus_t status = STATUS_SUCCESS;
+    for (void *device_buffer : device_buffers) {
+        // Keep going after a failure so the remaining buffers are still released.
+        if (freeReduceDeviceBuffer(device_buffer) != STATUS_SUCCESS) {
+            status = STATUS_EXECUTION_FAILED;
+        }
+    }
+    delete desc;
+    return status;
+}