contiguous axes reduce op

Graylatzhou · Graylatzhou · commit fc1515210fd7 · 2025-03-09T02:30:45.000+08:00
diff --git a/operatorspy/tests/reducemax.py b/operatorspy/tests/reducemax.py
@@ -21,7 +21,7 @@
 from typing import Tuple
 import numpy as np
 
-PROFILE = False
+PROFILE = True
 NUM_PRERUN = 1
 NUM_ITERATIONS = 1
 
@@ -141,8 +141,9 @@ def test(
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"lib time: {elapsed :10f}")
-    print(f"custom op output:{y}")
-    print(f"pytorch output:{ans}")
+    # print(f"input : {x}")
+    # print(f"custom op output:{y}")
+    # print(f"pytorch output:{ans}")
     assert torch.allclose(y, ans, atol=0, rtol=1e-3)
     
     check_error(lib.infiniopDestroyReducemaxDescriptor(descriptor))
@@ -182,10 +183,13 @@ def test_cuda(lib, test_cases):
         # ((2, 10, 24, 10), None, False, False, None),
         # ((2, 3, 4), [0, 1], False, False, None),
         #((2, 10, 24, 10), [], True),
-        ((4,), [0], False, False, None, torch.float32),
-        ((1000, 3), [0, 1], False, False, None, torch.float16),
-        ((50, 3), [0, 1], False, False, None, torch.float32),
-        ((1000, 3), [0, 1], False, False, None, torch.float32),
+        #((4,), [0], False, False, None, torch.float32),
+        ((1000, 300), [0, 1], False, False, None, torch.float16),
+        ((50, 3), [0, 1], False, False, None, torch.float16),
+        ((1000, 300), [0, 1], False, False, None, torch.float16),
+        ((2000, 200, 50), [0, 1], False, True, None, torch.float32),
+        ((1000, 200, 500), [0, 1], False, True, None, torch.float16),
+        ((1000, 200, 50), [0, 1], False, True, None, torch.float32),
     ]
     args = get_args()
     lib = open_lib()
diff --git a/operatorspy/tests/reducemean.py b/operatorspy/tests/reducemean.py
@@ -21,7 +21,7 @@
 from typing import Tuple
 import numpy as np
 
-PROFILE = False
+PROFILE = True
 NUM_PRERUN = 1
 NUM_ITERATIONS = 1
 
@@ -177,8 +177,13 @@ def test_cuda(lib, test_cases):
         # # stride = 
         # ((2, 10, 24, 10), [0, 1], False, True, None),
         # ((2, 10, 24, 10), [2, 3], False , True, None),
-        ((50, 3), [0, 1], False, False, None, torch.float16),
-        ((1000, 3), [0, 1], False, False, None, torch.float16),
+        #((1000, 300), [0, 1], False, False, None, torch.float16),
+        ((30, 50, 20, 1000), [0, 1, 2, 3], False, False, None, torch.float16),
+        ((30000, 1000, 40), [0, 1], False, False, None, torch.float32),
+        #((1000, 300), [0, 1], False, False, None, torch.float16),
+        ((2, 2, 5), [0, 1], False, True, None, torch.float32),
+        ((1000, 200, 500), [0, 1], False, True, None, torch.float16),
+        ((1000, 200, 50), [0, 1], False, True, None, torch.float32),
         # validate attribute noop_with_empty_axes and keepdims
         # ((2, 10, 24, 10), None, True, True, None),
         # ((2, 10, 24, 10), None, True, False, None),
diff --git a/operatorspy/tests/reducemin.py b/operatorspy/tests/reducemin.py
@@ -21,7 +21,7 @@
 from typing import Tuple
 import numpy as np
 
-PROFILE = False
+PROFILE = True
 NUM_PRERUN = 1
 NUM_ITERATIONS = 1
 
@@ -141,41 +141,50 @@ def test(
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"lib time: {elapsed :10f}")
-    print(f"custom op output:{y}")
-    print(f"pytorch output:{ans}")
+    # print(f"custom op output:{y}")
+    # print(f"pytorch output:{ans}")
     assert torch.allclose(y, ans, atol=0, rtol=1e-3)
     
     check_error(lib.infiniopDestroyReducemaxDescriptor(descriptor))
 
 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes in test_cases:
-        print(dynamic_axes)
-        test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=torch.float16)
-        print("\n")
+    for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases:
+        test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype)
         #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32)
     destroy_handle(lib, handle)
 
+def test_cuda(lib, test_cases):
+    device = DeviceEnum.DEVICE_CUDA
+    handle = create_handle(lib, device)
+    for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases:
+        test(lib, handle, "cuda", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype)
+        print("\n")
+    destroy_handle(lib, handle)
 
 if __name__ == "__main__":
     test_cases = [
         # dynamic calc test eg
-        ((2, 3, 4, 5), [0, 2], False, True, None),
-        ((2, 3, 4, 5), [0, 2], False, True, None),
-        #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes)
-        ((2, 10, 24, 10), [0, 2], False, True, None),
-        # stride = 
-        ((2, 10, 24, 10), [0, 1], False, True, None),
-        ((2, 10, 24, 10), [2, 3], False , True, None),
-        ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None),
-        # validate attribute noop_with_empty_axes and keepdims
-        ((2, 10, 24, 10), None, True, True, None),
-        ((2, 10, 24, 10), None, True, False, None),
-        ((2, 10, 24, 10), None, False, True, None),
-        ((2, 10, 24, 10), None, False, False, None),
-        ((2, 3, 4), [0, 1], False, False, None),
-        #((2, 10, 24, 10), [], True),
+        # ((2, 3, 4, 5), [0, 2], False, True, None),
+        # ((2, 3, 4, 5), [0, 2], False, True, None),
+        # #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes)
+        # ((2, 10, 24, 10), [0, 2], False, True, None),
+        # # stride = 
+        # ((2, 10, 24, 10), [0, 1], False, True, None),
+        # ((2, 10, 24, 10), [2, 3], False , True, None),
+        # ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None),
+        # # validate attribute noop_with_empty_axes and keepdims
+        # ((2, 10, 24, 10), None, True, True, None),
+        # ((2, 10, 24, 10), None, True, False, None),
+        # ((2, 10, 24, 10), None, False, True, None),
+        # ((2, 10, 24, 10), None, False, False, None),
+        # ((2, 3, 4), [0, 1], False, False, None),
+        # #((2, 10, 24, 10), [], True),
+        ((2, 1000), [0, 1], False, False, None, torch.float32),
+        ((2, 2, 5), [0, 1], False, True, None, torch.float32),
+        ((1000, 200, 500), [0, 1], False, True, None, torch.float16),
+        ((1000, 200, 50), [0, 1], False, True, None, torch.float32),
     ]
     args = get_args()
     lib = open_lib()
@@ -201,5 +210,8 @@ def test_cpu(lib, test_cases):
     ]
     lib.infiniopDestroyReduceminDescriptor.restype = c_int32
     lib.infiniopDestroyReduceminDescriptor.argtypes = [infiniopReduceminDescriptor_t]
-    test_cpu(lib, test_cases)
+    if args.cpu:
+        test_cpu(lib, test_cases)
+    if args.cuda:
+        test_cuda(lib, test_cases)
     print("All tests passed!")
diff --git a/src/ops/reduce/cuda/reduce_cuda.cc b/src/ops/reduce/cuda/reduce_cuda.cc
@@ -36,19 +36,18 @@ infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle,
         output_size *= y->shape[i];
     }
     uint64_t reduce_size = element_num / output_size;
-    uint64_t ndim = y->ndim;
 
     bool *reduce_mask = new bool[x->ndim];
     int64_t *input_strides = new int64_t[x->ndim];
     int64_t *output_strides = new int64_t[y->ndim];
     uint64_t *input_shape = new uint64_t[x->ndim];
     uint64_t *output_shape = new uint64_t[y->ndim];
-
+    memset(reduce_mask, 0, x->ndim * sizeof(bool)); 
     memcpy(input_shape, x->shape, x->ndim * sizeof(uint64_t));
     memcpy(output_shape, y->shape, y->ndim * sizeof(uint64_t));
     memcpy(input_strides, x->strides, x->ndim * sizeof(int64_t));
     memcpy(output_strides, y->strides, y->ndim * sizeof(int64_t));
-
+    int prefix_size = 1, suffix_size = 1;
     bool if_reduce_axes_contiguous = true;
     int reduce_mode = 0;
     for (uint64_t i = 0; i < axes_size; i++) {
@@ -60,21 +59,25 @@ infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle,
     if (if_reduce_axes_contiguous) {
         if (axes_size == x->ndim) {
             // all axes are reduced
-            int reduce_mode = 0;
+            reduce_mode = 0;
         } else {
-            // some axes are not reduced but axes are contiguous
-            if (reduce_size > 1024 && output_size < 128) reduce_mode = 1; // multi-thread for each output element
-            else reduce_mode = 2; // one thread for each output element
+            for (uint64_t i = 0; i < axes[0]; i++) {  
+                prefix_size *= x->shape[i];
+            }
+            for (uint64_t i = axes[axes_size - 1] + 1; i < x->ndim; i++) {  
+                suffix_size *= x->shape[i];
+            }
+            reduce_mode = 1;
         }
     } else {
-        if (reduce_size > 1024 && output_size < 128) reduce_mode = 3;
+        if (reduce_size > 1024 && output_size > 128) reduce_mode = 3;
         else reduce_mode = 4;
     }
-    bool *d_reduce_mask = new bool[x->ndim];
-    int64_t *d_input_strides = new int64_t[x->ndim];
-    int64_t *d_output_strides = new int64_t[y->ndim];
-    uint64_t *d_input_shape = new uint64_t[x->ndim];
-    uint64_t *d_output_shape = new uint64_t[y->ndim];
+    bool *d_reduce_mask;
+    int64_t *d_input_strides;
+    int64_t *d_output_strides;
+    uint64_t *d_input_shape;
+    uint64_t *d_output_shape;
 
     checkCudaErrorWithCode(cudaMalloc((void**)&d_reduce_mask, x->ndim * sizeof(bool)), STATUS_MEMORY_NOT_ALLOCATED);
     checkCudaErrorWithCode(cudaMalloc((void**)&d_input_strides, x->ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED);
@@ -91,7 +94,8 @@ infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle,
     *desc_ptr = new ReduceCudaDescriptor{
         DevNvGpu,
         x->dt,
-        ndim,
+        y->ndim,
+        x->ndim,
         d_reduce_mask,
         d_input_strides,
         d_output_strides,
@@ -101,7 +105,13 @@ infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle,
         element_num,
         output_size,
         static_cast<int>(reduce_op_type),
-        reduce_mode
+        reduce_mode,
+        axes_size,
+        keepdims,
+        axes[0],
+        axes[axes_size - 1],
+        prefix_size,
+        suffix_size
     };
     delete [] reduce_mask;
     delete [] input_strides;
diff --git a/src/ops/reduce/cuda/reduce_cuda.cu b/src/ops/reduce/cuda/reduce_cuda.cu
diff --git a/src/ops/reduce/cuda/reduce_cuda.h b/src/ops/reduce/cuda/reduce_cuda.h