From 7a9f0a5c43ac04c60f004a75b1333508834352d6 Mon Sep 17 00:00:00 2001 From: Graylatzhou <1391087899@qq.com> Date: Wed, 26 Feb 2025 01:35:02 +0800 Subject: [PATCH 1/4] Clip, reduce, gather, where Cpu operators --- env.sh | 1 + include/infini_operators.h | 6 + include/ops/clip/clip.h | 22 ++ include/ops/gather/gather.h | 24 +++ include/ops/reducemax/reducemax.h | 25 +++ include/ops/reducemean/reducemean.h | 25 +++ include/ops/reducemin/reducemin.h | 25 +++ include/ops/where/where.h | 24 +++ operatorspy/liboperators.py | 2 + operatorspy/tests/clip.py | 156 ++++++++++++++ operatorspy/tests/gather.py | 148 ++++++++++++++ operatorspy/tests/matmul.py | 4 +- operatorspy/tests/max_pool.py | 2 +- operatorspy/tests/reducemax.py | 206 +++++++++++++++++++ operatorspy/tests/reducemean.py | 204 +++++++++++++++++++ operatorspy/tests/reducemin.py | 205 +++++++++++++++++++ operatorspy/tests/where.py | 166 +++++++++++++++ src/ops/clip/cpu/clip_cpu.cc | 79 +++++++ src/ops/clip/cpu/clip_cpu.h | 28 +++ src/ops/clip/operator.cc | 44 ++++ src/ops/gather/cpu/gather_cpu.cc | 110 ++++++++++ src/ops/gather/cpu/gather_cpu.h | 35 ++++ src/ops/gather/operator.cc | 43 ++++ src/ops/reduce/cpu/reduce_cpu.cc | 305 ++++++++++++++++++++++++++++ src/ops/reduce/cpu/reduce_cpu.h | 54 +++++ src/ops/reduce/operator.cc | 47 +++++ src/ops/reduce/reduce.h | 26 +++ src/ops/reducemax/operator.cc | 41 ++++ src/ops/reducemean/operator.cc | 41 ++++ src/ops/reducemin/operator.cc | 40 ++++ src/ops/where/cpu/where_cpu.cc | 144 +++++++++++++ src/ops/where/cpu/where_cpu.h | 36 ++++ src/ops/where/operator.cc | 46 +++++ src/tensor/tensor_descriptor.cc | 1 + 34 files changed, 2362 insertions(+), 3 deletions(-) create mode 100644 env.sh create mode 100644 include/ops/clip/clip.h create mode 100644 include/ops/gather/gather.h create mode 100644 include/ops/reducemax/reducemax.h create mode 100644 include/ops/reducemean/reducemean.h create mode 100644 include/ops/reducemin/reducemin.h create mode 100644 include/ops/where/where.h create mode 100644 operatorspy/tests/clip.py create mode 100644 operatorspy/tests/gather.py create mode 100644 operatorspy/tests/reducemax.py create mode 100644 operatorspy/tests/reducemean.py create mode 100644 operatorspy/tests/reducemin.py create mode 100644 operatorspy/tests/where.py create mode 100644 src/ops/clip/cpu/clip_cpu.cc create mode 100644 src/ops/clip/cpu/clip_cpu.h create mode 100644 src/ops/clip/operator.cc create mode 100644 src/ops/gather/cpu/gather_cpu.cc create mode 100644 src/ops/gather/cpu/gather_cpu.h create mode 100644 src/ops/gather/operator.cc create mode 100644 src/ops/reduce/cpu/reduce_cpu.cc create mode 100644 src/ops/reduce/cpu/reduce_cpu.h create mode 100644 src/ops/reduce/operator.cc create mode 100644 src/ops/reduce/reduce.h create mode 100644 src/ops/reducemax/operator.cc create mode 100644 src/ops/reducemean/operator.cc create mode 100644 src/ops/reducemin/operator.cc create mode 100644 src/ops/where/cpu/where_cpu.cc create mode 100644 src/ops/where/cpu/where_cpu.h create mode 100644 src/ops/where/operator.cc diff --git a/env.sh b/env.sh new file mode 100644 index 00000000..b743e9e7 --- /dev/null +++ b/env.sh @@ -0,0 +1 @@ +export INFINI_ROOT=/home/zzw/workspace/operators-dev/build/linux/x86_64/release \ No newline at end of file diff --git a/include/infini_operators.h b/include/infini_operators.h index 9a5a2555..aedd3dd1 100644 --- a/include/infini_operators.h +++ b/include/infini_operators.h @@ -16,4 +16,10 @@ #include "ops/rms_norm/rms_norm.h" #include 
"ops/rotary_embedding/rotary_embedding.h" #include "ops/swiglu/swiglu.h" +#include "ops/reducemax/reducemax.h" +#include "ops/reducemean/reducemean.h" +#include "ops/reducemin/reducemin.h" +#include "ops/clip/clip.h" +#include "ops/where/where.h" +#include "ops/gather/gather.h" #include "tensor/tensor_descriptor.h" diff --git a/include/ops/clip/clip.h b/include/ops/clip/clip.h new file mode 100644 index 00000000..3ebd1b0e --- /dev/null +++ b/include/ops/clip/clip.h @@ -0,0 +1,22 @@ +#ifndef CLIP_H +#define CLIP_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ClipDescriptor { + Device device; +} ClipDescriptor; +typedef ClipDescriptor *infiniopClipDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle, + infiniopClipDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y + ); + +__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, void *min, void *max, void *y, void *stream); + +__C __export infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/ops/gather/gather.h b/include/ops/gather/gather.h new file mode 100644 index 00000000..ae4983da --- /dev/null +++ b/include/ops/gather/gather.h @@ -0,0 +1,24 @@ +#ifndef GAHTER_H +#define GAHTER_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct GatherDescriptor { + Device device; +} GatherDescriptor; +typedef GatherDescriptor *infiniopGatherDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t indices, + int64_t axis + ); + +__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void const *x, void *indices, void *y, void *stream); + +__C __export infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/ops/reducemax/reducemax.h b/include/ops/reducemax/reducemax.h new file mode 100644 index 00000000..f04e5349 --- /dev/null +++ b/include/ops/reducemax/reducemax.h @@ -0,0 +1,25 @@ +#ifndef REDUCEMAX_H +#define REDUCEMAX_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReducemaxDescriptor { + Device device; +} ReducemaxDescriptor; +typedef ReducemaxDescriptor *infiniopReducemaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReducemaxDescriptor(infiniopHandle_t handle, + infiniopReducemaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t const *axes, + uint64_t n, + bool keepdims, + bool noop_with_empty_axes + ); + +__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReducemaxDescriptor(infiniopReducemaxDescriptor_t desc); +#endif diff --git a/include/ops/reducemean/reducemean.h b/include/ops/reducemean/reducemean.h new file mode 100644 index 00000000..3baa22d8 --- /dev/null +++ b/include/ops/reducemean/reducemean.h @@ -0,0 +1,25 @@ +#ifndef REDUCEMEAN_H +#define REDUCEMEAN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReducemeanDescriptor { + Device device; +} ReducemeanDescriptor; +typedef ReducemeanDescriptor 
*infiniopReducemeanDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReducemeanDescriptor(infiniopHandle_t handle, + infiniopReducemeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t const *axes, + uint64_t n, + bool keepdims, + bool noop_with_empty_axes + ); + +__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, void const *src, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReducemeanDescriptor(infiniopReducemeanDescriptor_t desc); +#endif diff --git a/include/ops/reducemin/reducemin.h b/include/ops/reducemin/reducemin.h new file mode 100644 index 00000000..8c0fc027 --- /dev/null +++ b/include/ops/reducemin/reducemin.h @@ -0,0 +1,25 @@ +#ifndef REDUCEMIN_H +#define REDUCEMIN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceminDescriptor { + Device device; +} ReduceminDescriptor; +typedef ReduceminDescriptor *infiniopReduceminDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceminDescriptor(infiniopHandle_t handle, + infiniopReduceminDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int64_t const *axes, + uint64_t n, + bool keepdims, + bool noop_with_empty_axes + ); + +__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, void const *src, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceminDescriptor(infiniopReduceminDescriptor_t desc); +#endif diff --git a/include/ops/where/where.h b/include/ops/where/where.h new file mode 100644 index 00000000..373fc845 --- /dev/null +++ b/include/ops/where/where.h @@ -0,0 +1,24 @@ +#ifndef WHERE_H +#define WHERE_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct WhereDescriptor { + Device device; +} WhereDescriptor; +typedef WhereDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src1, + infiniopTensorDescriptor_t src2, + infiniopTensorDescriptor_t condition + ); + +__C __export infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, void *dst, void *src1, void *src2, void *condition, void *stream); + +__C __export infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/operatorspy/liboperators.py b/operatorspy/liboperators.py index 0909c0cf..3ff8d3ba 100644 --- a/operatorspy/liboperators.py +++ b/operatorspy/liboperators.py @@ -45,6 +45,8 @@ class Handle(Structure): def open_lib(): def find_library_in_ld_path(library_name): ld_library_path = LIB_OPERATORS_DIR + + print(LIB_OPERATORS_DIR) paths = ld_library_path.split(os.pathsep) for path in paths: full_path = os.path.join(path, library_name) diff --git a/operatorspy/tests/clip.py b/operatorspy/tests/clip.py new file mode 100644 index 00000000..f273c84d --- /dev/null +++ b/operatorspy/tests/clip.py @@ -0,0 +1,156 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + 
infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +from typing import Tuple +import numpy as np + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class ClipDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopClipDescriptor_t = POINTER(ClipDescriptor) + +def clip(input, min, max): + return torch.clamp(input, min, max) + + +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + +def test( + lib, + handle, + torch_device, + x_shape, + min, + max, + tensor_dtype=torch.float16 +): + print( + f"Testing clip on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype} max:{max} min:{min}" + ) + x = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device) + output = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device) + if min != None: + min = torch.tensor(min, dtype=torch.float32, device=torch_device) + else: + min = torch.tensor(float("-inf"), dtype=torch.float32, device=torch_device) + if max != None: + max = torch.tensor(max, dtype=torch.float32, device=torch_device) + else: + max = torch.tensor(float("inf"), dtype=torch.float32, device=torch_device) + for i in range(NUM_PRERUN if PROFILE else 1): + if min == None and max == None: + break + ans = clip(x, min, max) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = clip(x, min, max) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :10f}") + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(output, lib) + descriptor = infiniopClipDescriptor_t() + check_error( + lib.infiniopCreateClipDescriptor( + handle, + ctypes.byref(descriptor), + x_tensor.descriptor, + y_tensor.descriptor, + ) + ) + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopClip( + descriptor, + x_tensor.data, + min.data_ptr() if min != None else None, + max.data_ptr() if max != None else None, + y_tensor.data, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopClip( + descriptor, + x_tensor.data, + min.data_ptr() if min != None else None, + max.data_ptr() if max != None else None, + y_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"lib time: {elapsed :10f}") + print("x:", x) + print("custom op ans:", output) + print("ans:", ans) if max != None or min != None else print("ans:", x) + assert torch.allclose(output, ans, atol=0, rtol=0) if max != None or min != None else torch.allclose(output, x, atol=0, rtol=0) + check_error(lib.infiniopDestroyClipDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, min, max in test_cases: + test(lib, handle, "cpu", x_shape, min, max, tensor_dtype=torch.float16) + print("\n") + #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + ((3, 4), -1, 1), + ((3, 4), None, 1), + ((3, 4), -1, None), + ((3, 4), None, None) + # stride = + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateClipDescriptor.restype = c_int32 + lib.infiniopCreateClipDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopClipDescriptor_t), + 
infiniopTensorDescriptor_t, + ] + lib.infiniopClip.restype = c_int32 + lib.infiniopClip.argtypes = [ + infiniopClipDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyClipDescriptor.restype = c_int32 + lib.infiniopDestroyClipDescriptor.argtypes = [infiniopClipDescriptor_t] + test_cpu(lib, test_cases) + print("All tests passed!") \ No newline at end of file diff --git a/operatorspy/tests/gather.py b/operatorspy/tests/gather.py new file mode 100644 index 00000000..6a83ce26 --- /dev/null +++ b/operatorspy/tests/gather.py @@ -0,0 +1,148 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +from typing import Tuple +import numpy as np + +PROFILE = True +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class GatherDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopGatherDescriptor_t = POINTER(GatherDescriptor) + +def gather(input, indices, axis): + np_input = input.numpy() + np_indices = indices.numpy() + np_output = np.take(np_input, np_indices, axis=axis) + return torch.from_numpy(np_output) + +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + +def inferShape(input_shape, indices_shape, axis): + output_shape = input_shape[:axis] + tuple(indices_shape) + input_shape[axis + 1:] + return output_shape + +def test( + lib, + handle, + torch_device, + x_shape, + indices_shape, + axis, + tensor_dtype=torch.float16 +): + print( + f"Testing clip on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}" + ) + x = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device) + if len(x.shape) == 2: + indices = torch.tensor(2, dtype=torch.int64, device=torch_device) + elif len(x.shape) == 3: + indices = torch.tensor([[0, 1], [1, 2]], dtype=torch.int64, device=torch_device) + dst = torch.randn(inferShape(x_shape, indices.shape, axis), dtype=tensor_dtype, device=torch_device) + ans = gather(x, indices, axis) + axis = axis + x_tensor = to_tensor(x, lib) + indices_tensor = to_tensor(indices, lib) + dst_tensor = to_tensor(dst, lib) + descriptor = infiniopGatherDescriptor_t() + check_error( + lib.infiniopCreateGatherDescriptor( + handle, + ctypes.byref(descriptor), + dst_tensor.descriptor, + x_tensor.descriptor, + indices_tensor.descriptor, + axis + ) + ) + x_tensor.descriptor.contents.invalidate() + indices_tensor.descriptor.contents.invalidate() + dst_tensor.descriptor.contents.invalidate() + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopGather( + descriptor, + x_tensor.data, + indices_tensor.data, + dst_tensor.data, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGather( + descriptor, + x_tensor.data, + indices_tensor.data, + dst_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"lib time: {elapsed :10f}") + print(f"pytorch ans: {ans}") + print(f"lib ans: {dst_tensor.data}") + assert torch.allclose(dst, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyGatherDescriptor(descriptor)) + +def test_cpu(lib, 
test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, indices_shape, axis in test_cases: + test(lib, handle, "cpu", x_shape, indices_shape, axis, tensor_dtype=torch.float16) + print("\n") + #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + ((3, 4), (2), 0), + ((2, 3, 4), (2, 2), 1), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateGatherDescriptor.restype = c_int32 + lib.infiniopCreateGatherDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopGatherDescriptor_t), + infiniopTensorDescriptor_t, + ] + lib.infiniopGather.restype = c_int32 + lib.infiniopGather.argtypes = [ + infiniopGatherDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyGatherDescriptor.restype = c_int32 + lib.infiniopDestroyGatherDescriptor.argtypes = [infiniopGatherDescriptor_t] + test_cpu(lib, test_cases) + print("All tests passed!") \ No newline at end of file diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py index ac4b0f7f..a949390e 100644 --- a/operatorspy/tests/matmul.py +++ b/operatorspy/tests/matmul.py @@ -22,9 +22,9 @@ from operatorspy.tests.test_utils import get_args, synchronize_device import torch -PROFILE = False +PROFILE = True NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 +NUM_ITERATIONS = 50 class MatmulDescriptor(Structure): _fields_ = [("device", c_int32)] diff --git a/operatorspy/tests/max_pool.py b/operatorspy/tests/max_pool.py index ffc0bb19..7cd4ef8a 100644 --- a/operatorspy/tests/max_pool.py +++ b/operatorspy/tests/max_pool.py @@ -23,7 +23,7 @@ # constant for control whether profile the pytorch and lib functions # NOTE: need to manually add synchronization function to the lib function, # e.g., cudaDeviceSynchronize() for CUDA -PROFILE = False +PROFILE = True NUM_PRERUN = 10 NUM_ITERATIONS = 1000 diff --git a/operatorspy/tests/reducemax.py b/operatorspy/tests/reducemax.py new file mode 100644 index 00000000..e2086293 --- /dev/null +++ b/operatorspy/tests/reducemax.py @@ -0,0 +1,206 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +from typing import Tuple +import numpy as np + +PROFILE = False +NUM_PRERUN = 1 +NUM_ITERATIONS = 1 + +class ReducemaxDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopReducemaxDescriptor_t = POINTER(ReducemaxDescriptor) + +def reduce_max(input, axis, noop_with_empty_axes, keepdims=True): + if axis == None: + if noop_with_empty_axes: + return input + else: + return torch.amax(input, dim=axis, keepdim=keepdims) + return torch.amax(input, dim=axis, keepdim=keepdims) + +def inferShape(x_shape, axis, noop_with_empty_axes, keepdims=False): + if axis == None: + if noop_with_empty_axes: + return x_shape + else: + if keepdims: + return tuple([1] * len(x_shape)) + else: + return tuple([]) + assert len(axis) <= len(x_shape), "axis out of range" + output_shape = [] + axis = [a if a >= 0 else a + len(x_shape) for a in axis] # 更新 axis 列表中的值 + for a in axis: + assert 0 <= a <= len(x_shape) - 1, "axis out of range" + for i, s in enumerate(x_shape): + if i in 
axis and keepdims: + output_shape.append(1) + elif i in axis and not keepdims: + continue + else: + output_shape.append(s) + + print(f"output_shape = {output_shape}") + return tuple(output_shape) + +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + +def test( + lib, + handle, + torch_device, + x_shape, + axes, + dynamic_axes, + noop_with_empty_axes=False, + keepdims=True, + tensor_dtype=torch.float16 +): + print( + f"Testing reducemax on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}" + ) + x = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device) + print(f"y_shape = {inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims)}") + y = torch.full(inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims), float('-inf'), dtype=tensor_dtype, device=torch_device) + print(f"y_shape = {y.shape}") + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduce_max(x, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduce_max(x, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :10f}") + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReducemaxDescriptor_t() + axe = tuple_to_void_p(axes) if axes != None else None + lenth_axes = c_uint64(len(axes)) if axes != None else c_uint64(0) + dynamic_axes_parm = tuple_to_void_p(dynamic_axes) if dynamic_axes != None else None + lenth_dynamic_axes = c_uint64(len(dynamic_axes)) if dynamic_axes != None else c_uint64(0) + check_error( + lib.infiniopCreateReducemaxDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + axe, + lenth_axes, + c_bool(keepdims), + c_bool(noop_with_empty_axes), + ) + ) + print(f"op desctiptor created") + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopReducemax( + descriptor, + y_tensor.data, + x_tensor.data, + dynamic_axes_parm, + lenth_dynamic_axes, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReducemax( + descriptor, + y_tensor.data, + x_tensor.data, + dynamic_axes_parm, + lenth_dynamic_axes, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"lib time: {elapsed :10f}") + print(f"custom op output:{y}") + print(f"pytorch output:{ans}") + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + + check_error(lib.infiniopDestroyReducemaxDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes in test_cases: + print(dynamic_axes) + test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=torch.float16) + print("\n") + #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # dynamic calc test eg + ((2, 3, 4, 5), [0, 2], False, True, None), + ((2, 3, 4, 5), [0, 2], False, True, None), + #(input_shape, axis, noop_with_empty_axes, keepdims, 
dynamic_axes) + ((2, 10, 24, 10), [0, 2], False, True, None), + # stride = + ((2, 10, 24, 10), [0, 1], False, True, None), + ((2, 10, 24, 10), [2, 3], False , True, None), + ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None), + # validate attribute noop_with_empty_axes and keepdims + ((2, 10, 24, 10), None, True, True, None), + ((2, 10, 24, 10), None, True, False, None), + ((2, 10, 24, 10), None, False, True, None), + ((2, 10, 24, 10), None, False, False, None), + ((2, 3, 4), [0, 1], False, False, None), + #((2, 10, 24, 10), [], True), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateReducemaxDescriptor.restype = c_int32 + lib.infiniopCreateReducemaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReducemaxDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_uint64, + c_bool, + c_bool + ] + lib.infiniopReducemax.restype = c_int32 + lib.infiniopReducemax.argtypes = [ + infiniopReducemaxDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_uint64, + c_void_p, + ] + lib.infiniopDestroyReducemaxDescriptor.restype = c_int32 + lib.infiniopDestroyReducemaxDescriptor.argtypes = [infiniopReducemaxDescriptor_t] + test_cpu(lib, test_cases) + print("All tests passed!") \ No newline at end of file diff --git a/operatorspy/tests/reducemean.py b/operatorspy/tests/reducemean.py new file mode 100644 index 00000000..eaaf79d0 --- /dev/null +++ b/operatorspy/tests/reducemean.py @@ -0,0 +1,204 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +from typing import Tuple +import numpy as np + +PROFILE = False +NUM_PRERUN = 1 +NUM_ITERATIONS = 1 + +class ReducemeanDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopReducemeanDescriptor_t = POINTER(ReducemeanDescriptor) + +def reduce_mean(input, axis, noop_with_empty_axes, keepdims=True): + if axis == None: + if noop_with_empty_axes: + return input + else: + return torch.mean(input, dim=axis, keepdim=keepdims) + return torch.mean(input, dim=axis, keepdim=keepdims) + +def inferShape(x_shape, axis, noop_with_empty_axes, keepdims=False): + if axis == None: + if noop_with_empty_axes: + return x_shape + else: + if keepdims: + return tuple([1] * len(x_shape)) + else: + return tuple([]) + assert len(axis) <= len(x_shape), "axis out of range" + output_shape = [] + axis = [a if a >= 0 else a + len(x_shape) for a in axis] # 更新 axis 列表中的值 + for a in axis: + assert 0 <= a <= len(x_shape) - 1, "axis out of range" + for i, s in enumerate(x_shape): + if i in axis and keepdims: + output_shape.append(1) + elif i in axis and not keepdims: + continue + else: + output_shape.append(s) + + print(f"output_shape = {output_shape}") + return tuple(output_shape) + +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + +def test( + lib, + handle, + torch_device, + x_shape, + axes, + dynamic_axes, + noop_with_empty_axes=False, + keepdims=True, + tensor_dtype=torch.float16 +): + print( + f"Testing reducemean on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}" + ) + x = torch.randn(x_shape, 
dtype=tensor_dtype, device=torch_device) + print(f"y_shape = {inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims)}") + y = torch.full(inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims), float('-inf'), dtype=tensor_dtype, device=torch_device) + print(f"y_shape = {y.shape}") + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduce_mean(x, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduce_mean(x, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :10f}") + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReducemeanDescriptor_t() + axe = tuple_to_void_p(axes) if axes != None else None + lenth_axes = c_uint64(len(axes)) if axes != None else c_uint64(0) + dynamic_axes_parm = tuple_to_void_p(dynamic_axes) if dynamic_axes != None else None + lenth_dynamic_axes = c_uint64(len(dynamic_axes)) if dynamic_axes != None else c_uint64(0) + check_error( + lib.infiniopCreateReducemeanDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + axe, + lenth_axes, + c_bool(keepdims), + c_bool(noop_with_empty_axes), + ) + ) + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopReducemean( + descriptor, + y_tensor.data, + x_tensor.data, + dynamic_axes_parm, + lenth_dynamic_axes, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReducemean( + descriptor, + y_tensor.data, + x_tensor.data, + dynamic_axes_parm, + lenth_dynamic_axes, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"lib time: {elapsed :10f}") + print(f"custom op output:{y}") + print(f"pytorch output:{ans}") + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + + check_error(lib.infiniopDestroyReducemeanDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes in test_cases: + test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=torch.float16) + print("\n") + #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # dynamic calc test eg + ((2, 3, 4, 5), [0, 2], False, True, None), + ((2, 3, 4, 5), [0, 2], False, True, None), + #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes) + ((2, 10, 24, 10), [0, 2], False, True, None), + # stride = + ((2, 10, 24, 10), [0, 1], False, True, None), + ((2, 10, 24, 10), [2, 3], False , True, None), + ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None), + # validate attribute noop_with_empty_axes and keepdims + ((2, 10, 24, 10), None, True, True, None), + ((2, 10, 24, 10), None, True, False, None), + ((2, 10, 24, 10), None, False, True, None), + ((2, 10, 24, 10), None, False, False, None), + ((2, 3, 4), [0, 1], False, False, None), + #((2, 10, 24, 10), [], True), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateReducemeanDescriptor.restype = c_int32 + lib.infiniopCreateReducemeanDescriptor.argtypes = [ + 
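+        # argument order matches infiniopCreateReducemeanDescriptor in include/ops/reducemean/reducemean.h:
+        # (handle, desc_ptr, y, x, axes, n, keepdims, noop_with_empty_axes)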
infiniopHandle_t, + POINTER(infiniopReducemeanDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_uint64, + c_bool, + c_bool + ] + lib.infiniopReducemean.restype = c_int32 + lib.infiniopReducemean.argtypes = [ + infiniopReducemeanDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_uint64, + c_void_p, + ] + lib.infiniopDestroyReducemeanDescriptor.restype = c_int32 + lib.infiniopDestroyReducemeanDescriptor.argtypes = [infiniopReducemeanDescriptor_t] + test_cpu(lib, test_cases) + print("All tests passed!") \ No newline at end of file diff --git a/operatorspy/tests/reducemin.py b/operatorspy/tests/reducemin.py new file mode 100644 index 00000000..f93136ca --- /dev/null +++ b/operatorspy/tests/reducemin.py @@ -0,0 +1,205 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +from typing import Tuple +import numpy as np + +PROFILE = False +NUM_PRERUN = 1 +NUM_ITERATIONS = 1 + +class ReduceminDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopReduceminDescriptor_t = POINTER(ReduceminDescriptor) + +def reduce_min(input, axis, noop_with_empty_axes, keepdims=True): + if axis == None: + if noop_with_empty_axes: + return input + else: + return torch.amin(input, dim=axis, keepdim=keepdims) + return torch.amin(input, dim=axis, keepdim=keepdims) + +def inferShape(x_shape, axis, noop_with_empty_axes, keepdims=False): + if axis == None: + if noop_with_empty_axes: + return x_shape + else: + if keepdims: + return tuple([1] * len(x_shape)) + else: + return tuple([]) + assert len(axis) <= len(x_shape), "axis out of range" + output_shape = [] + axis = [a if a >= 0 else a + len(x_shape) for a in axis] # 更新 axis 列表中的值 + for a in axis: + assert 0 <= a <= len(x_shape) - 1, "axis out of range" + for i, s in enumerate(x_shape): + if i in axis and keepdims: + output_shape.append(1) + elif i in axis and not keepdims: + continue + else: + output_shape.append(s) + + print(f"output_shape = {output_shape}") + return tuple(output_shape) + +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + +def test( + lib, + handle, + torch_device, + x_shape, + axes, + dynamic_axes, + noop_with_empty_axes=False, + keepdims=True, + tensor_dtype=torch.float16 +): + print( + f"Testing reducemin on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}" + ) + x = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device) + print(f"y_shape = {inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims)}") + y = torch.full(inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims), float('inf'), dtype=tensor_dtype, device=torch_device) + print(f"y_shape = {y.shape}") + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduce_min(x, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduce_min(x, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims) + elapsed 
= (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :10f}") + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReduceminDescriptor_t() + axe = tuple_to_void_p(axes) if axes != None else None + lenth_axes = c_uint64(len(axes)) if axes != None else c_uint64(0) + dynamic_axes_parm = tuple_to_void_p(dynamic_axes) if dynamic_axes != None else None + lenth_dynamic_axes = c_uint64(len(dynamic_axes)) if dynamic_axes != None else c_uint64(0) + check_error( + lib.infiniopCreateReduceminDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + axe, + lenth_axes, + c_bool(keepdims), + c_bool(noop_with_empty_axes), + ) + ) + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopReducemin( + descriptor, + y_tensor.data, + x_tensor.data, + dynamic_axes_parm, + lenth_dynamic_axes, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReducemin( + descriptor, + y_tensor.data, + x_tensor.data, + dynamic_axes_parm, + lenth_dynamic_axes, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"lib time: {elapsed :10f}") + print(f"custom op output:{y}") + print(f"pytorch output:{ans}") + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + + check_error(lib.infiniopDestroyReducemaxDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes in test_cases: + print(dynamic_axes) + test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=torch.float16) + print("\n") + #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # dynamic calc test eg + ((2, 3, 4, 5), [0, 2], False, True, None), + ((2, 3, 4, 5), [0, 2], False, True, None), + #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes) + ((2, 10, 24, 10), [0, 2], False, True, None), + # stride = + ((2, 10, 24, 10), [0, 1], False, True, None), + ((2, 10, 24, 10), [2, 3], False , True, None), + ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None), + # validate attribute noop_with_empty_axes and keepdims + ((2, 10, 24, 10), None, True, True, None), + ((2, 10, 24, 10), None, True, False, None), + ((2, 10, 24, 10), None, False, True, None), + ((2, 10, 24, 10), None, False, False, None), + ((2, 3, 4), [0, 1], False, False, None), + #((2, 10, 24, 10), [], True), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceminDescriptor.restype = c_int32 + lib.infiniopCreateReduceminDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceminDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_uint64, + c_bool, + c_bool + ] + lib.infiniopReducemin.restype = c_int32 + lib.infiniopReducemin.argtypes = [ + infiniopReduceminDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_uint64, + c_void_p, + ] + lib.infiniopDestroyReduceminDescriptor.restype = c_int32 + lib.infiniopDestroyReduceminDescriptor.argtypes = [infiniopReduceminDescriptor_t] + test_cpu(lib, test_cases) + print("All tests passed!") \ No newline at end of file diff --git a/operatorspy/tests/where.py b/operatorspy/tests/where.py new file mode 100644 index 00000000..7caf84e9 --- 
/dev/null +++ b/operatorspy/tests/where.py @@ -0,0 +1,166 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +from typing import Tuple +import numpy as np +import onnx + +PROFILE = True +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class WhereDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopWhereDescriptor_t = POINTER(WhereDescriptor) + +def where(condition, x, y): + return torch.where(condition, x, y) + + +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + +def inferShape(x_shape, y_shape): + ndim_x = len(x_shape) + ndim_y = len(y_shape) + ndim = 0 + output_shape = [] + ndim = max(ndim_x, ndim_y) + for i in range(ndim - 1, -1, -1): + dim_x = x_shape[i] if i < ndim_x else 1 + dim_y = y_shape[i] if i < ndim_y else 1 + output_shape.append(max(dim_x, dim_y)) + output_shape.reverse() + return tuple(output_shape) + +def test( + lib, + handle, + torch_device, + condition_shape, + src1_shape, + src2_shape, + tensor_dtype=torch.float16 +): + print( + f"Testing where on {torch_device} with condition_shape:{condition_shape} dtype:{tensor_dtype}" + ) + condition = torch.randint(0, 2, condition_shape, dtype=torch.uint8).to(torch_device) + src1 = torch.randn(src1_shape, dtype=tensor_dtype, device=torch_device) + src2 = torch.randn(src2_shape, dtype=tensor_dtype, device=torch_device) + output = torch.randn(inferShape(src1_shape, src2_shape), dtype=tensor_dtype, device=torch_device) + + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = where(condition, src1, src2) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = where(condition, src1, src2) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :10f}") + src1_tensor = to_tensor(src1, lib) + src2_tensor = to_tensor(src2, lib) + output_tensor = to_tensor(output, lib) + condition_tensor = to_tensor(condition, lib) + descriptor = infiniopWhereDescriptor_t() + check_error( + lib.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + src1_tensor.descriptor, + src2_tensor.descriptor, + condition_tensor.descriptor, + ) + ) + src1_tensor.descriptor.contents.invalidate() + src2_tensor.descriptor.contents.invalidate() + output_tensor.descriptor.contents.invalidate() + condition_tensor.descriptor.contents.invalidate() + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopWhere( + descriptor, + output_tensor.data, + src1_tensor.data, + src2_tensor.data, + condition_tensor.data, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopWhere( + descriptor, + output_tensor.data, + src1_tensor.data, + src2_tensor.data, + condition_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"lib time: {elapsed :10f}") + assert torch.allclose(output, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyWhereDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = 
create_handle(lib, device)
+    for condition_shape, src1_shape, src2_shape in test_cases:
+        test(lib, handle, "cpu", condition_shape, src1_shape, src2_shape, tensor_dtype=torch.float16)
+        print("\n")
+    destroy_handle(lib, handle)
+
+
+if __name__ == "__main__":
+    test_cases = [
+        ((2, 3, 4, 5), (2, 3, 4, 5), (2, 3, 4, 5)),
+        ((3, 1), (3, 4), (1, 4)),
+        ((1,), (3, 4), (3, 4)),
+        ((2, 1, 3), (1, 4, 3), (2, 4, 1)),
+    ]
+    args = get_args()
+    lib = open_lib()
+    lib.infiniopCreateWhereDescriptor.restype = c_int32
+    lib.infiniopCreateWhereDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopWhereDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopWhere.restype = c_int32
+    lib.infiniopWhere.argtypes = [
+        infiniopWhereDescriptor_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p
+    ]
+    lib.infiniopDestroyWhereDescriptor.restype = c_int32
+    lib.infiniopDestroyWhereDescriptor.argtypes = [infiniopWhereDescriptor_t]
+    test_cpu(lib, test_cases)
+    print("All tests passed!")
\ No newline at end of file
diff --git a/src/ops/clip/cpu/clip_cpu.cc b/src/ops/clip/cpu/clip_cpu.cc
new file mode 100644
index 00000000..1cefe0ab
--- /dev/null
+++ b/src/ops/clip/cpu/clip_cpu.cc
@@ -0,0 +1,79 @@
+#include "clip_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include "../../utils.h"
+
+infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t handle,
+                                         ClipCpuDescriptor_t *desc_ptr,
+                                         infiniopTensorDescriptor_t x,
+                                         infiniopTensorDescriptor_t y) {
+    if (x->dt != F16 && x->dt != F32) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (x->ndim != y->ndim) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    if (x->dt != y->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (!is_contiguous(x) || !is_contiguous(y)) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+    uint64_t element_num = 1;
+    for (uint64_t i = 0; i < x->ndim; i++) {
+        element_num *= x->shape[i];
+    }
+    *desc_ptr = new ClipCpuDescriptor{
+        DevCpu,
+        x->dt,
+        element_num
+    };
+    return STATUS_SUCCESS;
+}
+
+template<typename Tdata>
+infiniopStatus_t clip_cpu(ClipCpuDescriptor_t desc,
+                          void const *x,
+                          void *min,
+                          void *max,
+                          void *y) {
+    auto x_ = reinterpret_cast<Tdata const *>(x);
+    auto y_ = reinterpret_cast<Tdata *>(y);
+    float min_val;
+    float max_val;
+    auto min_ = reinterpret_cast<float const *>(min);
+    min_val = *min_;
+    auto max_ = reinterpret_cast<float const *>(max);
+    max_val = *max_;
+
+    for (uint64_t i = 0; i < desc->element_num; i++) {
+        if constexpr (std::is_same<Tdata, uint16_t>::value) {
+            y_[i] = f32_to_f16(std::min(std::max(f16_to_f32(x_[i]), min_val), max_val));
+        } else {
+            y_[i] = std::min(std::max(x_[i], min_val), max_val);
+        }
+    }
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc,
+                         void const *x,
+                         void *min,
+                         void *max,
+                         void *y,
+                         void *stream) {
+    if (desc->dtype == F16) {
+        return clip_cpu<uint16_t>(desc, x, min, max, y);
+    }
+    if (desc->dtype == F32) {
+        return clip_cpu<float>(desc, x, min, max, y);
+    }
+    return STATUS_BAD_TENSOR_DTYPE;
+}
+
+infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc) {
+    delete desc;
+    return STATUS_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/ops/clip/cpu/clip_cpu.h b/src/ops/clip/cpu/clip_cpu.h
new file mode 100644
index 00000000..992a2444
--- /dev/null
+++ b/src/ops/clip/cpu/clip_cpu.h
@@ -0,0 +1,28 @@
+#ifndef __CPU_CLIP_H__
+#define __CPU_CLIP_H__
+
+#include "operators.h"
+struct ClipCpuDescriptor {
+    Device device;
+    DT dtype;
+    uint64_t element_num;
+};
+
+typedef struct ClipCpuDescriptor *ClipCpuDescriptor_t;
+
+infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t handle,
+                                         ClipCpuDescriptor_t *desc_ptr,
+                                         infiniopTensorDescriptor_t x,
+                                         infiniopTensorDescriptor_t y);
+
+infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc,
+                         void const *x,
+                         void *min,
+                         void *max,
+                         void *y,
+                         void *stream);
+
+infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc);
+
+#endif
diff --git a/src/ops/clip/operator.cc b/src/ops/clip/operator.cc
new file mode 100644
index 00000000..c655dab7
--- /dev/null
+++ b/src/ops/clip/operator.cc
@@ -0,0 +1,44 @@
+#include "../utils.h"
+#include "operators.h"
+#include "ops/clip/clip.h"
+
+#ifdef ENABLE_CPU
+#include "cpu/clip_cpu.h"
+#endif
+
+
+__C infiniopStatus_t infiniopCreateClipDescriptor(
+    infiniopHandle_t handle,
+    infiniopClipDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t x,
+    infiniopTensorDescriptor_t y) {
+    switch (handle->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuCreateClipDescriptor(handle, (ClipCpuDescriptor_t *) desc_ptr, x, y);
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
+
+
+__C infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, void *min, void *max, void *y, void *stream) {
+    switch (desc->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuClip((ClipCpuDescriptor_t) desc, x, min, max, y, stream);
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
+
+__C infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
+    switch (desc->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuDestroyClipDescriptor((ClipCpuDescriptor_t) desc);
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
\ No newline at end of file
diff --git a/src/ops/gather/cpu/gather_cpu.cc b/src/ops/gather/cpu/gather_cpu.cc
new file mode 100644
index 00000000..50744168
--- /dev/null
+++ b/src/ops/gather/cpu/gather_cpu.cc
@@ -0,0 +1,110 @@
+#include "gather_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include "../../utils.h"
+
+infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t handle,
+                                           GatherCpuDescriptor_t *desc_ptr,
+                                           infiniopTensorDescriptor_t y,
+                                           infiniopTensorDescriptor_t x,
+                                           infiniopTensorDescriptor_t indices,
+                                           int64_t axis) {
+    if (y->dt != x->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (y->dt != F16 && y->dt != F32) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (!is_contiguous(y) || !is_contiguous(x)) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+    if (axis < 0 || axis >= x->ndim) {
+        return STATUS_BAD_PARAM;
+    }
+    uint64_t *dst_shape = new uint64_t[y->ndim];
+    uint64_t *src_shape = new uint64_t[x->ndim];
+    uint64_t *indices_shape = new uint64_t[indices->ndim];
+
+    memcpy(dst_shape, y->shape, y->ndim * sizeof(uint64_t));
+    memcpy(indices_shape, indices->shape, indices->ndim * sizeof(uint64_t));
+    memcpy(src_shape, x->shape, x->ndim * sizeof(uint64_t));
+
+    *desc_ptr = new GatherCpuDescriptor{
+        DevCpu,
+        y->dt,
+        dst_shape,
+        src_shape,
+        indices_shape,
+        indices->ndim,
+        x->ndim,
+        y->ndim,
+        axis
+    };
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc) {
+    delete[] desc->dst_shape;
+    delete[] desc->src_shape;
+    delete[] desc->indices_shape;
+    delete desc;
+    return STATUS_SUCCESS;
+}
+
+template<typename Tdata>
+infiniopStatus_t gather_cpu(GatherCpuDescriptor_t desc,
+                            void const *x,
+                            void *indices,
+                            void *y) {
+    auto *src_data = reinterpret_cast<Tdata const *>(x);
+    auto *indices_data = reinterpret_cast<int64_t const *>(indices);
+    auto *dst_data = reinterpret_cast<Tdata *>(y);
+    uint64_t indices_element_count = 1;
+    for (uint64_t i = 0; i < desc->indices_ndim; ++i) {
+        indices_element_count *= desc->indices_shape[i];
+    }
+
+    const uint64_t axis = desc->axis;
+    uint64_t src_outer_dim = 1;
+    for (uint64_t i = 0; i < axis; ++i) {
+        src_outer_dim *= desc->src_shape[i];
+    }
+
+    uint64_t src_inner_dim = 1;
+    for (uint64_t i = axis + 1; i < desc->src_ndim; ++i) {
+        src_inner_dim *= desc->src_shape[i];
+    }
+    for (uint64_t outer = 0; outer < src_outer_dim; ++outer) {
+        for (uint64_t idx = 0; idx < indices_element_count; ++idx) {
+            const int64_t index_val = indices_data[idx];
+            const uint64_t src_offset =
+                outer * desc->src_shape[axis] * src_inner_dim +
+                index_val * src_inner_dim;
+            const uint64_t dst_offset =
+                outer * indices_element_count * src_inner_dim +
+                idx * src_inner_dim;
+            memcpy(
+                dst_data + dst_offset,
+                src_data + src_offset,
+                sizeof(Tdata) * src_inner_dim
+            );
+        }
+    }
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc,
+                           void const *x,
+                           void *indices,
+                           void *y,
+                           void *stream) {
+    if (desc->dtype == F16) {
+        return gather_cpu<uint16_t>(desc, x, indices, y);
+    }
+    if (desc->dtype == F32) {
+        return gather_cpu<float>(desc, x, indices, y);
+    }
+    return STATUS_SUCCESS;
+}
+
diff --git a/src/ops/gather/cpu/gather_cpu.h b/src/ops/gather/cpu/gather_cpu.h
new file mode 100644
index 00000000..61a9c313
--- /dev/null
+++ b/src/ops/gather/cpu/gather_cpu.h
@@ -0,0 +1,35 @@
+#ifndef __CPU_GATHER_H__
+#define __CPU_GATHER_H__
+
+#include "operators.h"
+struct GatherCpuDescriptor {
+    Device device;
+    DT dtype;
+    uint64_t const *dst_shape;
+    uint64_t const *src_shape;
+    uint64_t const *indices_shape;
+    uint64_t indices_ndim;
+    uint64_t src_ndim;
+    uint64_t dst_ndim;
+    int64_t axis;
+};
+
+typedef struct GatherCpuDescriptor *GatherCpuDescriptor_t;
+
+infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t handle,
+                                           GatherCpuDescriptor_t *desc_ptr,
+                                           infiniopTensorDescriptor_t y,
+                                           infiniopTensorDescriptor_t x,
+                                           infiniopTensorDescriptor_t indices,
+                                           int64_t axis);
+
+infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc,
+                           void const *data,
+                           void *indices,
+                           void *dst,
+                           void *stream);
+
+infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/src/ops/gather/operator.cc b/src/ops/gather/operator.cc
new file mode 100644
index 00000000..f2dcbc08
--- /dev/null
+++ b/src/ops/gather/operator.cc
@@ -0,0 +1,43 @@
+#include "../utils.h"
+#include "operators.h"
+#include "ops/gather/gather.h"
+
+#ifdef ENABLE_CPU
+#include "cpu/gather_cpu.h"
+#endif
+
+
+__C infiniopStatus_t infiniopCreateGatherDescriptor(
+    infiniopHandle_t handle,
+    infiniopGatherDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y,
+    infiniopTensorDescriptor_t x,
+    infiniopTensorDescriptor_t indices,
+    int64_t axis) {
+    switch (handle->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuCreateGatherDescriptor(handle, (GatherCpuDescriptor_t *) desc_ptr, y, x, indices, axis);
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
+__C infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void const *x, void *indices, void *y, void *stream) {
+    switch (desc->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuGather((GatherCpuDescriptor_t) desc, x, indices, y, stream);
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
+
+__C infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc) {
+    switch (desc->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuDestroyGatherDescriptor((GatherCpuDescriptor_t) desc);
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
\ No newline at end of file
diff --git a/src/ops/reduce/cpu/reduce_cpu.cc b/src/ops/reduce/cpu/reduce_cpu.cc
new file mode 100644
index 00000000..2040a84b
--- /dev/null
+++ 
b/src/ops/reduce/cpu/reduce_cpu.cc @@ -0,0 +1,305 @@ +#include "reduce_cpu.h" +#include "../../utils.h" +#include +infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, + ReduceCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t const *axes, + uint64_t n, + int reduce_type, + bool noop_with_empty_axes, + bool keepdims) { + uint64_t ndim = y->ndim; + uint64_t x_ndim = x->ndim; + if (reduce_type > 2){ + return STATUS_BAD_PARAM; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + uint64_t *x_shape = new uint64_t[x_ndim]; + uint64_t *y_shape = new uint64_t[ndim]; + int64_t *x_strides = new int64_t[x_ndim]; + int64_t *y_strides = new int64_t[ndim]; + uint64_t y_size = 1; + for (uint64_t i = 0; i < ndim; i++) { + y_size *= y->shape[i]; + } + memcpy(y_shape, y->shape, ndim * sizeof(uint64_t)); + memcpy(y_strides, y->strides, ndim * sizeof(int64_t)); + memcpy(x_strides, x->strides, x_ndim * sizeof(int64_t)); + memcpy(x_shape, x->shape, x_ndim * sizeof(uint64_t)); + + if (axes != nullptr && n > 0) { + bool is_axes_static = true; + for (uint64_t i = 0; i < n; i++) { + if (axes[i] >= x->ndim) { + return STATUS_BAD_PARAM; // 轴索引越界 + } + } + std::unordered_set axes_set; + for (uint64_t i = 0; i < n; i++) { + if (axes_set.find(axes[i]) != axes_set.end()) { + return STATUS_BAD_PARAM; // 轴重复 + } + axes_set.insert(axes[i]); + } + std::vector axes_vec(axes_set.begin(), axes_set.end()); + std::sort(axes_vec.begin(), axes_vec.end()); + int64_t* unique_axes = new int64_t[axes_vec.size()]; + std::copy(axes_vec.begin(), axes_vec.end(), unique_axes); + uint64_t *reduce_axes_stride = new uint64_t[axes_vec.size()]; + for (int i = axes_vec.size() - 1; i >= 0; i--){ + reduce_axes_stride[i] = 1; + } + for (int i = axes_vec.size() - 2; i >= 0; i--){ + reduce_axes_stride[i] = reduce_axes_stride[i + 1] * x->shape[axes_vec[i + 1]]; + } + int reduce_element_num = 1; + for (auto axis: axes_vec) { + reduce_element_num *= x->shape[axis]; + } + *desc_ptr = new ReduceCpuDescriptor{ + DevCpu, + y->dt, + ndim, + x_ndim, + x_shape, + unique_axes, + x_strides, + reduce_axes_stride, + y_strides, + y_size, + reduce_type, + reduce_element_num, + n, + is_axes_static, + noop_with_empty_axes, + true, + keepdims + }; + } + else { + *desc_ptr = new ReduceCpuDescriptor{ + DevCpu, + y->dt, + ndim, + x_ndim, + x_shape, + nullptr, + x_strides, + nullptr, + y_strides, + y_size, + reduce_type, + 1, + 0, + false, + noop_with_empty_axes, + false, + keepdims + }; + } + return STATUS_SUCCESS; +} + + +infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc) { + delete[] desc->x_shape; + delete[] desc->axes; + delete[] desc->x_strides; + delete[] desc->reduce_axes_stride; + delete desc; + return STATUS_SUCCESS; +} +template +infiniopStatus_t reduce_cpu(ReduceCpuDescriptor_t desc, + void *y, + void const *x){ + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto input_strides = desc->x_strides; + auto reduce_axes_stride = desc->reduce_axes_stride; + auto num_axes = desc->axes_num; + auto axes = desc->axes; + auto ndim = desc->ndim; + auto x_shape = desc->x_shape; + auto y_strides = desc->y_strides; + int i, j; + int indices[desc->x_ndim]; + //#pragma omp parallel for + std::vector non_reduce_axes; + for (int j = 0; j < desc->x_ndim; ++j) { + if (!std::binary_search(axes, axes + 
num_axes, j)) { + non_reduce_axes.push_back(j); + } + } + for (i = 0; i < desc->y_size; i++) { + float sum_value = 0; + std::vector indices(desc->x_ndim, 0); + int global_index = i; + if (!non_reduce_axes.empty()) { + if constexpr (keepdims){ + for (int j = 0; j < non_reduce_axes.size(); j++) { + indices[non_reduce_axes[j]] = global_index / y_strides[non_reduce_axes[j]]; + global_index %= y_strides[non_reduce_axes[j]]; + } + }else{ + for (int j = 0; j < non_reduce_axes.size(); j++) { + indices[non_reduce_axes[j]] = global_index / y_strides[j]; + global_index %= y_strides[j]; + } + } + } + int64_t base_offset = 0; + for (int j = 0; j < desc->x_ndim; ++j) { + base_offset += indices[j] * desc->x_strides[j]; + } + for (j = 0; j < desc->reduce_element_num; j++){ + int64_t offset = base_offset; + uint64_t remaining = j; + int k; + for (k = 0; k < num_axes; k++){ + const int axis = axes[k]; + const uint64_t coord = remaining / reduce_axes_stride[k]; + offset += coord * input_strides[axis]; + remaining %= reduce_axes_stride[k]; + } + switch(desc->reduce_mode){ + case 0: + if constexpr (std::is_same::value){ + sum_value += f16_to_f32(x_[offset]); + } + else{ + sum_value += x_[offset]; + } + break; + case 1: + if constexpr (std::is_same::value){ + y_[i] = f32_to_f16(std::fmax(f16_to_f32(y_[i]), f16_to_f32(x_[offset]))); + } + else{ + y_[i] = std::max(y_[i], x_[offset]); + } + break; + case 2: + if constexpr (std::is_same::value){ + y_[i] = f32_to_f16(std::fmin(f16_to_f32(y_[i]), f16_to_f32(x_[offset]))); + } + else{ + y_[i] = std::min(y_[i], x_[offset]); + } + break; + } + } + if (desc->reduce_mode == 0){ + sum_value /= desc->reduce_element_num; + if constexpr (std::is_same::value){ + y_[i] = f32_to_f16(sum_value); + } + else{ + y_[i] = sum_value; + } + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, + void *y, + void const *x, + void const *dynamic_axes, + uint64_t dynamic_axes_size, + void *stream){ + if (desc->is_axes_static == true && dynamic_axes_size > 0){ + return STATUS_BAD_PARAM; + } + if (desc->is_axes_static == false && dynamic_axes_size == 0){ + if (desc->noop_with_empty_axes){ + if (desc->dt == F16){ + memcpy(y, x, desc->y_size * sizeof(uint16_t)); + return STATUS_SUCCESS; + } + else if (desc->dt == F32){ + memcpy(y, x, desc->y_size * sizeof(float)); + return STATUS_SUCCESS; + } + else return STATUS_BAD_TENSOR_DTYPE; + } + delete[] desc->axes; + delete[] desc->reduce_axes_stride; + desc->axes = new int64_t[desc->x_ndim]; + desc->reduce_axes_stride = new uint64_t[desc->x_ndim]; + std::vector full_axes(desc->x_ndim); + std::iota(full_axes.begin(), full_axes.end(), 0); + std::copy(full_axes.begin(), full_axes.end(), desc->axes); + for (int i = desc->x_ndim - 1; i >= 0; i--){ + desc->reduce_axes_stride[i] = 1; + } + for (int i = desc->x_ndim - 2; i >= 0; i--){ + desc->reduce_axes_stride[i] = desc->reduce_axes_stride[i + 1] * desc->x_shape[full_axes[i + 1]]; + } + + desc->axes_num = desc->x_ndim; + desc->reduce_element_num = 1; + for (int i = 0; i < desc->x_ndim; i++){ + desc->reduce_element_num *= desc->x_shape[i]; + } + desc->owns_axes_memory = true; + } + if (desc->is_axes_static == false && dynamic_axes_size > 0){ + auto dynamic_axes_data = reinterpret_cast(dynamic_axes); + + for (uint64_t i = 0; i < dynamic_axes_size; i++){ + if (dynamic_axes_data[i] >= desc->x_ndim){ + return STATUS_BAD_PARAM; + } + } + std::unordered_set axes_set; + for (uint64_t i = 0; i < dynamic_axes_size; i++){ + if (axes_set.find(dynamic_axes_data[i]) != 
axes_set.end()){ + return STATUS_BAD_PARAM; + } + axes_set.insert(dynamic_axes_data[i]); + } + std::vector axes_vec(axes_set.begin(), axes_set.end()); + std::sort(axes_vec.begin(), axes_vec.end()); + + delete[] desc->axes; + delete[] desc->reduce_axes_stride; + desc->axes = new int64_t[axes_vec.size()]; + desc->reduce_axes_stride = new uint64_t[axes_vec.size()]; + desc->axes_num = axes_vec.size(); + std::copy(axes_vec.begin(), axes_vec.end(), desc->axes); + for (int i = axes_vec.size() - 1; i >= 0; i--){ + desc->reduce_axes_stride[i] = 1; + } + for (int i = axes_vec.size() - 2; i >= 0; i--){ + desc->reduce_axes_stride[i] = desc->reduce_axes_stride[i + 1] * desc->x_shape[axes_vec[i + 1]]; + } + for (auto axis: axes_vec){ + desc->reduce_element_num *= desc->x_shape[axis]; + } + desc->owns_axes_memory = true; + } + if (desc->dt == F16) { + if (desc->keepdims == true) { + return reduce_cpu(desc, y, x); + } + return reduce_cpu(desc, y, x); + } + if (desc->dt == F32) { + if (desc->keepdims == true) { + return reduce_cpu(desc, y, x); + } + return reduce_cpu(desc, y, x); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/reduce/cpu/reduce_cpu.h b/src/ops/reduce/cpu/reduce_cpu.h new file mode 100644 index 00000000..61bd9d74 --- /dev/null +++ b/src/ops/reduce/cpu/reduce_cpu.h @@ -0,0 +1,54 @@ +#ifndef __CPU_REDUCE_H__ +#define __CPU_REDUCE_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "operators.h" +#include +#include +#include +#include +#include + +struct ReduceCpuDescriptor { + Device device; + DataLayout dt; + uint64_t ndim; + uint64_t x_ndim; + uint64_t const *x_shape; + int64_t *axes; + int64_t const *x_strides; + uint64_t *reduce_axes_stride; + int64_t const *y_strides; + uint64_t y_size; + int reduce_mode; + int reduce_element_num; + uint64_t axes_num; + bool is_axes_static; + bool noop_with_empty_axes; + bool owns_axes_memory; + bool keepdims; +}; + +typedef struct ReduceCpuDescriptor *ReduceCpuDescriptor_t; + +infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, + ReduceCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t const *axes, + uint64_t n, + int reduce_type, + bool noop_with_empty_axes, + bool keepdims); + + +infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, + void *y, + void const *x, + void const *dynamic_axes, + uint64_t dynamic_axes_size, + void *stream); + +infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc); + +#endif diff --git a/src/ops/reduce/operator.cc b/src/ops/reduce/operator.cc new file mode 100644 index 00000000..eaa713df --- /dev/null +++ b/src/ops/reduce/operator.cc @@ -0,0 +1,47 @@ +#include "../utils.h" +#include "operators.h" +#include "reduce.h" + +#ifdef ENABLE_CPU +#include "cpu/reduce_cpu.h" +#endif + + +__C infiniopStatus_t infiniopCreateReduceDescriptor( + infiniopHandle_t handle, + infiniopReduceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t const *axes, + uint64_t n, + bool keepdims, + bool noop_with_empty_axes, + int reduce_type) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateReduceDescriptor(handle, (ReduceCpuDescriptor_t *) desc_ptr, y, x, axes, n, reduce_type, noop_with_empty_axes, keepdims); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { + switch (desc->device) { 
+#ifdef ENABLE_CPU + case DevCpu: + return cpuReduce((ReduceCpuDescriptor_t) desc, y, x, dynamic_axes, dynamic_axes_size, stream); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc){ + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyReduceDescriptor((ReduceCpuDescriptor_t) desc); +#endif + } + +} \ No newline at end of file diff --git a/src/ops/reduce/reduce.h b/src/ops/reduce/reduce.h new file mode 100644 index 00000000..a667d533 --- /dev/null +++ b/src/ops/reduce/reduce.h @@ -0,0 +1,26 @@ +#ifndef REDUCE_H +#define REDUCE_H + +#include "export.h" +#include "operators.h" +//#include "status.h" + +typedef struct ReduceDescriptor { + Device device; +} ReduceDescriptor; +typedef ReduceDescriptor *infiniopReduceDescriptor_t; + +__C infiniopStatus_t infiniopCreateReduceDescriptor(infiniopHandle_t handle, + infiniopReduceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t const *axes, + uint64_t n, + bool keepdims, + bool noop_with_empty_axes, + int reduce_type); + +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream); + +__C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc); +#endif \ No newline at end of file diff --git a/src/ops/reducemax/operator.cc b/src/ops/reducemax/operator.cc new file mode 100644 index 00000000..921b6682 --- /dev/null +++ b/src/ops/reducemax/operator.cc @@ -0,0 +1,41 @@ +#include "../utils.h" +#include "../reduce/reduce.h" +#include "ops/reducemax/reducemax.h" + +struct _ReducemaxDescriptor { + Device device; + infiniopReduceDescriptor_t reduce_desc; +}; + +typedef struct _ReducemaxDescriptor *_ReducemaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReducemaxDescriptor( + infiniopHandle_t handle, + infiniopReducemaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t const *axes, + uint64_t n, + bool keepdims, + bool noop_with_empty_axes + ) { + infiniopReduceDescriptor_t reduce_desc; + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, y, x, axes, n, keepdims, noop_with_empty_axes, 1), STATUS_SUCCESS); + *(_ReducemaxDescriptor_t *) desc_ptr = new _ReducemaxDescriptor{ + handle->device, + reduce_desc + }; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { + auto _desc = (_ReducemaxDescriptor_t) desc; + CHECK_STATUS(infiniopReduce(_desc->reduce_desc, y, x, dynamic_axes, dynamic_axes_size, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyReducemaxDescriptor(infiniopReducemaxDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReducemaxDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/reducemean/operator.cc b/src/ops/reducemean/operator.cc new file mode 100644 index 00000000..ab322da7 --- /dev/null +++ b/src/ops/reducemean/operator.cc @@ -0,0 +1,41 @@ +#include "../utils.h" +#include "../reduce/reduce.h" +#include "ops/reducemean/reducemean.h" + +struct _ReducemeanDescriptor { + Device device; + infiniopReduceDescriptor_t reduce_desc; +}; + +typedef struct _ReducemeanDescriptor 
*_ReducemeanDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReducemeanDescriptor( + infiniopHandle_t handle, + infiniopReducemeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t const *axes, + uint64_t n, + bool keepdims, + bool noop_with_empty_axes + ) { + infiniopReduceDescriptor_t reduce_desc; + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, y, x, axes, n, keepdims, noop_with_empty_axes, 0), STATUS_SUCCESS); + *(_ReducemeanDescriptor_t *) desc_ptr = new _ReducemeanDescriptor{ + handle->device, + reduce_desc + }; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { + auto _desc = (_ReducemeanDescriptor_t) desc; + CHECK_STATUS(infiniopReduce(_desc->reduce_desc, y, x, dynamic_axes, dynamic_axes_size, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyReducemeanDescriptor(infiniopReducemeanDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReducemeanDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/reducemin/operator.cc b/src/ops/reducemin/operator.cc new file mode 100644 index 00000000..9907d8e5 --- /dev/null +++ b/src/ops/reducemin/operator.cc @@ -0,0 +1,40 @@ +#include "../utils.h" +#include "../reduce/reduce.h" +#include "ops/reducemin/reducemin.h" + +struct _ReduceminDescriptor { + Device device; + infiniopReduceDescriptor_t reduce_desc; +}; + +typedef struct _ReduceminDescriptor *_ReduceminDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceminDescriptor( + infiniopHandle_t handle, + infiniopReduceminDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t const *axes, + uint64_t n, + bool keepdims, + bool noop_with_empty_axes + ) { + infiniopReduceDescriptor_t reduce_desc; + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, y, x, axes, n, keepdims, noop_with_empty_axes,2), STATUS_SUCCESS); + *(_ReduceminDescriptor_t *) desc_ptr = new _ReduceminDescriptor{ + handle->device, + reduce_desc + }; + return STATUS_SUCCESS; +} +__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { + auto _desc = (_ReduceminDescriptor_t) desc; + CHECK_STATUS(infiniopReduce(_desc->reduce_desc, y, x, dynamic_axes, dynamic_axes_size, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyReduceminDescriptor(infiniopReduceminDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceminDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/where/cpu/where_cpu.cc b/src/ops/where/cpu/where_cpu.cc new file mode 100644 index 00000000..11a1a2d4 --- /dev/null +++ b/src/ops/where/cpu/where_cpu.cc @@ -0,0 +1,144 @@ +#include "where_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t handle, + WhereCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src1, + infiniopTensorDescriptor_t src2, + infiniopTensorDescriptor_t condition + ){ + if (dst->dt != F16 && 
dst->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (src1->dt != F16 && src1->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (src2->dt != F16 && src2->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (condition->dt != U8) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(dst) || !is_contiguous(src1) || !is_contiguous(src2) || !is_contiguous(condition)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (dst->dt != src1->dt || dst->dt != src2->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + uint64_t *src1_shape = new uint64_t[dst->ndim]; + uint64_t *src2_shape = new uint64_t[dst->ndim]; + uint64_t *condition_shape = new uint64_t[dst->ndim]; + uint64_t *dst_shape = new uint64_t[dst->ndim]; + int64_t *dst_strides = new int64_t[dst->ndim]; + uint64_t dst_ndim = dst->ndim; + uint64_t element_num = 1; + for (uint64_t i = 0; i < dst->ndim; i++) { + element_num *= dst->shape[i]; + } + memcpy(src1_shape, src1->shape, src1->ndim * sizeof(uint64_t)); + memcpy(src2_shape, src2->shape, src2->ndim * sizeof(uint64_t)); + memcpy(condition_shape, condition->shape, condition->ndim * sizeof(uint8_t)); + //扩展到dst的维度 + for (uint64_t i = 0; i < dst->ndim; i++) { + if (i < src1->ndim){ + src1_shape[i] = src1->shape[i]; + } + else{ + src1_shape[i] = 1; + } + if (i < src2->ndim){ + src2_shape[i] = src2->shape[i]; + } + else{ + src2_shape[i] = 1; + } + if (i < condition->ndim){ + condition_shape[i] = condition->shape[i]; + } + else{ + condition_shape[i] = 1; + } + } + memcpy(dst_shape, dst->shape, dst->ndim * sizeof(uint64_t)); + memcpy(dst_strides, dst->strides, dst->ndim * sizeof(int64_t)); + *desc_ptr = new WhereCpuDescriptor{ + DevCpu, + dst->dt, + src1_shape, + src2_shape, + condition_shape, + dst_shape, + dst_strides, + dst_ndim, + element_num + }; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc){ + delete[] desc->src1_shape; + delete[] desc->src2_shape; + delete[] desc->condition_shape; + delete[] desc->dst_shape; + delete[] desc->dst_strides; + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t where_cpu(WhereCpuDescriptor_t desc, + void *dst, + void *src1, + void *src2, + void *condition, + void *stream){ + auto dst_ = reinterpret_cast(dst); + auto src1_ = reinterpret_cast(src1); + auto src2_ = reinterpret_cast(src2); + auto condition_ = reinterpret_cast(condition); + auto ndim = desc->dst_ndim; + auto dst_strides = desc->dst_strides; + auto condition_shape = desc->condition_shape; + auto src1_shape = desc->src1_shape; + auto src2_shape = desc->src2_shape; + #pragma omp parallel for + for (uint64_t i = 0; i < desc->element_num; i++) { + std::vector indices(ndim, 0); + int global_index = i; + for (int z = 0; z <= ndim - 1; z++){ + indices[z] = global_index / dst_strides[z]; + global_index %= dst_strides[z]; + } + uint64_t condition_index = 0, src1_index = 0, src2_index = 0; + for (int j = 0; j < ndim; j++) { + if (condition_shape[j] != 1){ + condition_index = condition_index * condition_shape[j] + indices[j]; + } + if (src1_shape[j] != 1){ + src1_index = src1_index * src1_shape[j] + indices[j]; + } + if (src2_shape[j] != 1){ + src2_index = src2_index * src2_shape[j] + indices[j]; + } + } + dst_[i] = condition_[condition_index] ? 
src1_[src1_index] : src2_[src2_index]; + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc, + void *dst, + void *src1, + void *src2, + void *condition, + void *stream){ + if (desc->dtype == F16) { + return where_cpu(desc, dst, src1, src2, condition, stream); + } + if (desc->dtype == F32) { + return where_cpu(desc, dst, src1, src2, condition, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/where/cpu/where_cpu.h b/src/ops/where/cpu/where_cpu.h new file mode 100644 index 00000000..9fbc5922 --- /dev/null +++ b/src/ops/where/cpu/where_cpu.h @@ -0,0 +1,36 @@ +#ifndef __CPU_WHERE_H__ +#define __CPU_WHERE_H__ + +#include "operators.h" +struct WhereCpuDescriptor { + Device device; + DT dtype; + uint64_t const *src1_shape; + uint64_t const *src2_shape; + uint64_t const *condition_shape; + uint64_t const *dst_shape; + int64_t const *dst_strides; + uint64_t dst_ndim; + uint64_t element_num; +}; + +typedef struct WhereCpuDescriptor *WhereCpuDescriptor_t; + +infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t handle, + WhereCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src1, + infiniopTensorDescriptor_t src2, + infiniopTensorDescriptor_t condition + ); + +infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc, + void *dst, + void *src1, + void *src2, + void *condition, + void *stream); + +infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc); + +#endif diff --git a/src/ops/where/operator.cc b/src/ops/where/operator.cc new file mode 100644 index 00000000..fe94ffcb --- /dev/null +++ b/src/ops/where/operator.cc @@ -0,0 +1,46 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/where/where.h" + +#ifdef ENABLE_CPU +#include "cpu/where_cpu.h" +#endif + + +__C infiniopStatus_t infiniopCreateWhereDescriptor( + infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src1, + infiniopTensorDescriptor_t src2, + infiniopTensorDescriptor_t condition + ) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateWhereDescriptor(handle, (WhereCpuDescriptor_t *) desc_ptr, dst, src1, src2, condition); +#endif + } + return STATUS_BAD_DEVICE; +} + + +__C infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, void *dst, void *src1, void *src2, void *condition, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuWhere((WhereCpuDescriptor_t) desc, dst, src1, src2, condition, stream); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc){ + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyWhereDescriptor((WhereCpuDescriptor_t) desc); +#endif + } + +} \ No newline at end of file diff --git a/src/tensor/tensor_descriptor.cc b/src/tensor/tensor_descriptor.cc index 57afe92d..529a09fc 100644 --- a/src/tensor/tensor_descriptor.cc +++ b/src/tensor/tensor_descriptor.cc @@ -1,6 +1,7 @@ #include "tensor/tensor_descriptor.h" #include + __C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, uint64_t ndim, uint64_t const *shape_, int64_t const *strides_, DataLayout datatype) { uint64_t *shape = new uint64_t[ndim]; int64_t *strides = new int64_t[ndim]; From d98e6a97d06f591c70870342da69454aaaeb154c Mon Sep 17 00:00:00 2001 From: Graylatzhou <1391087899@qq.com> Date: Tue, 11 Mar 2025 18:05:49 +0800 
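
A note on the where_cpu kernel introduced in the patch above: for each flat output index it recovers per-dimension coordinates from the destination strides, then rebuilds a flat index into each operand while skipping dimensions whose extent is 1, so broadcast inputs are addressed without being materialised. The sketch below shows the same rule for contiguous operands whose shapes are already padded to the output rank; `broadcast_index` and `where_broadcast_sketch` are illustrative names, not part of the library.

```cpp
// Illustrative only: the broadcast addressing rule used by where_cpu, written as a
// standalone helper for contiguous operands (size-1 entries mark broadcast dims).
#include <cstdint>
#include <vector>

int64_t broadcast_index(const std::vector<int64_t> &coord,
                        const std::vector<int64_t> &shape) {
    int64_t idx = 0;
    for (size_t d = 0; d < shape.size(); ++d)
        if (shape[d] != 1) idx = idx * shape[d] + coord[d]; // size-1 dims contribute nothing
    return idx;
}

std::vector<float> where_broadcast_sketch(const std::vector<uint8_t> &cond, const std::vector<int64_t> &cond_shape,
                                          const std::vector<float> &a, const std::vector<int64_t> &a_shape,
                                          const std::vector<float> &b, const std::vector<int64_t> &b_shape,
                                          const std::vector<int64_t> &out_shape) {
    int64_t count = 1;
    for (int64_t e : out_shape) count *= e;
    std::vector<float> out(count);
    const size_t ndim = out_shape.size();
    for (int64_t i = 0; i < count; ++i) {
        // Unflatten the output index (row-major), as the kernel does with dst_strides.
        std::vector<int64_t> coord(ndim);
        int64_t rem = i;
        for (size_t d = 0; d < ndim; ++d) {
            int64_t stride = 1;
            for (size_t m = d + 1; m < ndim; ++m) stride *= out_shape[m];
            coord[d] = rem / stride;
            rem %= stride;
        }
        out[i] = cond[broadcast_index(coord, cond_shape)]
                     ? a[broadcast_index(coord, a_shape)]
                     : b[broadcast_index(coord, b_shape)];
    }
    return out;
}
```
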
Subject: [PATCH 2/4] =?UTF-8?q?=E6=81=A2=E5=A4=8D=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E6=9D=83=E9=99=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/ops/clip/clip.h | 2 +- include/ops/gather/gather.h | 2 +- include/ops/reducemax/reducemax.h | 2 +- include/ops/reducemean/reducemean.h | 2 +- include/ops/reducemin/reducemin.h | 2 +- operatorspy/tests/clip.py | 76 ++-- operatorspy/tests/gather.py | 2 +- operatorspy/tests/reducemax.py | 68 +-- operatorspy/tests/reducemean.py | 66 ++- operatorspy/tests/reducemin.py | 61 ++- operatorspy/tests/where.py | 62 ++- src/ops/clip/cpu/clip_cpu.cc | 39 +- src/ops/clip/cpu/clip_cpu.h | 4 +- src/ops/clip/cuda/clip_cuda.cc | 40 ++ src/ops/clip/cuda/clip_cuda.cu | 139 +++++++ src/ops/clip/cuda/clip_cuda.h | 33 ++ src/ops/clip/operator.cc | 21 +- src/ops/gather/cpu/gather_cpu.cc | 25 +- src/ops/gather/cpu/gather_cpu.h | 3 +- src/ops/gather/cuda/gather_cuda.cc | 52 +++ src/ops/gather/cuda/gather_cuda.cu | 4 + src/ops/gather/cuda/gather_cuda.h | 34 ++ src/ops/gather/operator.cc | 19 +- src/ops/reduce/cpu/reduce_cpu.cc | 4 +- src/ops/reduce/cpu/reduce_cpu.h | 4 +- src/ops/reduce/cuda/reduce_cuda.cc | 158 +++++++ src/ops/reduce/cuda/reduce_cuda.cu | 625 ++++++++++++++++++++++++++++ src/ops/reduce/cuda/reduce_cuda.h | 61 +++ src/ops/reduce/operator.cc | 19 +- src/ops/reduce/reduce.h | 2 +- src/ops/reducemax/operator.cc | 2 +- src/ops/reducemean/operator.cc | 2 +- src/ops/reducemin/operator.cc | 2 +- src/ops/where/cpu/where_cpu.cc | 93 +++-- src/ops/where/cpu/where_cpu.h | 6 + src/ops/where/cuda/where_cuda.cc | 120 ++++++ src/ops/where/cuda/where_cuda.cu | 184 ++++++++ src/ops/where/cuda/where_cuda.h | 46 ++ src/ops/where/operator.cc | 19 +- 39 files changed, 1898 insertions(+), 207 deletions(-) create mode 100644 src/ops/clip/cuda/clip_cuda.cc create mode 100644 src/ops/clip/cuda/clip_cuda.cu create mode 100644 src/ops/clip/cuda/clip_cuda.h create mode 100644 src/ops/gather/cuda/gather_cuda.cc create mode 100644 src/ops/gather/cuda/gather_cuda.cu create mode 100644 src/ops/gather/cuda/gather_cuda.h create mode 100644 src/ops/reduce/cuda/reduce_cuda.cc create mode 100644 src/ops/reduce/cuda/reduce_cuda.cu create mode 100644 src/ops/reduce/cuda/reduce_cuda.h create mode 100644 src/ops/where/cuda/where_cuda.cc create mode 100644 src/ops/where/cuda/where_cuda.cu create mode 100644 src/ops/where/cuda/where_cuda.h diff --git a/include/ops/clip/clip.h b/include/ops/clip/clip.h index 3ebd1b0e..b5e0c19e 100644 --- a/include/ops/clip/clip.h +++ b/include/ops/clip/clip.h @@ -15,7 +15,7 @@ __C __export infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t hand infiniopTensorDescriptor_t y ); -__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, void *min, void *max, void *y, void *stream); +__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void *x, float *min, float *max, void *y, void *stream); __C __export infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc); diff --git a/include/ops/gather/gather.h b/include/ops/gather/gather.h index ae4983da..7ad07f98 100644 --- a/include/ops/gather/gather.h +++ b/include/ops/gather/gather.h @@ -17,7 +17,7 @@ __C __export infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t ha int64_t axis ); -__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void const *x, void *indices, void *y, void *stream); +__C __export infiniopStatus_t 
infiniopGather(infiniopGatherDescriptor_t desc, void *x, void *indices, void *y, void *stream); __C __export infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); diff --git a/include/ops/reducemax/reducemax.h b/include/ops/reducemax/reducemax.h index f04e5349..8f0e6526 100644 --- a/include/ops/reducemax/reducemax.h +++ b/include/ops/reducemax/reducemax.h @@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemaxDescriptor(infiniopHandle_t bool noop_with_empty_axes ); -__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream); +__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); __C __export infiniopStatus_t infiniopDestroyReducemaxDescriptor(infiniopReducemaxDescriptor_t desc); #endif diff --git a/include/ops/reducemean/reducemean.h b/include/ops/reducemean/reducemean.h index 3baa22d8..c94c0e7d 100644 --- a/include/ops/reducemean/reducemean.h +++ b/include/ops/reducemean/reducemean.h @@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemeanDescriptor(infiniopHandle_ bool noop_with_empty_axes ); -__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, void const *src, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream); +__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); __C __export infiniopStatus_t infiniopDestroyReducemeanDescriptor(infiniopReducemeanDescriptor_t desc); #endif diff --git a/include/ops/reducemin/reducemin.h b/include/ops/reducemin/reducemin.h index 8c0fc027..c22e9ef7 100644 --- a/include/ops/reducemin/reducemin.h +++ b/include/ops/reducemin/reducemin.h @@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReduceminDescriptor(infiniopHandle_t bool noop_with_empty_axes ); -__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, void const *src, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream); +__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); __C __export infiniopStatus_t infiniopDestroyReduceminDescriptor(infiniopReduceminDescriptor_t desc); #endif diff --git a/operatorspy/tests/clip.py b/operatorspy/tests/clip.py index f273c84d..a2f3f20d 100644 --- a/operatorspy/tests/clip.py +++ b/operatorspy/tests/clip.py @@ -1,4 +1,4 @@ -from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64, c_bool +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64, c_bool, c_float import ctypes import sys import os @@ -21,7 +21,7 @@ from typing import Tuple import numpy as np -PROFILE = False +PROFILE = True NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -46,29 +46,30 @@ def test( x_shape, min, max, - tensor_dtype=torch.float16 + tensor_dtype=torch.float32 ): print( f"Testing clip on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype} max:{max} min:{min}" ) - x = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device) - output = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device) + x = torch.randn(x_shape, dtype=torch.float32, device=torch_device) + + output = torch.randn(x_shape, dtype=torch.float32, 
device=torch_device) if min != None: - min = torch.tensor(min, dtype=torch.float32, device=torch_device) + min_t = torch.tensor(min, dtype=torch.float32, device=torch_device) else: - min = torch.tensor(float("-inf"), dtype=torch.float32, device=torch_device) + min_t = torch.tensor(float("-inf"), dtype=torch.float32, device=torch_device) if max != None: - max = torch.tensor(max, dtype=torch.float32, device=torch_device) + max_t = torch.tensor(max, dtype=torch.float32, device=torch_device) else: - max = torch.tensor(float("inf"), dtype=torch.float32, device=torch_device) + max_t = torch.tensor(float("inf"), dtype=torch.float32, device=torch_device) for i in range(NUM_PRERUN if PROFILE else 1): if min == None and max == None: break - ans = clip(x, min, max) + ans = clip(x, min_t, max_t) if PROFILE: start_time = time.time() for i in range(NUM_ITERATIONS): - _ = clip(x, min, max) + _ = clip(x, min_t, max_t) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f"pytorch time: {elapsed :10f}") x_tensor = to_tensor(x, lib) @@ -82,6 +83,7 @@ def test( y_tensor.descriptor, ) ) + #Ss = [1024, 2048, 4096] x_tensor.descriptor.contents.invalidate() y_tensor.descriptor.contents.invalidate() for i in range(NUM_PRERUN if PROFILE else 1): @@ -89,8 +91,8 @@ def test( lib.infiniopClip( descriptor, x_tensor.data, - min.data_ptr() if min != None else None, - max.data_ptr() if max != None else None, + ctypes.byref(c_float(min)) if min != None else None, + ctypes.byref(c_float(max)) if max != None else None, y_tensor.data, None, ) @@ -102,37 +104,50 @@ def test( lib.infiniopClip( descriptor, x_tensor.data, - min.data_ptr() if min != None else None, - max.data_ptr() if max != None else None, + ctypes.byref(c_float(min)) if min != None else None, + ctypes.byref(c_float(max)) if max != None else None, y_tensor.data, None, ) ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f"lib time: {elapsed :10f}") - print("x:", x) - print("custom op ans:", output) - print("ans:", ans) if max != None or min != None else print("ans:", x) assert torch.allclose(output, ans, atol=0, rtol=0) if max != None or min != None else torch.allclose(output, x, atol=0, rtol=0) check_error(lib.infiniopDestroyClipDescriptor(descriptor)) def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) - for x_shape, min, max in test_cases: - test(lib, handle, "cpu", x_shape, min, max, tensor_dtype=torch.float16) - print("\n") - #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32) + for x_shape, min, max, tensor_type in test_cases: + test(lib, handle, "cpu", x_shape, min, max, tensor_dtype=tensor_type) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, min, max, tensor_type in test_cases: + test(lib, handle, "cuda", x_shape, min, max, tensor_dtype=tensor_type) destroy_handle(lib, handle) if __name__ == "__main__": test_cases = [ - ((3, 4), -1, 1), - ((3, 4), None, 1), - ((3, 4), -1, None), - ((3, 4), None, None) - # stride = + ((3, 4), -1, 1, torch.float32), + ((3, 4), None, 1, torch.float32), + ((3, 4), -1, None, torch.float32), + ((3, 4), None, None, torch.float32), + ((16), -1, 1, torch.float32), + ((1024, 1024), -1, 1, torch.float32), + ((4096, 4096), -1, 1, torch.float32), + + ((13), -1, 1, torch.float32), + ((3, 4), -1, 1, torch.float16), + ((3, 4), None, 1, torch.float16), + ((3, 4), -1, None, torch.float16), + ((3, 4), None, None, torch.float16), + ((16), -1, 1, 
torch.float16), + ((1024, 1024), -1, 1, torch.float16), + ((4096, 4096), -1, 1, torch.float16), ] args = get_args() lib = open_lib() @@ -141,6 +156,7 @@ def test_cpu(lib, test_cases): infiniopHandle_t, POINTER(infiniopClipDescriptor_t), infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t ] lib.infiniopClip.restype = c_int32 lib.infiniopClip.argtypes = [ @@ -149,8 +165,12 @@ def test_cpu(lib, test_cases): c_void_p, c_void_p, c_void_p, + c_void_p ] lib.infiniopDestroyClipDescriptor.restype = c_int32 lib.infiniopDestroyClipDescriptor.argtypes = [infiniopClipDescriptor_t] - test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.cpu: + test_cpu(lib, test_cases) print("All tests passed!") \ No newline at end of file diff --git a/operatorspy/tests/gather.py b/operatorspy/tests/gather.py index 6a83ce26..a3e819ed 100644 --- a/operatorspy/tests/gather.py +++ b/operatorspy/tests/gather.py @@ -107,7 +107,7 @@ def test( elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f"lib time: {elapsed :10f}") print(f"pytorch ans: {ans}") - print(f"lib ans: {dst_tensor.data}") + print(f"lib ans: {dst}") assert torch.allclose(dst, ans, atol=0, rtol=0) check_error(lib.infiniopDestroyGatherDescriptor(descriptor)) diff --git a/operatorspy/tests/reducemax.py b/operatorspy/tests/reducemax.py index e2086293..a8873a2a 100644 --- a/operatorspy/tests/reducemax.py +++ b/operatorspy/tests/reducemax.py @@ -21,9 +21,9 @@ from typing import Tuple import numpy as np -PROFILE = False +PROFILE = True NUM_PRERUN = 1 -NUM_ITERATIONS = 1 +NUM_ITERATIONS = 50 class ReducemaxDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -113,7 +113,6 @@ def test( c_bool(noop_with_empty_axes), ) ) - print(f"op desctiptor created") x_tensor.descriptor.contents.invalidate() y_tensor.descriptor.contents.invalidate() for i in range(NUM_PRERUN if PROFILE else 1): @@ -142,41 +141,57 @@ def test( ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f"lib time: {elapsed :10f}") - print(f"custom op output:{y}") - print(f"pytorch output:{ans}") - assert torch.allclose(y, ans, atol=0, rtol=1e-3) - + # print(f"input : {x}") + # print(f"custom op output:{y}") + # print(f"pytorch output:{ans}") check_error(lib.infiniopDestroyReducemaxDescriptor(descriptor)) + assert torch.allclose(y, ans, atol=0, rtol=1e-3) def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) - for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes in test_cases: - print(dynamic_axes) - test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=torch.float16) + for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases: + test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype) print("\n") #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32) destroy_handle(lib, handle) +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases: + test(lib, handle, "cuda", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype) + print("\n") + destroy_handle(lib, handle) if __name__ == "__main__": test_cases = [ # dynamic calc test eg - ((2, 3, 4, 5), [0, 2], False, True, None), - ((2, 3, 4, 5), [0, 2], False, True, None), - #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes) - 
((2, 10, 24, 10), [0, 2], False, True, None), - # stride = - ((2, 10, 24, 10), [0, 1], False, True, None), - ((2, 10, 24, 10), [2, 3], False , True, None), - ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None), - # validate attribute noop_with_empty_axes and keepdims - ((2, 10, 24, 10), None, True, True, None), - ((2, 10, 24, 10), None, True, False, None), - ((2, 10, 24, 10), None, False, True, None), - ((2, 10, 24, 10), None, False, False, None), - ((2, 3, 4), [0, 1], False, False, None), + # ((2, 3, 4, 5), [0, 2], False, True, None), + # ((2, 3, 4, 5), [0, 2], False, True, None), + # #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes) + # ((2, 10, 24, 10), [0, 2], False, True, None), + # # stride = + # ((2, 10, 24, 10), [0, 1], False, True, None), + # ((2, 10, 24, 10), [2, 3], False , True, None), + # ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None), + # # validate attribute noop_with_empty_axes and keepdims + # ((2, 10, 24, 10), None, True, True, None), + # ((2, 10, 24, 10), None, True, False, None), + # ((2, 10, 24, 10), None, False, True, None), + # ((2, 10, 24, 10), None, False, False, None), + # ((2, 3, 4), [0, 1], False, False, None), #((2, 10, 24, 10), [], True), + #((4,), [0], False, False, None, torch.float32), + ((1000, 300), [0, 1], False, False, None, torch.float16), + ((50, 3), [0, 1], False, False, None, torch.float16), + ((1000, 300), [0, 1], False, False, None, torch.float16), + ((2000, 200, 50), [0, 1], False, True, None, torch.float32), + ((1000, 200, 500), [0, 1], False, True, None, torch.float16), + ((1000, 200, 50), [0, 1], False, True, None, torch.float32), + ((20, 3, 4, 5), [0, 2], False, False, None, torch.float32), + ((20, 30, 40, 5), [0, 2, 3], False, False, None, torch.float32), + ((200, 3, 40, 5), [0, 3], False, False, None, torch.float32), ] args = get_args() lib = open_lib() @@ -202,5 +217,8 @@ def test_cpu(lib, test_cases): ] lib.infiniopDestroyReducemaxDescriptor.restype = c_int32 lib.infiniopDestroyReducemaxDescriptor.argtypes = [infiniopReducemaxDescriptor_t] - test_cpu(lib, test_cases) + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) print("All tests passed!") \ No newline at end of file diff --git a/operatorspy/tests/reducemean.py b/operatorspy/tests/reducemean.py index eaaf79d0..009b6646 100644 --- a/operatorspy/tests/reducemean.py +++ b/operatorspy/tests/reducemean.py @@ -21,9 +21,9 @@ from typing import Tuple import numpy as np -PROFILE = False +PROFILE = True NUM_PRERUN = 1 -NUM_ITERATIONS = 1 +NUM_ITERATIONS = 50 class ReducemeanDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -47,6 +47,7 @@ def inferShape(x_shape, axis, noop_with_empty_axes, keepdims=False): return tuple([1] * len(x_shape)) else: return tuple([]) + assert len(axis) <= len(x_shape), "axis out of range" output_shape = [] axis = [a if a >= 0 else a + len(x_shape) for a in axis] # 更新 axis 列表中的值 @@ -82,9 +83,9 @@ def test( print( f"Testing reducemean on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}" ) - x = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device) + x = torch.randint(0, 10, x_shape, dtype=tensor_dtype, device=torch_device) print(f"y_shape = {inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims)}") - y = torch.full(inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims), float('-inf'), dtype=tensor_dtype, device=torch_device) + y = torch.full(inferShape(x_shape, axes if dynamic_axes == None else 
dynamic_axes, noop_with_empty_axes, keepdims), float(0), dtype=tensor_dtype, device=torch_device) print(f"y_shape = {y.shape}") for i in range(NUM_PRERUN if PROFILE else 1): ans = reduce_mean(x, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims) @@ -141,8 +142,9 @@ def test( ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f"lib time: {elapsed :10f}") - print(f"custom op output:{y}") - print(f"pytorch output:{ans}") + #print(f"input_data = {x}") + # print(f"custom op output:{y}") + # print(f"pytorch output:{ans}") assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyReducemeanDescriptor(descriptor)) @@ -150,30 +152,47 @@ def test( def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) - for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes in test_cases: - test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=torch.float16) + for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases: + test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype) print("\n") #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32) destroy_handle(lib, handle) +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases: + test(lib, handle, "cuda", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype) + print("\n") + destroy_handle(lib, handle) + if __name__ == "__main__": test_cases = [ # dynamic calc test eg - ((2, 3, 4, 5), [0, 2], False, True, None), - ((2, 3, 4, 5), [0, 2], False, True, None), - #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes) - ((2, 10, 24, 10), [0, 2], False, True, None), - # stride = - ((2, 10, 24, 10), [0, 1], False, True, None), - ((2, 10, 24, 10), [2, 3], False , True, None), - ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None), + # ((2, 3, 4, 5), [0, 2], False, True, None), + # ((2, 3, 4, 5), [0, 2], False, True, None), + # #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes) + # ((2, 10, 24, 10), [0, 2], False, True, None), + # # stride = + # ((2, 10, 24, 10), [0, 1], False, True, None), + # ((2, 10, 24, 10), [2, 3], False , True, None), + #((1000, 300), [0, 1], False, False, None, torch.float16), + ((30, 5, 20, 100), [0, 1, 2, 3], False, False, None, torch.float16), + ((30000, 1000, 40), [0, 1], False, False, None, torch.float32), + #((1000, 300), [0, 1], False, False, None, torch.float16), + ((2, 2, 5), [0, 1], False, True, None, torch.float32), + ((1000, 200, 500), [0, 1], False, True, None, torch.float16), + ((1000, 200, 50), [0, 1], False, True, None, torch.float32), + ((20, 3, 4, 5), [0, 2], False, False, None, torch.float32), + ((20, 30, 40, 5), [0, 2, 3], False, False, None, torch.float32), + ((200, 3, 40, 5), [0, 3], False, False, None, torch.float32), # validate attribute noop_with_empty_axes and keepdims - ((2, 10, 24, 10), None, True, True, None), - ((2, 10, 24, 10), None, True, False, None), - ((2, 10, 24, 10), None, False, True, None), - ((2, 10, 24, 10), None, False, False, None), - ((2, 3, 4), [0, 1], False, False, None), + # ((2, 10, 24, 10), None, True, True, None), + # ((2, 10, 24, 10), None, True, False, None), + # ((2, 10, 24, 10), None, False, True, None), + # ((2, 10, 24, 10), None, False, False, 
None), + # ((2, 3, 4), [0, 1], False, False, None), #((2, 10, 24, 10), [], True), ] args = get_args() @@ -200,5 +219,8 @@ def test_cpu(lib, test_cases): ] lib.infiniopDestroyReducemeanDescriptor.restype = c_int32 lib.infiniopDestroyReducemeanDescriptor.argtypes = [infiniopReducemeanDescriptor_t] - test_cpu(lib, test_cases) + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) print("All tests passed!") \ No newline at end of file diff --git a/operatorspy/tests/reducemin.py b/operatorspy/tests/reducemin.py index f93136ca..c590b71b 100644 --- a/operatorspy/tests/reducemin.py +++ b/operatorspy/tests/reducemin.py @@ -21,7 +21,7 @@ from typing import Tuple import numpy as np -PROFILE = False +PROFILE = True NUM_PRERUN = 1 NUM_ITERATIONS = 1 @@ -141,8 +141,8 @@ def test( ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f"lib time: {elapsed :10f}") - print(f"custom op output:{y}") - print(f"pytorch output:{ans}") + # print(f"custom op output:{y}") + # print(f"pytorch output:{ans}") assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyReducemaxDescriptor(descriptor)) @@ -150,32 +150,44 @@ def test( def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) - for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes in test_cases: - print(dynamic_axes) - test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=torch.float16) - print("\n") + for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases: + test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype) #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32) destroy_handle(lib, handle) +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases: + test(lib, handle, "cuda", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype) + print("\n") + destroy_handle(lib, handle) if __name__ == "__main__": test_cases = [ # dynamic calc test eg - ((2, 3, 4, 5), [0, 2], False, True, None), - ((2, 3, 4, 5), [0, 2], False, True, None), - #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes) - ((2, 10, 24, 10), [0, 2], False, True, None), - # stride = - ((2, 10, 24, 10), [0, 1], False, True, None), - ((2, 10, 24, 10), [2, 3], False , True, None), - ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None), - # validate attribute noop_with_empty_axes and keepdims - ((2, 10, 24, 10), None, True, True, None), - ((2, 10, 24, 10), None, True, False, None), - ((2, 10, 24, 10), None, False, True, None), - ((2, 10, 24, 10), None, False, False, None), - ((2, 3, 4), [0, 1], False, False, None), - #((2, 10, 24, 10), [], True), + # ((2, 3, 4, 5), [0, 2], False, True, None), + # ((2, 3, 4, 5), [0, 2], False, True, None), + # #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes) + # ((2, 10, 24, 10), [0, 2], False, True, None), + # # stride = + # ((2, 10, 24, 10), [0, 1], False, True, None), + # ((2, 10, 24, 10), [2, 3], False , True, None), + # ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None), + # # validate attribute noop_with_empty_axes and keepdims + # ((2, 10, 24, 10), None, True, True, None), + # ((2, 10, 24, 10), None, True, False, None), + # ((2, 10, 24, 10), None, False, True, None), + # ((2, 10, 24, 10), None, 
False, False, None), + # ((2, 3, 4), [0, 1], False, False, None), + # #((2, 10, 24, 10), [], True), + ((2, 1000), [0, 1], False, False, None, torch.float32), + ((2, 2, 5), [0, 1], False, True, None, torch.float32), + ((1000, 200, 500), [0, 1], False, True, None, torch.float16), + ((1000, 200, 50), [0, 1], False, True, None, torch.float32), + ((20, 3, 4, 5), [0, 2], False, False, None, torch.float32), + ((20, 30, 40, 5), [0, 2, 3], False, False, None, torch.float32), + ((200, 3, 40, 5), [0, 3], False, False, None, torch.float32), ] args = get_args() lib = open_lib() @@ -201,5 +213,8 @@ def test_cpu(lib, test_cases): ] lib.infiniopDestroyReduceminDescriptor.restype = c_int32 lib.infiniopDestroyReduceminDescriptor.argtypes = [infiniopReduceminDescriptor_t] - test_cpu(lib, test_cases) + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) print("All tests passed!") \ No newline at end of file diff --git a/operatorspy/tests/where.py b/operatorspy/tests/where.py index 7caf84e9..f8809433 100644 --- a/operatorspy/tests/where.py +++ b/operatorspy/tests/where.py @@ -20,9 +20,8 @@ import torch from typing import Tuple import numpy as np -import onnx -PROFILE = True +PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -43,15 +42,22 @@ def tuple_to_void_p(py_tuple: Tuple): def inferShape(x_shape, y_shape): ndim_x = len(x_shape) ndim_y = len(y_shape) - ndim = 0 - output_shape = [] ndim = max(ndim_x, ndim_y) - for i in range(ndim - 1, -1, -1): - dim_x = x_shape[i] if i < ndim_x else 1 - dim_y = y_shape[i] if i < ndim_y else 1 - output_shape.append(max(dim_x, dim_y)) - output_shape.reverse() + output_shape = [] + + for i in range(-1, -ndim-1, -1): + dim_x = x_shape[i] if i >= -ndim_x else 1 + dim_y = y_shape[i] if i >= -ndim_y else 1 + + if dim_x != dim_y: + if dim_x != 1 and dim_y != 1: + raise ValueError(f"Shapes {x_shape} and {y_shape} cannot be broadcast together") + + output_dim = max(dim_x, dim_y) + output_shape.insert(0, output_dim) + return tuple(output_shape) + def test( lib, @@ -68,8 +74,7 @@ def test( condition = torch.randint(0, 2, condition_shape, dtype=torch.uint8).to(torch_device) src1 = torch.randn(src1_shape, dtype=tensor_dtype, device=torch_device) src2 = torch.randn(src2_shape, dtype=tensor_dtype, device=torch_device) - output = torch.randn(inferShape(src1_shape, src2_shape), dtype=tensor_dtype, device=torch_device) - + output = torch.randn(inferShape(inferShape(src1_shape, src2_shape), condition_shape), dtype=tensor_dtype, device=torch_device) for i in range(NUM_PRERUN if PROFILE else 1): ans = where(condition, src1, src2) @@ -130,18 +135,33 @@ def test( def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) - for condition_shape, src1_shape, src2_shape in test_cases: - test(lib, handle, "cpu", condition_shape, src1_shape, src2_shape, tensor_dtype=torch.float16) + for condition_shape, src1_shape, src2_shape, tensor_dtype in test_cases: + test(lib, handle, "cpu", condition_shape, src1_shape, src2_shape, tensor_dtype=tensor_dtype) + print("\n") + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for condition_shape, src1_shape, src2_shape, tensor_dtype in test_cases: + test(lib, handle, "cuda", condition_shape, src1_shape, src2_shape, tensor_dtype=tensor_dtype) print("\n") destroy_handle(lib, handle) if __name__ == "__main__": test_cases = [ - ((2, 3, 4, 5), (2, 3, 4, 5), (2, 3, 4, 5)), - ((3, 1), (3, 4), (1, 4)), - ((1,), 
(3, 4), (3, 4)), - ((2, 1, 3), (1, 4, 3), (2, 4, 1)), + ((2, 16), (2, 16), (2, 16), torch.float32), + ((2, 3, 1, 1), (1, 4, 5), (2, 3, 4, 5), torch.float32), + ((3, 1), (3, 4), (1, 4), torch.float32), + ((1,), (3, 4), (3, 4), torch.float32), + ((2, 1, 3), (1, 4, 3), (2, 4, 1), torch.float32), + + ((2, 16), (2, 16), (2, 16), torch.float16), + ((2, 3, 1, 1), (1, 4, 5), (2, 3, 4, 5), torch.float16), + ((3, 1), (3, 4), (1, 4), torch.float16), + ((1,), (3, 4), (3, 4), torch.float16), + ((2, 1, 3), (1, 4, 3), (2, 4, 1), torch.float16), ] args = get_args() lib = open_lib() @@ -150,6 +170,9 @@ def test_cpu(lib, test_cases): infiniopHandle_t, POINTER(infiniopWhereDescriptor_t), infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t ] lib.infiniopWhere.restype = c_int32 lib.infiniopWhere.argtypes = [ @@ -162,5 +185,8 @@ def test_cpu(lib, test_cases): ] lib.infiniopDestroyWhereDescriptor.restype = c_int32 lib.infiniopDestroyWhereDescriptor.argtypes = [infiniopWhereDescriptor_t] - test_cpu(lib, test_cases) + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) print("All tests passed!") \ No newline at end of file diff --git a/src/ops/clip/cpu/clip_cpu.cc b/src/ops/clip/cpu/clip_cpu.cc index 1cefe0ab..cfdecc02 100644 --- a/src/ops/clip/cpu/clip_cpu.cc +++ b/src/ops/clip/cpu/clip_cpu.cc @@ -35,24 +35,22 @@ infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t handle, template infiniopStatus_t clip_cpu(ClipCpuDescriptor_t desc, void const *x, - void *min, - void *max, + float min, + float max, + bool has_min, + bool has_max, void *y){ auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); - float min_val; - float max_val; - auto min_ = reinterpret_cast(min); - min_val = *min_; - auto max_ = reinterpret_cast(max); - max_val = *max_; - for (uint64_t i = 0; i < desc->element_num; i++) { if constexpr (std::is_same::value){ - y_[i] = f32_to_f16(std::min(std::max(f16_to_f32(x_[i]), min_val), max_val)); + float x_f = f16_to_f32(x_[i]); + x_f = has_min ? std::max(x_f, min) : x_f; + x_f = has_max ? std::min(x_f, max) : x_f; + y_[i] = f32_to_f16(x_f); } else{ - y_[i] = std::min(std::max(x_[i], min_val), max_val); + y_[i] = std::min(std::max(x_[i], min), max); } } return STATUS_SUCCESS; @@ -60,19 +58,28 @@ infiniopStatus_t clip_cpu(ClipCpuDescriptor_t desc, infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc, void const*x, - void *min, - void *max, + float *min, + float *max, void *y, void *stream){ + bool has_min = true; + bool has_max = true; + if (min == nullptr){ + has_min = false; + } + if (max == nullptr){ + has_max = false; + } + float min_value = has_min ? *min : -std::numeric_limits::infinity(); + float max_value = has_max ? 
*max : std::numeric_limits::infinity(); if (desc->dtype == F16) { - return clip_cpu(desc, x, min, max, y); + return clip_cpu(desc, x, min_value, max_value, has_min, has_max, y); } if (desc->dtype == F32) { - return clip_cpu(desc, x, min, max, y); + return clip_cpu(desc, x, min_value, max_value, has_min, has_max, y); } return STATUS_BAD_TENSOR_DTYPE; } - infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc){ delete desc; return STATUS_SUCCESS; diff --git a/src/ops/clip/cpu/clip_cpu.h b/src/ops/clip/cpu/clip_cpu.h index 992a2444..7006a351 100644 --- a/src/ops/clip/cpu/clip_cpu.h +++ b/src/ops/clip/cpu/clip_cpu.h @@ -18,8 +18,8 @@ infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t handle, infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc, void const *x, - void *min, - void *max, + float *min, + float *max, void *y, void *stream); diff --git a/src/ops/clip/cuda/clip_cuda.cc b/src/ops/clip/cuda/clip_cuda.cc new file mode 100644 index 00000000..f8f7ca9b --- /dev/null +++ b/src/ops/clip/cuda/clip_cuda.cc @@ -0,0 +1,40 @@ +#include "clip_cuda.h" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t handle, + ClipCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y + ) { + if (x->dt != F16 && x->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (x->ndim != y->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (x->dt != y->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(x) || !is_contiguous(y)) { + return STATUS_BAD_TENSOR_STRIDES; + } + uint64_t element_num = 1; + for (uint64_t i = 0; i < x->ndim; i++) { + element_num *= x->shape[i]; + } + uint64_t ndim = y->ndim; + *desc_ptr = new ClipCudaDescriptor{ + DevNvGpu, + x->dt, + ndim, + element_num, + }; + return STATUS_SUCCESS; +} + + +infiniopStatus_t cudaDestroyClipDescriptor(ClipCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/clip/cuda/clip_cuda.cu b/src/ops/clip/cuda/clip_cuda.cu new file mode 100644 index 00000000..327f6d7e --- /dev/null +++ b/src/ops/clip/cuda/clip_cuda.cu @@ -0,0 +1,139 @@ +#include "../../../devices/cuda/cuda_handle.h" +#include "../../utils.h" +#include "clip_cuda.h" +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) +#define FLOAT4(value) (reinterpret_cast(&(value))[0]) + +__global__ void clip_f32x4_kernel(float *a, float *b, float max_value, float min_value, int N){ + int idx = 4 * (blockDim.x * blockIdx.x + threadIdx.x); + if (idx < N) { + int remaining = N - idx; + float4 reg_a, reg_b; + if (remaining >= 4) { + reg_a = FLOAT4(a[idx]); + } else { + reg_a.x = a[idx]; + reg_a.y = (remaining >= 2) ? a[idx + 1] : 0; + reg_a.z = (remaining >= 3) ? 
a[idx + 2] : 0; + reg_a.w = 0; + } + reg_b.x = fminf(fmaxf(reg_a.x, min_value), max_value); + reg_b.y = fminf(fmaxf(reg_a.y, min_value), max_value); + reg_b.z = fminf(fmaxf(reg_a.z, min_value), max_value); + reg_b.w = fminf(fmaxf(reg_a.w, min_value), max_value); + if (remaining >= 4) { + FLOAT4(b[idx]) = reg_b; + } else { + if (remaining >= 1) b[idx] = reg_b.x; + if (remaining >= 2) b[idx + 1] = reg_b.y; + if (remaining >= 3) b[idx + 2] = reg_b.z; + } + } +} + + +__global__ void clip_f16x8_pack_kernel(half *a, half *b, float max_value, float min_value, int N){ + int idx = 8 * (blockDim.x * blockIdx.x + threadIdx.x); + if (idx >= N) return; + const half min_half = __float2half(min_value); + const half max_half = __float2half(max_value); + half pack_a[8], pack_b[8]; + if (idx + 7 < N) { + LDST128BITS(pack_a[0]) = LDST128BITS(a[idx]); + } else { + for (int i = 0; i < 8 && (idx + i) < N; i++) { + pack_a[i] = a[idx + i]; + } + } + #pragma unroll + for (int i = 0; i < 8; i++) + { + pack_b[i] = __hlt(pack_a[i], min_half) ? min_half : pack_a[i]; + pack_b[i] = __hgt(pack_a[i], max_half) ? max_half : pack_a[i]; + } + if (idx + 7 < N) { + LDST128BITS(b[idx]) = LDST128BITS(pack_b[0]); + } else { + for (int i = 0; i < 8 && (idx + i) < N; i++) { + b[idx + i] = pack_b[i]; + } + } +} +template +infiniopStatus_t clip_nv_gpu( + ClipCudaDescriptor_t desc, + void *x, + void *y, + float min_value, + float max_value, + int per_thread_element, + void* stream) { + uint64_t N = desc->element_num; + dim3 block(256 / per_thread_element); + dim3 grid((N + 256 - 1) / 256); + if constexpr(std::is_same::value){ + clip_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); + }else{ + clip_f16x8_pack_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); + } + /* + if (desc->ndim != 2){ + dim3 block(256 / per_thread_element); + dim3 grid((N + 256 - 1) / 256); + if constexpr(std::is_same::value){ + clip_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); + }else{ + clip_f16x8_pack_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); + } + }else{ + if ((desc->K / per_thread_element) <= 1024){ + dim3 block(desc->K / (per_thread_element)); + dim3 grid(desc->S); + if constexpr(std::is_same::value){ + clip_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); + }else{ + clip_f16x8_pack_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); + } + } + else{ + dim3 block(256 / per_thread_element); + dim3 grid((N + 256 - 1) / 256); + if constexpr(std::is_same::value){ + clip_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), min_value, max_value, N); + }else{ + clip_f16x8_pack_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), min_value, max_value, N); + } + } + } + */ + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc, + void *x, + void *y, + float *min, + float *max, + void *stream){ + bool has_min = true; + bool has_max = true; + if (min == nullptr){ + has_min = false; + } + if (max == nullptr){ + has_max = false; + } + float min_value = has_min ? *min : -std::numeric_limits::infinity(); + float max_value = has_max ? 
*max : std::numeric_limits::infinity(); + + if (desc->dtype == F16) { + return clip_nv_gpu(desc, x, y, min_value, max_value, 8, stream); + } + if (desc->dtype == F32) { + return clip_nv_gpu(desc, x, y, min_value, max_value, 4, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/clip/cuda/clip_cuda.h b/src/ops/clip/cuda/clip_cuda.h new file mode 100644 index 00000000..a805726a --- /dev/null +++ b/src/ops/clip/cuda/clip_cuda.h @@ -0,0 +1,33 @@ +#ifndef __CUDA_CLIP_H__ +#define __CUDA_CLIP_H__ + +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include + +typedef struct ClipCudaDescriptor { + Device device; + DT dtype; + uint64_t ndim; + uint64_t element_num; +} ClipCudaDescriptor; + +typedef struct ClipCudaDescriptor *ClipCudaDescriptor_t; + +infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t handle, + ClipCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y + ); + + +infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc, + void *x, + void *y, + float *min, + float *max, + void *stream); + +infiniopStatus_t cudaDestroyClipDescriptor(ClipCudaDescriptor_t desc); + +#endif// __CUDA_MATMUL_H__ diff --git a/src/ops/clip/operator.cc b/src/ops/clip/operator.cc index c655dab7..380cb561 100644 --- a/src/ops/clip/operator.cc +++ b/src/ops/clip/operator.cc @@ -6,6 +6,10 @@ #include "cpu/clip_cpu.h" #endif +#ifdef ENABLE_NV_GPU +#include "cuda/clip_cuda.h" +#endif + __C infiniopStatus_t infiniopCreateClipDescriptor( infiniopHandle_t handle, @@ -17,17 +21,26 @@ __C infiniopStatus_t infiniopCreateClipDescriptor( #ifdef ENABLE_CPU case DevCpu: return cpuCreateClipDescriptor(handle, (ClipCpuDescriptor_t *) desc_ptr, x, y); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateClipDescriptor((CudaHandle_t) handle, (ClipCudaDescriptor_t *) desc_ptr, x, y); + } #endif } return STATUS_BAD_DEVICE; } -__C infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, void *min, void *max, void *y, void *stream) { +__C infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void *x, float *min, float *max, void *y, void *stream) { switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: return cpuClip((ClipCpuDescriptor_t) desc, x, min, max, y, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaClip((ClipCudaDescriptor_t) desc, x, y, min, max, stream); #endif } return STATUS_BAD_DEVICE; @@ -38,7 +51,11 @@ __C infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc #ifdef ENABLE_CPU case DevCpu: return cpuDestroyClipDescriptor((ClipCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaDestroyClipDescriptor((ClipCudaDescriptor_t) desc); #endif } - + return STATUS_BAD_DEVICE; } \ No newline at end of file diff --git a/src/ops/gather/cpu/gather_cpu.cc b/src/ops/gather/cpu/gather_cpu.cc index 50744168..89765862 100644 --- a/src/ops/gather/cpu/gather_cpu.cc +++ b/src/ops/gather/cpu/gather_cpu.cc @@ -32,6 +32,7 @@ infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t handle, *desc_ptr = new GatherCpuDescriptor{ DevCpu, y->dt, + indices->dt, dst_shape, src_shape, indices_shape, @@ -51,14 +52,14 @@ infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc){ return STATUS_SUCCESS; } -template +template infiniopStatus_t gather_cpu(GatherCpuDescriptor_t desc, - void const *x, + void *x, void *indices, void *y) { - auto *src_data = reinterpret_cast(x); - auto *indices_data = reinterpret_cast(indices); // [!code ++] + auto 
*src_data = reinterpret_cast(x); + auto *indices_data = reinterpret_cast(indices); // [!code ++] auto *dst_data = reinterpret_cast(y); uint64_t indices_element_count = 1; for (uint64_t i = 0; i < desc->indices_ndim; ++i) { @@ -95,15 +96,25 @@ infiniopStatus_t gather_cpu(GatherCpuDescriptor_t desc, } infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc, - void const *x, + void *x, void *indices, void *y, void *stream){ if (desc->dtype == F16){ - return gather_cpu(desc, x, indices, y); + if (desc->indices_dtype == I32){ + return gather_cpu(desc, x, indices, y); + } + else if (desc->indices_dtype == I64){ + return gather_cpu(desc, x, indices, y); + } } if (desc->dtype == F32){ - return gather_cpu(desc, x, indices, y); + if (desc->indices_dtype == I32){ + return gather_cpu(desc, x, indices, y); + } + else if (desc->indices_dtype == I64){ + return gather_cpu(desc, x, indices, y); + } } return STATUS_SUCCESS; } diff --git a/src/ops/gather/cpu/gather_cpu.h b/src/ops/gather/cpu/gather_cpu.h index 61a9c313..6d2d55d7 100644 --- a/src/ops/gather/cpu/gather_cpu.h +++ b/src/ops/gather/cpu/gather_cpu.h @@ -5,6 +5,7 @@ struct GatherCpuDescriptor { Device device; DT dtype; + DT indices_dtype; uint64_t const *dst_shape; uint64_t const *src_shape; uint64_t const *indices_shape; @@ -25,7 +26,7 @@ infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t handle, ); infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc, - void const *data, + void *data, void *indices, void *dst, void *stream); diff --git a/src/ops/gather/cuda/gather_cuda.cc b/src/ops/gather/cuda/gather_cuda.cc new file mode 100644 index 00000000..a63eaffd --- /dev/null +++ b/src/ops/gather/cuda/gather_cuda.cc @@ -0,0 +1,52 @@ +#include "gather_cuda.h" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateGatherDescriptor(CudaHandle_t handle, + GatherCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t indices, + int64_t axis + ){ + if (y->dt != x->dt){ + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != F16 && y->dt != F32){ + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(y) || !is_contiguous(x)){ + return STATUS_BAD_TENSOR_STRIDES; + } + if (axis < 0 || axis >= x->ndim){ + return STATUS_BAD_PARAM; + } +} + +infiniopStatus_t gather_nv_gpu( + GatherCudaDescriptor_t desc, + void *x, + void *indices, + void *y, + void *stream){ + return STATUS_SUCCESS; +} + + +infiniopStatus_t cudaGather(GatherCudaDescriptor_t desc, + void *x, + void *indices, + void *y, + void *stream){ + if (desc->dtype == F16){ + + } + if (desc->dtype == F32){ + + } + return STATUS_BAD_DEVICE; +} + +infiniopStatus_t cudaDestroyGatherDescriptor(GatherCudaDescriptor_t desc){ + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/gather/cuda/gather_cuda.cu b/src/ops/gather/cuda/gather_cuda.cu new file mode 100644 index 00000000..df3a9702 --- /dev/null +++ b/src/ops/gather/cuda/gather_cuda.cu @@ -0,0 +1,4 @@ +#include "../../../devices/cuda/cuda_handle.h" +#include "../../utils.h" +#include "gather_cuda.h" +#include diff --git a/src/ops/gather/cuda/gather_cuda.h b/src/ops/gather/cuda/gather_cuda.h new file mode 100644 index 00000000..0996ae64 --- /dev/null +++ b/src/ops/gather/cuda/gather_cuda.h @@ -0,0 +1,34 @@ +#ifndef __CUDA_GATHER_H__ +#define __CUDA_GATHER_H__ + +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include + +typedef struct GatherCudaDescriptor { + Device device; + DT dtype; + uint64_t 
ndim; + uint64_t element_num; +} GatherCudaDescriptor; + +typedef struct GatherCudaDescriptor *GatherCudaDescriptor_t; + +infiniopStatus_t cudaCreateGatherDescriptor(CudaHandle_t handle, + GatherCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t indices, + int64_t axis + ); + + +infiniopStatus_t cudaGather(GatherCudaDescriptor_t desc, + void *x, + void *indices, + void *y, + void *stream); + +infiniopStatus_t cudaDestroyGatherDescriptor(GatherCudaDescriptor_t desc); + +#endif// __CUDA_MATMUL_H__ diff --git a/src/ops/gather/operator.cc b/src/ops/gather/operator.cc index f2dcbc08..58855c00 100644 --- a/src/ops/gather/operator.cc +++ b/src/ops/gather/operator.cc @@ -6,6 +6,9 @@ #include "cpu/gather_cpu.h" #endif +#ifdef ENABLE_NV_GPU +#include "cuda/gather_cuda.h" +#endif __C infiniopStatus_t infiniopCreateGatherDescriptor( infiniopHandle_t handle, @@ -19,15 +22,24 @@ __C infiniopStatus_t infiniopCreateGatherDescriptor( #ifdef ENABLE_CPU case DevCpu: return cpuCreateGatherDescriptor(handle, (GatherCpuDescriptor_t *) desc_ptr, y, x, indices, axis); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateGatherDescriptor((CudaHandle_t) handle, (GatherCudaDescriptor_t *) desc_ptr, y, x, indices, axis); + } #endif } return STATUS_BAD_DEVICE; } -__C infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void const *x, void *indices, void *y, void *stream) { +__C infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void *x, void *indices, void *y, void *stream) { switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: return cpuGather((GatherCpuDescriptor_t) desc, x, indices, y, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaGather((GatherCudaDescriptor_t) desc, x, indices, y, stream); #endif } return STATUS_BAD_DEVICE; @@ -38,6 +50,11 @@ __C infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t #ifdef ENABLE_CPU case DevCpu: return cpuDestroyGatherDescriptor((GatherCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaDestroyGatherDescriptor((GatherCudaDescriptor_t) desc); #endif } + return STATUS_BAD_DEVICE; } \ No newline at end of file diff --git a/src/ops/reduce/cpu/reduce_cpu.cc b/src/ops/reduce/cpu/reduce_cpu.cc index 2040a84b..d2a25917 100644 --- a/src/ops/reduce/cpu/reduce_cpu.cc +++ b/src/ops/reduce/cpu/reduce_cpu.cc @@ -214,8 +214,8 @@ infiniopStatus_t reduce_cpu(ReduceCpuDescriptor_t desc, infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, void *y, - void const *x, - void const *dynamic_axes, + void *x, + void *dynamic_axes, uint64_t dynamic_axes_size, void *stream){ if (desc->is_axes_static == true && dynamic_axes_size > 0){ diff --git a/src/ops/reduce/cpu/reduce_cpu.h b/src/ops/reduce/cpu/reduce_cpu.h index 61bd9d74..3111c371 100644 --- a/src/ops/reduce/cpu/reduce_cpu.h +++ b/src/ops/reduce/cpu/reduce_cpu.h @@ -44,8 +44,8 @@ infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, void *y, - void const *x, - void const *dynamic_axes, + void *x, + void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); diff --git a/src/ops/reduce/cuda/reduce_cuda.cc b/src/ops/reduce/cuda/reduce_cuda.cc new file mode 100644 index 00000000..461a9ca6 --- /dev/null +++ b/src/ops/reduce/cuda/reduce_cuda.cc @@ -0,0 +1,158 @@ +#include "reduce_cuda.h" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +// need reduce_size, output_size, 
output_stride, input_stride + +infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle, + ReduceCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t const *axes, + uint64_t axes_size, + int reduce_op_type, + bool keepdims + ) { + if (x->dt != F16 && x->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (keepdims) { + if (x->ndim != y->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (x->dt != y->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(x) || !is_contiguous(y)) { + return STATUS_BAD_TENSOR_STRIDES; + } + uint64_t element_num = 1; + uint64_t output_size = 1; + for (uint64_t i = 0; i < x->ndim; i++) { + element_num *= x->shape[i]; + } + + for (uint64_t i = 0; i < y->ndim; i++) { + output_size *= y->shape[i]; + } + uint64_t reduce_size = element_num / output_size; + + int64_t *input_strides = new int64_t[x->ndim]; + int64_t *output_strides = new int64_t[y->ndim]; + uint64_t *input_shape = new uint64_t[x->ndim]; + uint64_t *output_shape = new uint64_t[y->ndim]; + int64_t *h_axes = new int64_t[axes_size]; + uint64_t* reduce_axes_stride = new uint64_t[axes_size]; + std::fill_n(reduce_axes_stride, axes_size, 1); + for (int i = axes_size - 2; i >= 0; i--){ + reduce_axes_stride[i] = reduce_axes_stride[i + 1] * x->shape[axes[i + 1]]; + } + + + memcpy(h_axes, axes, axes_size * sizeof(int64_t)); + memcpy(input_shape, x->shape, x->ndim * sizeof(uint64_t)); + memcpy(output_shape, y->shape, y->ndim * sizeof(uint64_t)); + memcpy(input_strides, x->strides, x->ndim * sizeof(int64_t)); + memcpy(output_strides, y->strides, y->ndim * sizeof(int64_t)); + int prefix_size = 1, suffix_size = 1; + bool if_reduce_axes_contiguous = true; + int reduce_mode = 0; + std::vector non_reduce_axes; + for (int j = 0; j < x->ndim; ++j) { + if (!std::binary_search(axes, axes + axes_size, j)) { + non_reduce_axes.push_back(j); + } + } + int64_t *h_non_reduce_axes = new int64_t[non_reduce_axes.size()]; + std::copy(non_reduce_axes.begin(), non_reduce_axes.end(), h_non_reduce_axes); + + for (uint64_t i = 0; i < axes_size; i++) { + if (i < axes_size - 1 && axes[i] != axes[i + 1] - 1) { + if_reduce_axes_contiguous = false; + } + } + if (if_reduce_axes_contiguous) { + if (axes_size == x->ndim) { + reduce_mode = 0; + } else { + for (uint64_t i = 0; i < axes[0]; i++) { + prefix_size *= x->shape[i]; + } + for (uint64_t i = axes[axes_size - 1] + 1; i < x->ndim; i++) { + suffix_size *= x->shape[i]; + } + reduce_mode = 1; + } + } else { + reduce_mode = 2; + } + int64_t *d_non_reduce_axes; + int64_t *d_input_strides; + int64_t *d_output_strides; + int64_t *d_reduce_axes; + uint64_t *d_reduce_axes_stride; + uint64_t *d_input_shape; + uint64_t *d_output_shape; + + checkCudaErrorWithCode(cudaMalloc((void**)&d_reduce_axes_stride, axes_size * sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&d_reduce_axes, axes_size * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&d_non_reduce_axes, non_reduce_axes.size() * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&d_input_strides, x->ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&d_output_strides, y->ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&d_input_shape, x->ndim * sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&d_output_shape, 
y->ndim * sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED); + + checkCudaErrorWithCode(cudaMemcpy(d_reduce_axes_stride, reduce_axes_stride, axes_size * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(d_reduce_axes, h_axes, axes_size * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(d_non_reduce_axes, h_non_reduce_axes, non_reduce_axes.size() * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(d_input_strides, input_strides, x->ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(d_output_strides, output_strides, y->ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(d_input_shape, input_shape, x->ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(d_output_shape, output_shape, y->ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + + *desc_ptr = new ReduceCudaDescriptor{ + DevNvGpu, + x->dt, + x->ndim, + y->ndim, + d_non_reduce_axes, + d_input_strides, + d_output_strides, + d_input_shape, + d_output_shape, + d_reduce_axes, + d_reduce_axes_stride, + reduce_size, + element_num, + output_size, + static_cast(reduce_op_type), + reduce_mode, + axes_size, + keepdims, + axes[0], + axes[axes_size - 1], + prefix_size, + suffix_size + }; + delete [] h_axes; + delete [] reduce_axes_stride; + delete [] h_non_reduce_axes; + delete [] input_strides; + delete [] output_strides; + delete [] input_shape; + delete [] output_shape; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyReduceDescriptor(ReduceCudaDescriptor_t desc) { + + checkCudaErrorWithCode(cudaFree((void*)desc->reduce_axes), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->reduce_axes_stride), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->non_reduce_axes), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->input_strides), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->output_strides), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->input_shape), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->output_shape), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/reduce/cuda/reduce_cuda.cu b/src/ops/reduce/cuda/reduce_cuda.cu new file mode 100644 index 00000000..a6ec6963 --- /dev/null +++ b/src/ops/reduce/cuda/reduce_cuda.cu @@ -0,0 +1,625 @@ +#include "../../../devices/cuda/cuda_handle.h" +#include "../../utils.h" +#include "reduce_cuda.h" +#include +#include +#include +#define WARP_SIZE 32 + +#define FLOAT4(value) (reinterpret_cast(&(value))[0]) +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +enum class ReduceOp { + SUM, + MIN, + MAX, + MEAN +}; + +__global__ void divide_by_n_kernel(float* y, int N) { + y[0] /= static_cast(N); +} + +__global__ void divide_by_n_kernel(half* y, float* temp_buffer, int N) { + y[0] = __float2half(temp_buffer[0] / static_cast(N)); +} + +template +__device__ __forceinline__ T init_value() { + if constexpr (std::is_same_v) { + if constexpr (Op == ReduceOp::SUM || Op == ReduceOp::MEAN) + return 0.0f; + else if constexpr (Op == ReduceOp::MAX) + return -INFINITY; + else if constexpr (Op == ReduceOp::MIN) + return INFINITY; + } else if constexpr 
(std::is_same_v) { + if constexpr (Op == ReduceOp::SUM || Op == ReduceOp::MEAN) + return __float2half(0.0f); + else if constexpr (Op == ReduceOp::MAX) + return __float2half(-INFINITY); + else if constexpr (Op == ReduceOp::MIN) + return __float2half(INFINITY); + } +} + +template +__device__ __forceinline__ float reduce_op(float a, float b) { + if constexpr (Op == ReduceOp::SUM || Op == ReduceOp::MEAN) + return a + b; + else if constexpr (Op == ReduceOp::MAX) + return fmaxf(a, b); + else if constexpr (Op == ReduceOp::MIN) + return fminf(a, b); +} + +template +__device__ __forceinline__ float warp_reduce(float val) { + #pragma unroll + for (int mask = kWarpSize >> 1; mask >= 1; mask >>= 1) { + if constexpr (Op == ReduceOp::SUM || Op == ReduceOp::MEAN) + val += __shfl_down_sync(0xffffffff, val, mask); + else if constexpr (Op == ReduceOp::MAX) + val = fmaxf(val, __shfl_down_sync(0xffffffff, val, mask)); + else if constexpr (Op == ReduceOp::MIN) + val = fminf(val, __shfl_down_sync(0xffffffff, val, mask)); + } + return val; +} + +template +__device__ __forceinline__ float finalize_result(float val, int count) { + if constexpr (Op == ReduceOp::MEAN) + return val / static_cast(count); + else + return val; +} + +template +__device__ __forceinline__ half reduce_op(half a, half b) { + if constexpr (Op == ReduceOp::SUM || Op == ReduceOp::MEAN) + return __hadd(a, b); + else if constexpr (Op == ReduceOp::MAX) + return __hmax(a, b); + else if constexpr (Op == ReduceOp::MIN) + return __hmin(a, b); +} + +template +__device__ __forceinline__ half warp_reduce(half val) { + #pragma unroll + for (int mask = kWarpSize >> 1; mask >= 1; mask >>= 1) { + if constexpr (Op == ReduceOp::SUM || Op == ReduceOp::MEAN) + val = __hadd(val, __shfl_down_sync(0xffffffff, val, mask)); + else if constexpr (Op == ReduceOp::MAX) + val = __hmax(val, __shfl_down_sync(0xffffffff, val, mask)); + else if constexpr (Op == ReduceOp::MIN) + val = __hmin(val, __shfl_down_sync(0xffffffff, val, mask)); + } + return val; +} + +template +__device__ __forceinline__ half finalize_result(half val, int count) { + if constexpr (Op == ReduceOp::MEAN) { + // 转换为float计算,然后转回half + float fval = __half2float(val); + float result = fval / static_cast(count); + return __float2half(result); + } else { + return val; + } +} + +template +__global__ void warp_final_reduce_kernel(float *temp_in, float *y, int num_blocks) { + int tid = threadIdx.x; + float thread_result = init_value(); + + for (int i = tid; i < num_blocks; i += blockDim.x) { + if constexpr (Op == ReduceOp::MAX) + thread_result = fmaxf(thread_result, temp_in[i]); + else if constexpr (Op == ReduceOp::MIN) + thread_result = fminf(thread_result, temp_in[i]); + } + thread_result = warp_reduce(thread_result); + + if (tid == 0) + y[0] = thread_result; +} + +template +__global__ void block_final_reduce_kernel(float *temp_in, float *y, int num_blocks) { + int tid = threadIdx.x; + __shared__ float s_data[256]; + + float thread_result = init_value(); + + for (int i = tid; i < num_blocks; i += blockDim.x) { + if constexpr (Op == ReduceOp::MAX) + thread_result = fmaxf(thread_result, temp_in[i]); + else if constexpr (Op == ReduceOp::MIN) + thread_result = fminf(thread_result, temp_in[i]); + } + + s_data[tid] = thread_result; + __syncthreads(); + + for (int stride = blockDim.x/2; stride > 0; stride >>= 1) { + if (tid < stride) { + if constexpr (Op == ReduceOp::MAX) + s_data[tid] = fmaxf(s_data[tid], s_data[tid + stride]); + else if constexpr (Op == ReduceOp::MIN) + s_data[tid] = fminf(s_data[tid], s_data[tid + 
stride]); + } + __syncthreads(); + } + + if (tid == 0) + y[0] = s_data[0]; +} + +template +__global__ void warp_final_reduce_kernel(half *temp_in, half *y, int num_blocks) { + int tid = threadIdx.x; + half thread_result = init_value(); + for (int i = tid; i < num_blocks; i += blockDim.x) { + if constexpr (Op == ReduceOp::MAX) + thread_result = __hmax(thread_result, temp_in[i]); + else if constexpr (Op == ReduceOp::MIN) + thread_result = __hmin(thread_result, temp_in[i]); + } + thread_result = warp_reduce(thread_result); + + if (tid == 0) + y[0] = thread_result; +} + +template +__global__ void block_final_reduce_kernel(half *temp_in, half *y, int num_blocks) { + int tid = threadIdx.x; + __shared__ half s_data[256]; + + half thread_result = init_value(); + + for (int i = tid; i < num_blocks; i += blockDim.x) { + if constexpr (Op == ReduceOp::MAX) + thread_result = __hmax(thread_result, temp_in[i]); + else if constexpr (Op == ReduceOp::MIN) + thread_result = __hmin(thread_result, temp_in[i]); + } + + s_data[tid] = thread_result; + __syncthreads(); + + for (int stride = blockDim.x/2; stride > 0; stride >>= 1) { + if (tid < stride) { + if constexpr (Op == ReduceOp::MAX) + s_data[tid] = __hmax(s_data[tid], s_data[tid + stride]); + else if constexpr (Op == ReduceOp::MIN) + s_data[tid] = __hmin(s_data[tid], s_data[tid + stride]); + } + __syncthreads(); + } + + if (tid == 0) + y[0] = s_data[0]; +} + +template +__global__ void blockall_reduce_f32x4_kernel(float *x, float *y, uint64_t N){ + int tid = threadIdx.x; + int base_idx = (tid + 64 * blockIdx.x) * 4; + constexpr int NUM_WARPS = (64 + WARP_SIZE - 1) / WARP_SIZE; + __shared__ float reduce_smem[NUM_WARPS]; + float result = init_value(); + if (base_idx < N){ + if (base_idx + 3 < N){ + float4 reg_x = FLOAT4(x[base_idx]); + if constexpr (Op == ReduceOp::SUM || Op == ReduceOp::MEAN) { + result += reg_x.x + reg_x.y + reg_x.z + reg_x.w; + } else if constexpr (Op == ReduceOp::MAX) { + result = fmaxf(result, fmaxf(fmaxf(reg_x.x, reg_x.y), fmaxf(reg_x.z, reg_x.w))); + } else if constexpr (Op == ReduceOp::MIN) { + result = fminf(result, fminf(fminf(reg_x.x, reg_x.y), fminf(reg_x.z, reg_x.w))); + } + } else { + result = reduce_op(result, x[base_idx]); + if (base_idx + 1 < N) result = reduce_op(result, x[base_idx + 1]); + if (base_idx + 2 < N) result = reduce_op(result, x[base_idx + 2]); + if (base_idx + 3 < N) result = reduce_op(result, x[base_idx + 3]); + } + } + const int warp_id = tid / WARP_SIZE; + const int lane = tid % WARP_SIZE; + result = warp_reduce(result); + if (lane == 0) { + reduce_smem[warp_id] = result; + } + __syncthreads(); + if (warp_id == 0) { + result = (lane < NUM_WARPS) ? 
reduce_smem[lane] : init_value(); + result = warp_reduce(result); + if (tid == 0) { + if constexpr (Op == ReduceOp::MEAN) { + atomicAdd(y, result); + } + else if constexpr (Op == ReduceOp::MAX || Op == ReduceOp::MIN) y[blockIdx.x] = result; + } + } +} + +// blockallreduce kernel for half(f16x8) using template operations +template +__global__ void blockall_reduce_f16x8_kernel(half *x, T *y, uint64_t N){ + int tid = threadIdx.x; + int base_idx = (tid + 32 * blockIdx.x) * 8; + constexpr int NUM_WARPS = (32 + WARP_SIZE - 1) / WARP_SIZE; + __shared__ T reduce_smem[NUM_WARPS]; + T result = init_value(); + if (base_idx < N){ + if (base_idx + 7 < N){ + half pack_x[8]; + LDST128BITS(pack_x[0]) = LDST128BITS(x[base_idx]); + #pragma unroll + for (int i = 0; i < 8; i++){ + if constexpr (std::is_same::value) { + result = reduce_op(result, pack_x[i]); + } else { + result = reduce_op(result, __half2float(pack_x[i])); + } + } + } else { + for (int i = 0; i < 8 && (base_idx + i) < N; i++){ + if constexpr (std::is_same::value) { + result = reduce_op(result, x[base_idx + i]); + } else { + result = reduce_op(result, __half2float(x[base_idx + i])); + } + } + } + } + const int warp_id = tid / WARP_SIZE; + const int lane = tid % WARP_SIZE; + + result = warp_reduce(result); + + if (lane == 0) { + reduce_smem[warp_id] = result; + } + + __syncthreads(); + + if (warp_id == 0){ + result = (lane < NUM_WARPS) ? reduce_smem[lane] : init_value(); + result = warp_reduce(result); + + if (tid == 0) { + if constexpr (Op == ReduceOp::MEAN && std::is_same::value) { + atomicAdd(y, result); + } + else if constexpr (Op == ReduceOp::MAX || Op == ReduceOp::MIN) y[blockIdx.x] = result; + } + } +} + +template +__global__ void reduce_f32x4_contigous_kernel( + float *x, float *y, + int prefix_size, int suffix_size, + uint64_t output_size, uint64_t reduce_size){ + + extern __shared__ float shared_mem_float[]; + int tid = threadIdx.x; + int output_idx = blockIdx.x; + int prefix_idx = output_idx / suffix_size; + int suffix_idx = output_idx % suffix_size; + + if (output_idx >= output_size) return; + float result = init_value(); + int base_idx = prefix_idx * reduce_size * suffix_size + suffix_idx; + // const int vector_reduce_size = reduce_size / 4; + // suffix_size equals to inner_stride + for (int i = tid; i < reduce_size; i += blockDim.x){ + result = reduce_op(result, x[base_idx + i * suffix_size]); + } + + shared_mem_float[tid] = result; + __syncthreads(); + for (int s = blockDim.x / 2; s >= 32; s >>= 1) { + if (tid < s) { + shared_mem_float[tid] = reduce_op(shared_mem_float[tid], shared_mem_float[tid + s]); + } + __syncthreads(); + } + if (tid < 32){ + result = shared_mem_float[tid]; + result = warp_reduce(result); + if (tid == 0) { + y[output_idx] = finalize_result(result, reduce_size); + + } + } +} + +template +__global__ void reduce_f16x8_contigous_kernel( + half *x, half *y, + int prefix_size, int suffix_size, + uint64_t output_size, uint64_t reduce_size){ + extern __shared__ float shared_mem_half[]; + int tid = threadIdx.x; + int output_idx = blockIdx.x; + int prefix_idx = output_idx / suffix_size; + int suffix_idx = output_idx % suffix_size; + + if (output_idx >= output_size) return; + float result = init_value(); + int base_idx = prefix_idx * reduce_size * suffix_size + suffix_idx; + for (int i = tid; i < reduce_size; i += blockDim.x){ + result = reduce_op(result, __half2float(x[base_idx + i * suffix_size])); + } + shared_mem_half[tid] = result; + __syncthreads(); + for (int s = blockDim.x / 2; s >= 32; s >>= 1) { + if (tid < s) 
{ + shared_mem_half[tid] = reduce_op(shared_mem_half[tid], shared_mem_half[tid + s]); + } + __syncthreads(); + } + if (tid < 32){ + result = shared_mem_half[tid]; + result = warp_reduce(result); + // each block has one result + if (tid == 0) { + float fresult = finalize_result(result, reduce_size); + y[output_idx] = __float2half(fresult); + } + } +} +/* +[2, 3, 4, 5] tensor axes = [0, ] -> output tensor [1, 3, 1, 5] +output_idx = 0 + + +*/ +template +__global__ void reduce_f32_kernel( + float *x, float *y, + const int64_t *x_strides, const int64_t *y_strides, + const uint64_t reduce_size, const uint64_t output_size, + const int64_t *non_reduce_axes, + const uint64_t axes_size, + const uint64_t *reduce_axes_stride, + const int64_t *reduce_axes, const int64_t INPUT_NDIM, + const uint64_t element_num +){ + extern __shared__ float shared_mem_float[]; + int tid = threadIdx.x; + int output_idx = blockIdx.x; + if (output_idx >= output_size) return; + float result = init_value(); + uint64_t out_coords[10] = {0}; + if constexpr (KeepDims) { + for (int i = 0; i < INPUT_NDIM - axes_size; i++) { + out_coords[non_reduce_axes[i]] = output_idx / y_strides[non_reduce_axes[i]]; + output_idx %= y_strides[non_reduce_axes[i]]; + } + } else { + for (int i = 0; i < INPUT_NDIM - axes_size; i++) { + out_coords[non_reduce_axes[i]] = output_idx / y_strides[i]; + output_idx %= y_strides[i]; + } + } + uint64_t baseoffset = 0; + for (int i = 0; i < INPUT_NDIM; i++) { + baseoffset += out_coords[i] * x_strides[i]; + } + for (uint64_t i = tid; i < reduce_size; i += blockDim.x){ + uint64_t offset = baseoffset; + uint64_t remaining = i; + for (int j = 0; j < axes_size; j++) { + const uint64_t axis = reduce_axes[j]; + const uint64_t coord = remaining / reduce_axes_stride[j]; + offset += coord * x_strides[axis]; + remaining %= reduce_axes_stride[j]; + } + + result = reduce_op(result, x[offset]); + + } + shared_mem_float[tid] = result; + __syncthreads(); + for (int s = blockDim.x / 2; s >= 32; s >>= 1) { + if (tid < s) { + shared_mem_float[tid] = reduce_op(shared_mem_float[tid], shared_mem_float[tid + s]); + } + __syncthreads(); + } + if (tid < 32){ + result = shared_mem_float[tid]; + result = warp_reduce(result); + if (tid == 0) { + y[blockIdx.x] = finalize_result(result, reduce_size); + } + } +} + + +template +infiniopStatus_t reduce_nv_gpu( + ReduceCudaDescriptor_t desc, + void *x, + void *y, + void *stream){ + uint64_t N = desc->element_num; + if (desc->reduce_mode == 0){ + if constexpr(std::is_same::value){ + dim3 block(256 / 4); + dim3 grid((N + block.x * 4 - 1) / (block.x * 4)); + if (desc->reduce_op_type == 0) { + blockall_reduce_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), N); + divide_by_n_kernel<<<1, 1, 0, (cudaStream_t)stream>>>( + reinterpret_cast(y), N); + } else if (desc->reduce_op_type == 1 || desc->reduce_op_type == 2) { + float* temp_buffer; + cudaMalloc(&temp_buffer, grid.x * sizeof(float)); + if (desc->reduce_op_type == 1) { + blockall_reduce_f32x4_kernel<<>>(reinterpret_cast(x), temp_buffer, N); + } else if (desc->reduce_op_type == 2) { + blockall_reduce_f32x4_kernel<<>>(reinterpret_cast(x), temp_buffer, N); + } + int num_blocks = grid.x; + if (num_blocks <= 32) { + if (desc->reduce_op_type == 1) { // MAX + warp_final_reduce_kernel<<<1, 32, 0, (cudaStream_t)stream>>>( + temp_buffer, reinterpret_cast(y), num_blocks); + } else { + warp_final_reduce_kernel<<<1, 32, 0, (cudaStream_t)stream>>>( + temp_buffer, reinterpret_cast(y), num_blocks); + } + } else { + if (desc->reduce_op_type == 1) { + 
block_final_reduce_kernel<<<1, 256, 0, (cudaStream_t)stream>>>( + temp_buffer, reinterpret_cast(y), num_blocks); + } else { + block_final_reduce_kernel<<<1, 256, 0, (cudaStream_t)stream>>>( + temp_buffer, reinterpret_cast(y), num_blocks); + } + } + cudaFree(temp_buffer); + } + } else { + dim3 block(256 / 8); + dim3 grid((N + block.x * 8 - 1) / (block.x * 8)); + if (desc->reduce_op_type == 0) { + float* temp_buffer; + cudaMalloc(&temp_buffer, sizeof(float)); + blockall_reduce_f16x8_kernel<<>>(reinterpret_cast(x), temp_buffer, N); + divide_by_n_kernel<<<1, 1, 0, (cudaStream_t)stream>>>( + reinterpret_cast(y), temp_buffer, N); + } else if (desc->reduce_op_type == 1 || desc->reduce_op_type == 2) { + half *temp_buffer; + cudaMalloc(&temp_buffer, grid.x * sizeof(half)); + if (desc->reduce_op_type == 1) { + blockall_reduce_f16x8_kernel<<>>(reinterpret_cast(x), reinterpret_cast(temp_buffer), N); + } else if (desc->reduce_op_type == 2) { + blockall_reduce_f16x8_kernel<<>>(reinterpret_cast(x), reinterpret_cast(temp_buffer), N); + } + int num_blocks = grid.x; + if (num_blocks <= 32) { + if (desc->reduce_op_type == 1) { + warp_final_reduce_kernel<<<1, 32, 0, (cudaStream_t)stream>>>( + temp_buffer, reinterpret_cast(y), num_blocks); + } else { + warp_final_reduce_kernel<<<1, 32, 0, (cudaStream_t)stream>>>( + temp_buffer, reinterpret_cast(y), num_blocks); + } + } else { + if (desc->reduce_op_type == 1) { + block_final_reduce_kernel<<<1, 256, 0, (cudaStream_t)stream>>>( + temp_buffer, reinterpret_cast(y), num_blocks); + } else { + block_final_reduce_kernel<<<1, 256, 0, (cudaStream_t)stream>>>( + temp_buffer, reinterpret_cast(y), num_blocks); + } + } + cudaFree(temp_buffer); + } + } + } + // contiguous axes + else if (desc->reduce_mode == 1){ + if constexpr(std::is_same::value){ + dim3 block(128); + dim3 grid(desc->output_size); + size_t shared_mem_size = block.x * sizeof(float); + if (desc->reduce_op_type == 0) { + reduce_f32x4_contigous_kernel<<>>( + reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); + } else if (desc->reduce_op_type == 1) { + reduce_f32x4_contigous_kernel<<>>( + reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); + } else if (desc->reduce_op_type == 2) { + reduce_f32x4_contigous_kernel<<>>( + reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); + } + } else { + dim3 block(128); + dim3 grid(desc->output_size); + size_t shared_mem_size = block.x * sizeof(half); + if (desc->reduce_op_type == 0) { + reduce_f16x8_contigous_kernel<<>>( + reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); + }else if (desc->reduce_op_type == 1) { + reduce_f16x8_contigous_kernel<<>>( + reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); + } else if (desc->reduce_op_type == 2) { + reduce_f16x8_contigous_kernel<<>>( + reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); + } + } + } + // not contiguous axes + else if (desc->reduce_mode == 2){ + dim3 block(128); + dim3 grid(desc->output_size); + size_t shared_mem_size = block.x * sizeof(float); + if constexpr(std::is_same::value){ + if (desc->keepdims){ + if (desc->reduce_op_type == 0) { + reduce_f32_kernel<<>>( + reinterpret_cast(x), reinterpret_cast(y), + 
desc->input_strides, desc->output_strides, desc->reduce_size, desc->output_size, + desc->non_reduce_axes, desc->axes_size, desc->reduce_axes_stride, desc->reduce_axes, desc->input_ndim, desc->element_num); + } + else if (desc->reduce_op_type == 1) { + reduce_f32_kernel<<>>( + reinterpret_cast(x), reinterpret_cast(y), + desc->input_strides, desc->output_strides, desc->reduce_size, desc->output_size, + desc->non_reduce_axes, desc->axes_size, desc->reduce_axes_stride, desc->reduce_axes, desc->input_ndim, desc->element_num); + } + else if (desc->reduce_op_type == 2) { + reduce_f32_kernel<<>>( + reinterpret_cast(x), reinterpret_cast(y), + desc->input_strides, desc->output_strides, desc->reduce_size, desc->output_size, + desc->non_reduce_axes, desc->axes_size, desc->reduce_axes_stride, desc->reduce_axes, desc->input_ndim, desc->element_num); + } + }else{ + if (desc->reduce_op_type == 0) { + reduce_f32_kernel<<>>( + reinterpret_cast(x), reinterpret_cast(y), + desc->input_strides, desc->output_strides, desc->reduce_size, desc->output_size, + desc->non_reduce_axes, desc->axes_size, desc->reduce_axes_stride, desc->reduce_axes, desc->input_ndim, desc->element_num); + } + else if (desc->reduce_op_type == 1) { + reduce_f32_kernel<<>>( + reinterpret_cast(x), reinterpret_cast(y), + desc->input_strides, desc->output_strides, desc->reduce_size, desc->output_size, + desc->non_reduce_axes, desc->axes_size, desc->reduce_axes_stride, desc->reduce_axes, desc->input_ndim, desc->element_num); + } + else if (desc->reduce_op_type == 2) { + reduce_f32_kernel<<>>( + reinterpret_cast(x), reinterpret_cast(y), + desc->input_strides, desc->output_strides, desc->reduce_size, desc->output_size, + desc->non_reduce_axes, desc->axes_size, desc->reduce_axes_stride, desc->reduce_axes, desc->input_ndim, desc->element_num); + } + } + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaReduce( + ReduceCudaDescriptor_t desc, + void *y, + void *x, + void *stream){ + if (desc->dtype == F16) { + return reduce_nv_gpu(desc, x, y, stream); + } + if (desc->dtype == F32) { + return reduce_nv_gpu(desc, x, y, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/reduce/cuda/reduce_cuda.h b/src/ops/reduce/cuda/reduce_cuda.h new file mode 100644 index 00000000..3f433da2 --- /dev/null +++ b/src/ops/reduce/cuda/reduce_cuda.h @@ -0,0 +1,61 @@ +#ifndef __CUDA_REDUCE_H__ +#define __CUDA_REDUCE_H__ + +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include + +typedef struct ReduceCudaDescriptor { + Device device; + DT dtype; + uint64_t input_ndim; + uint64_t output_ndim; + int64_t *non_reduce_axes; + int64_t *input_strides; + int64_t *output_strides; + uint64_t *input_shape; + uint64_t *output_shape; + int64_t *reduce_axes; + uint64_t *reduce_axes_stride; + uint64_t reduce_size; + uint64_t element_num; + uint64_t output_size; + int reduce_op_type; + int reduce_mode; // output_size * reduce_size = element_num + uint64_t axes_size; + bool keepdims; + int64_t start_axis; + int64_t end_axis; + int prefix_size; + int suffix_size; +} ReduceCudaDescriptor; + +typedef struct ReduceCudaDescriptor *ReduceCudaDescriptor_t; + + +enum ReduceOpType { + REDUCE_SUM = 0, + REDUCE_MIN = 1, + REDUCE_MAX = 2, + REDUCE_MEAN = 3 +}; + +infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle, + ReduceCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y, + int64_t const *axes, + uint64_t axes_size, + int reduce_op_type, + bool keepdims + ); + + 
+infiniopStatus_t cudaReduce(ReduceCudaDescriptor_t desc, + void *y, + void *x, + void *stream); + +infiniopStatus_t cudaDestroyReduceDescriptor(ReduceCudaDescriptor_t desc); + +#endif// __CUDA_MATMUL_H__ diff --git a/src/ops/reduce/operator.cc b/src/ops/reduce/operator.cc index eaa713df..d5c70703 100644 --- a/src/ops/reduce/operator.cc +++ b/src/ops/reduce/operator.cc @@ -6,6 +6,10 @@ #include "cpu/reduce_cpu.h" #endif +#ifdef ENABLE_NV_GPU +#include "cuda/reduce_cuda.h" +#endif + __C infiniopStatus_t infiniopCreateReduceDescriptor( infiniopHandle_t handle, @@ -21,16 +25,24 @@ __C infiniopStatus_t infiniopCreateReduceDescriptor( #ifdef ENABLE_CPU case DevCpu: return cpuCreateReduceDescriptor(handle, (ReduceCpuDescriptor_t *) desc_ptr, y, x, axes, n, reduce_type, noop_with_empty_axes, keepdims); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaCreateReduceDescriptor((CudaHandle_t)handle, (ReduceCudaDescriptor_t *) desc_ptr, y, x, axes, n, reduce_type, keepdims); #endif } return STATUS_BAD_DEVICE; } -__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: return cpuReduce((ReduceCpuDescriptor_t) desc, y, x, dynamic_axes, dynamic_axes_size, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaReduce((ReduceCudaDescriptor_t) desc, y, x, stream); #endif } return STATUS_BAD_DEVICE; @@ -41,7 +53,12 @@ __C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t #ifdef ENABLE_CPU case DevCpu: return cpuDestroyReduceDescriptor((ReduceCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaDestroyReduceDescriptor((ReduceCudaDescriptor_t) desc); #endif } + return STATUS_BAD_DEVICE; } \ No newline at end of file diff --git a/src/ops/reduce/reduce.h b/src/ops/reduce/reduce.h index a667d533..af5e1964 100644 --- a/src/ops/reduce/reduce.h +++ b/src/ops/reduce/reduce.h @@ -20,7 +20,7 @@ __C infiniopStatus_t infiniopCreateReduceDescriptor(infiniopHandle_t handle, bool noop_with_empty_axes, int reduce_type); -__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream); +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); __C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc); #endif \ No newline at end of file diff --git a/src/ops/reducemax/operator.cc b/src/ops/reducemax/operator.cc index 921b6682..53951265 100644 --- a/src/ops/reducemax/operator.cc +++ b/src/ops/reducemax/operator.cc @@ -28,7 +28,7 @@ __C __export infiniopStatus_t infiniopCreateReducemaxDescriptor( return STATUS_SUCCESS; } -__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { +__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { auto _desc = (_ReducemaxDescriptor_t) desc; CHECK_STATUS(infiniopReduce(_desc->reduce_desc, y, x, dynamic_axes, dynamic_axes_size, stream), STATUS_SUCCESS); return 
STATUS_SUCCESS; diff --git a/src/ops/reducemean/operator.cc b/src/ops/reducemean/operator.cc index ab322da7..aaeeb9b9 100644 --- a/src/ops/reducemean/operator.cc +++ b/src/ops/reducemean/operator.cc @@ -28,7 +28,7 @@ __C __export infiniopStatus_t infiniopCreateReducemeanDescriptor( return STATUS_SUCCESS; } -__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { +__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { auto _desc = (_ReducemeanDescriptor_t) desc; CHECK_STATUS(infiniopReduce(_desc->reduce_desc, y, x, dynamic_axes, dynamic_axes_size, stream), STATUS_SUCCESS); return STATUS_SUCCESS; diff --git a/src/ops/reducemin/operator.cc b/src/ops/reducemin/operator.cc index 9907d8e5..3a66a218 100644 --- a/src/ops/reducemin/operator.cc +++ b/src/ops/reducemin/operator.cc @@ -27,7 +27,7 @@ __C __export infiniopStatus_t infiniopCreateReduceminDescriptor( }; return STATUS_SUCCESS; } -__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { +__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { auto _desc = (_ReduceminDescriptor_t) desc; CHECK_STATUS(infiniopReduce(_desc->reduce_desc, y, x, dynamic_axes, dynamic_axes_size, stream), STATUS_SUCCESS); return STATUS_SUCCESS; diff --git a/src/ops/where/cpu/where_cpu.cc b/src/ops/where/cpu/where_cpu.cc index 11a1a2d4..30a35688 100644 --- a/src/ops/where/cpu/where_cpu.cc +++ b/src/ops/where/cpu/where_cpu.cc @@ -9,6 +9,9 @@ infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t handle, infiniopTensorDescriptor_t src2, infiniopTensorDescriptor_t condition ){ + if (!isValidBroadcastShape(dst, src1) || !isValidBroadcastShape(dst, src2) || !isValidBroadcastShape(dst, condition)) { + return STATUS_BAD_TENSOR_SHAPE; + } if (dst->dt != F16 && dst->dt != F32) { return STATUS_BAD_TENSOR_DTYPE; } @@ -31,47 +34,41 @@ infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t handle, uint64_t *src2_shape = new uint64_t[dst->ndim]; uint64_t *condition_shape = new uint64_t[dst->ndim]; uint64_t *dst_shape = new uint64_t[dst->ndim]; + int64_t *dst_strides = new int64_t[dst->ndim]; + int64_t *src1_strides = new int64_t[src1->ndim]; + int64_t *src2_strides = new int64_t[src2->ndim]; + int64_t *condition_strides = new int64_t[condition->ndim]; uint64_t dst_ndim = dst->ndim; uint64_t element_num = 1; for (uint64_t i = 0; i < dst->ndim; i++) { element_num *= dst->shape[i]; } + memcpy(src1_shape, src1->shape, src1->ndim * sizeof(uint64_t)); memcpy(src2_shape, src2->shape, src2->ndim * sizeof(uint64_t)); - memcpy(condition_shape, condition->shape, condition->ndim * sizeof(uint8_t)); - //扩展到dst的维度 - for (uint64_t i = 0; i < dst->ndim; i++) { - if (i < src1->ndim){ - src1_shape[i] = src1->shape[i]; - } - else{ - src1_shape[i] = 1; - } - if (i < src2->ndim){ - src2_shape[i] = src2->shape[i]; - } - else{ - src2_shape[i] = 1; - } - if (i < condition->ndim){ - condition_shape[i] = condition->shape[i]; - } - else{ - condition_shape[i] = 1; - } - } + memcpy(condition_shape, condition->shape, condition->ndim * sizeof(uint64_t)); memcpy(dst_shape, dst->shape, dst->ndim * sizeof(uint64_t)); + memcpy(dst_strides, 
dst->strides, dst->ndim * sizeof(int64_t)); + memcpy(src1_strides, src1->strides, src1->ndim * sizeof(int64_t)); + memcpy(src2_strides, src2->strides, src2->ndim * sizeof(int64_t)); + memcpy(condition_strides, condition->strides, condition->ndim * sizeof(int64_t)); *desc_ptr = new WhereCpuDescriptor{ - DevCpu, + handle->device, dst->dt, src1_shape, src2_shape, condition_shape, dst_shape, dst_strides, + src1_strides, + src2_strides, + condition_strides, dst_ndim, + src1->ndim, + src2->ndim, + condition->ndim, element_num }; return STATUS_SUCCESS; @@ -83,9 +80,33 @@ infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc){ delete[] desc->condition_shape; delete[] desc->dst_shape; delete[] desc->dst_strides; + delete[] desc->src1_strides; + delete[] desc->src2_strides; + delete[] desc->condition_strides; delete desc; return STATUS_SUCCESS; } +inline uint64_t broadcast_map( + uint64_t idx, + const uint64_t* dst_shape, + uint64_t dst_ndim, + const uint64_t* input_shape, + const int64_t* input_strides, + uint64_t input_ndim +) { + uint64_t index = 0; + const int offset = dst_ndim - input_ndim; + for (int i = dst_ndim - 1; i >= 0; idx /= dst_shape[i--]) { + const uint64_t coord = idx % dst_shape[i]; + if (i >= offset) { + const int dim = i - offset; + const uint64_t size = input_shape[dim]; + index += (size == 1 ? 0 : coord % size) * input_strides[dim]; + } + } + return index; +} + template infiniopStatus_t where_cpu(WhereCpuDescriptor_t desc, @@ -98,31 +119,11 @@ infiniopStatus_t where_cpu(WhereCpuDescriptor_t desc, auto src1_ = reinterpret_cast(src1); auto src2_ = reinterpret_cast(src2); auto condition_ = reinterpret_cast(condition); - auto ndim = desc->dst_ndim; - auto dst_strides = desc->dst_strides; - auto condition_shape = desc->condition_shape; - auto src1_shape = desc->src1_shape; - auto src2_shape = desc->src2_shape; #pragma omp parallel for for (uint64_t i = 0; i < desc->element_num; i++) { - std::vector indices(ndim, 0); - int global_index = i; - for (int z = 0; z <= ndim - 1; z++){ - indices[z] = global_index / dst_strides[z]; - global_index %= dst_strides[z]; - } - uint64_t condition_index = 0, src1_index = 0, src2_index = 0; - for (int j = 0; j < ndim; j++) { - if (condition_shape[j] != 1){ - condition_index = condition_index * condition_shape[j] + indices[j]; - } - if (src1_shape[j] != 1){ - src1_index = src1_index * src1_shape[j] + indices[j]; - } - if (src2_shape[j] != 1){ - src2_index = src2_index * src2_shape[j] + indices[j]; - } - } + uint64_t condition_index = broadcast_map(i, desc->dst_shape, desc->dst_ndim, desc->condition_shape, desc->condition_strides, desc->condition_ndim); + uint64_t src1_index = broadcast_map(i, desc->dst_shape, desc->dst_ndim, desc->src1_shape, desc->src1_strides, desc->src1_ndim); + uint64_t src2_index = broadcast_map(i, desc->dst_shape, desc->dst_ndim, desc->src2_shape, desc->src2_strides, desc->src2_ndim); dst_[i] = condition_[condition_index] ? 
src1_[src1_index] : src2_[src2_index]; } return STATUS_SUCCESS; diff --git a/src/ops/where/cpu/where_cpu.h b/src/ops/where/cpu/where_cpu.h index 9fbc5922..4e647b4f 100644 --- a/src/ops/where/cpu/where_cpu.h +++ b/src/ops/where/cpu/where_cpu.h @@ -10,7 +10,13 @@ struct WhereCpuDescriptor { uint64_t const *condition_shape; uint64_t const *dst_shape; int64_t const *dst_strides; + int64_t const *src1_strides; + int64_t const *src2_strides; + int64_t const *condition_strides; uint64_t dst_ndim; + uint64_t src1_ndim; + uint64_t src2_ndim; + uint64_t condition_ndim; uint64_t element_num; }; diff --git a/src/ops/where/cuda/where_cuda.cc b/src/ops/where/cuda/where_cuda.cc new file mode 100644 index 00000000..35701773 --- /dev/null +++ b/src/ops/where/cuda/where_cuda.cc @@ -0,0 +1,120 @@ +#include "where_cuda.h" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateWhereDescriptor(CudaHandle_t handle, + WhereCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src1, + infiniopTensorDescriptor_t src2, + infiniopTensorDescriptor_t condition + ) { + if (!isValidBroadcastShape(dst, src1) || !isValidBroadcastShape(dst, src2) || !isValidBroadcastShape(dst, condition)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (dst->dt != F16 && dst->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (src1->dt != F16 && src1->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (src2->dt != F16 && src2->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (condition->dt != U8) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(dst) || !is_contiguous(src1) || !is_contiguous(src2) || !is_contiguous(condition)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (dst->dt != src1->dt || dst->dt != src2->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + uint64_t *src1_shape = new uint64_t[dst->ndim]; + uint64_t *src2_shape = new uint64_t[dst->ndim]; + uint64_t *condition_shape = new uint64_t[dst->ndim]; + uint64_t *dst_shape = new uint64_t[dst->ndim]; + + int64_t *dst_strides = new int64_t[dst->ndim]; + int64_t *src1_strides = new int64_t[src1->ndim]; + int64_t *src2_strides = new int64_t[src2->ndim]; + int64_t *condition_strides = new int64_t[condition->ndim]; + uint64_t dst_ndim = dst->ndim; + uint64_t element_num = 1; + for (uint64_t i = 0; i < dst->ndim; i++) { + element_num *= dst->shape[i]; + } + + memcpy(src1_shape, src1->shape, src1->ndim * sizeof(uint64_t)); + memcpy(src2_shape, src2->shape, src2->ndim * sizeof(uint64_t)); + memcpy(condition_shape, condition->shape, condition->ndim * sizeof(uint64_t)); + memcpy(dst_shape, dst->shape, dst->ndim * sizeof(uint64_t)); + + memcpy(dst_strides, dst->strides, dst->ndim * sizeof(int64_t)); + memcpy(src1_strides, src1->strides, src1->ndim * sizeof(int64_t)); + memcpy(src2_strides, src2->strides, src2->ndim * sizeof(int64_t)); + memcpy(condition_strides, condition->strides, condition->ndim * sizeof(int64_t)); + + uint64_t *src1_device_shape, *src2_device_shape, *condition_device_shape, *dst_device_shape; + int64_t *dst_device_strides, *src1_device_strides, *src2_device_strides, *condition_device_strides; + checkCudaErrorWithCode(cudaMalloc((void**)&src1_device_shape, src1->ndim * sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&src2_device_shape, src2->ndim * sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&condition_device_shape, condition->ndim * sizeof(uint64_t)), 
STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&dst_device_shape, dst->ndim * sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED); + + checkCudaErrorWithCode(cudaMalloc((void**)&dst_device_strides, dst->ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&src1_device_strides, src1->ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&src2_device_strides, src2->ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&condition_device_strides, condition->ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + + checkCudaErrorWithCode(cudaMemcpy(src1_device_shape, src1_shape, src1->ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(src2_device_shape, src2_shape, src2->ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(condition_device_shape, condition_shape, condition->ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(dst_device_shape, dst->shape, dst->ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + + checkCudaErrorWithCode(cudaMemcpy(dst_device_strides, dst_strides, dst->ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(src1_device_strides, src1_strides, src1->ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(src2_device_strides, src2_strides, src2->ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(condition_device_strides, condition_strides, condition->ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + + *desc_ptr = new WhereCudaDescriptor{ + handle->device, + dst->dt, + src1_device_shape, + src2_device_shape, + condition_device_shape, + dst_device_shape, + dst_device_strides, + src1_device_strides, + src2_device_strides, + condition_device_strides, + dst_ndim, + src1->ndim, + src2->ndim, + condition->ndim, + element_num + }; + delete[] src1_shape; + delete[] src2_shape; + delete[] condition_shape; + delete[] dst_shape; + delete[] dst_strides; + delete[] src1_strides; + delete[] src2_strides; + delete[] condition_strides; + return STATUS_SUCCESS; +} + + +infiniopStatus_t cudaDestroyWhereDescriptor(WhereCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree((void*)desc->src1_shape), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->src2_shape), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->condition_shape), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->dst_shape), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->dst_strides), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->src1_strides), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->src2_strides), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->condition_strides), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/where/cuda/where_cuda.cu b/src/ops/where/cuda/where_cuda.cu new file mode 100644 index 00000000..df2d3277 --- /dev/null +++ b/src/ops/where/cuda/where_cuda.cu @@ -0,0 +1,184 @@ +#include "../../../devices/cuda/cuda_handle.h" +#include 
"../../utils.h" +#include "where_cuda.h" +#include + +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) +#define FLOAT4(value) (reinterpret_cast(&(value))[0]) + +__device__ __forceinline__ uint64_t broadcast_map( + uint64_t idx, + const uint64_t* dst_shape, + uint64_t dst_ndim, + const uint64_t* input_shape, + const int64_t* input_strides, + uint64_t input_ndim +) { + uint64_t index = 0; + const int offset = dst_ndim - input_ndim; + for (int i = dst_ndim - 1; i >= 0; idx /= dst_shape[i--]) { + const uint64_t coord = idx % dst_shape[i]; + if (i >= offset) { + const int dim = i - offset; + const uint64_t size = input_shape[dim]; + index += (size == 1 ? 0 : coord % size) * input_strides[dim]; + } + } + return index; +} + + + +__global__ void where_f32x4_kernel(float* src1, uint64_t const *src1_shape, + float* src2, uint64_t const *src2_shape, + float* dst, uint64_t const *dst_shape, + uint8_t* condition, uint64_t const *condition_shape, + uint64_t dst_ndim, uint64_t src1_ndim, + uint64_t src2_ndim, uint64_t condition_ndim, + int64_t const *condition_strides, + int64_t const *src1_strides, + int64_t const *src2_strides, + int64_t const *dst_strides, + int N) { + + int idx_base = 4 * (blockIdx.x * blockDim.x + threadIdx.x); + if (idx_base >= N) return; + bool valid[4] = { + (idx_base + 0) < N, + (idx_base + 1) < N, + (idx_base + 2) < N, + (idx_base + 3) < N + }; + uint64_t condition_offset[4], src1_offset[4], src2_offset[4]; + float4 reg_dst; + #pragma unroll + for (int i = 0; i < 4; ++i) { + if (valid[i]) { + uint64_t idx = idx_base + i; + condition_offset[i] = broadcast_map(idx, dst_shape, dst_ndim, condition_shape, + condition_strides, condition_ndim); + src1_offset[i] = broadcast_map(idx, dst_shape, dst_ndim, src1_shape, + src1_strides, src1_ndim); + src2_offset[i] = broadcast_map(idx, dst_shape, dst_ndim, src2_shape, + src2_strides, src2_ndim); + float val = condition[condition_offset[i]] ? src1[src1_offset[i]] : src2[src2_offset[i]]; + ((float*)®_dst)[i] = val; + } + } + if (valid[0] && valid[1] && valid[2] && valid[3]) { + FLOAT4(dst[idx_base]) = reg_dst; + } else { + for (int i = 0; i < 4; ++i) { + if (valid[i]) { + dst[idx_base + i] = ((float*)®_dst)[i]; + } + } + } +} + +__global__ void where_f16x8_kernel(half* src1, uint64_t const *src1_shape, + half* src2, uint64_t const *src2_shape, + half* dst, uint64_t const *dst_shape, + uint8_t* condition, uint64_t const *condition_shape, + uint64_t dst_ndim, uint64_t src1_ndim, + uint64_t src2_ndim, uint64_t condition_ndim, + int64_t const *condition_strides, + int64_t const *src1_strides, + int64_t const *src2_strides, + int64_t const *dst_strides, + int N){ + int idx_base = 8 * (blockIdx.x * blockDim.x + threadIdx.x); + if (idx_base >= N) return; + int64_t condition_offset[8], src1_offset[8], src2_offset[8]; + half pack_dst[8]; + bool valid[8] = { + (idx_base + 0) < N, + (idx_base + 1) < N, + (idx_base + 2) < N, + (idx_base + 3) < N, + (idx_base + 4) < N, + (idx_base + 5) < N, + (idx_base + 6) < N, + (idx_base + 7) < N + }; + #pragma unroll + for (int i = 0; i < 8; i++){ + if (valid[i]){ + uint64_t idx = idx_base + i; + condition_offset[i] = broadcast_map(idx, dst_shape, dst_ndim, condition_shape, + condition_strides, condition_ndim); + src1_offset[i] = broadcast_map(idx, dst_shape, dst_ndim, src1_shape, + src1_strides, src1_ndim); + src2_offset[i] = broadcast_map(idx, dst_shape, dst_ndim, src2_shape, + src2_strides, src2_ndim); + pack_dst[i] = condition[condition_offset[i]] ? 
src1[src1_offset[i]] : src2[src2_offset[i]]; + } + } + if (valid[0] && valid[1] && valid[2] && valid[3] && valid[4] && valid[5] && valid[6] && valid[7]){ + LDST128BITS(dst[idx_base]) = LDST128BITS(pack_dst[0]); + } else { + for (int i = 0; i < 8; i++){ + if (valid[i]){ + dst[idx_base + i] = pack_dst[i]; + } + } + } +} + +template +infiniopStatus_t where_nv_gpu( + WhereCudaDescriptor_t desc, + void *dst, + void *src1, + void *src2, + void *condition, + int per_thread_element, + void *stream){ + uint64_t N = desc->element_num; + dim3 block(256 / per_thread_element); + dim3 grid((N + 256 - 1) / 256); + if constexpr(std::is_same::value){ + where_f32x4_kernel<<>>( + reinterpret_cast(src1), desc->src1_shape, + reinterpret_cast(src2), desc->src2_shape, + reinterpret_cast(dst), desc->dst_shape, + reinterpret_cast(condition), desc->condition_shape, + desc->dst_ndim, desc->src1_ndim, + desc->src2_ndim, desc->condition_ndim, + desc->condition_strides, + desc->src1_strides, + desc->src2_strides, + desc->dst_strides, + N); + } else if constexpr(std::is_same::value){ + where_f16x8_kernel<<>>( + reinterpret_cast(src1), desc->src1_shape, + reinterpret_cast(src2), desc->src2_shape, + reinterpret_cast(dst), desc->dst_shape, + reinterpret_cast(condition), desc->condition_shape, + desc->dst_ndim, desc->src1_ndim, + desc->src2_ndim, desc->condition_ndim, + desc->condition_strides, + desc->src1_strides, + desc->src2_strides, + desc->dst_strides, + N); + } + return STATUS_SUCCESS; +} + + +infiniopStatus_t cudaWhere(WhereCudaDescriptor_t desc, + void *dst, + void *src1, + void *src2, + void *condition, + void *stream){ + if (desc->dtype == F16){ + return where_nv_gpu(desc, dst, src1, src2, condition, 8, stream); + } + if (desc->dtype == F32){ + return where_nv_gpu(desc, dst, src1, src2, condition, 4, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/where/cuda/where_cuda.h b/src/ops/where/cuda/where_cuda.h new file mode 100644 index 00000000..38c62734 --- /dev/null +++ b/src/ops/where/cuda/where_cuda.h @@ -0,0 +1,46 @@ +#ifndef __CUDA_WHERE_H__ +#define __CUDA_WHERE_H__ + +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include + +typedef struct WhereCudaDescriptor { + Device device; + DT dtype; + uint64_t const *src1_shape; + uint64_t const *src2_shape; + uint64_t const *condition_shape; + uint64_t const *dst_shape; + int64_t const *dst_strides; + int64_t const *src1_strides; + int64_t const *src2_strides; + int64_t const *condition_strides; + uint64_t dst_ndim; + uint64_t src1_ndim; + uint64_t src2_ndim; + uint64_t condition_ndim; + uint64_t element_num; +} WhereCudaDescriptor; + +typedef struct WhereCudaDescriptor *WhereCudaDescriptor_t; + +infiniopStatus_t cudaCreateWhereDescriptor(CudaHandle_t handle, + WhereCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src1, + infiniopTensorDescriptor_t src2, + infiniopTensorDescriptor_t condition + ); + + +infiniopStatus_t cudaWhere(WhereCudaDescriptor_t desc, + void *dst, + void *src1, + void *src2, + void *condition, + void *stream); + +infiniopStatus_t cudaDestroyWhereDescriptor(WhereCudaDescriptor_t desc); + +#endif diff --git a/src/ops/where/operator.cc b/src/ops/where/operator.cc index fe94ffcb..5483803c 100644 --- a/src/ops/where/operator.cc +++ b/src/ops/where/operator.cc @@ -6,6 +6,10 @@ #include "cpu/where_cpu.h" #endif +#ifdef ENABLE_NV_GPU +#include "cuda/where_cuda.h" +#endif + __C infiniopStatus_t infiniopCreateWhereDescriptor( 
infiniopHandle_t handle, @@ -19,6 +23,11 @@ __C infiniopStatus_t infiniopCreateWhereDescriptor( #ifdef ENABLE_CPU case DevCpu: return cpuCreateWhereDescriptor(handle, (WhereCpuDescriptor_t *) desc_ptr, dst, src1, src2, condition); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateWhereDescriptor((CudaHandle_t) handle, (WhereCudaDescriptor_t *) desc_ptr, dst, src1, src2, condition); + } #endif } return STATUS_BAD_DEVICE; @@ -30,6 +39,10 @@ __C infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, void *dst, vo #ifdef ENABLE_CPU case DevCpu: return cpuWhere((WhereCpuDescriptor_t) desc, dst, src1, src2, condition, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaWhere((WhereCudaDescriptor_t) desc, dst, src1, src2, condition, stream); #endif } return STATUS_BAD_DEVICE; @@ -40,7 +53,11 @@ __C infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t de #ifdef ENABLE_CPU case DevCpu: return cpuDestroyWhereDescriptor((WhereCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaDestroyWhereDescriptor((WhereCudaDescriptor_t) desc); #endif } - + return STATUS_BAD_DEVICE; } \ No newline at end of file From d2c863c1c9e7a0d7def5ec930dfee748f63d6166 Mon Sep 17 00:00:00 2001 From: Graylatzhou <1391087899@qq.com> Date: Mon, 17 Mar 2025 21:20:43 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20const=20=E4=BF=AE?= =?UTF-8?q?=E9=A5=B0=E7=AC=A6=E5=88=B0=E7=AE=97=E5=AD=90=E8=BE=93=E5=85=A5?= =?UTF-8?q?=E5=B9=B6=E5=AE=9E=E7=8E=B0=20CUDA=20gather=20=E7=AE=97?= =?UTF-8?q?=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- env.sh | 1 - include/ops/clip/clip.h | 2 +- include/ops/gather/gather.h | 2 +- include/ops/reducemax/reducemax.h | 2 +- include/ops/reducemean/reducemean.h | 2 +- include/ops/reducemin/reducemin.h | 2 +- include/ops/where/where.h | 2 +- operatorspy/tests/gather.py | 55 +++++---- src/ops/clip/cuda/clip_cuda.cu | 49 ++------ src/ops/clip/cuda/clip_cuda.h | 2 +- src/ops/clip/operator.cc | 2 +- src/ops/gather/cpu/gather_cpu.cc | 12 +- src/ops/gather/cpu/gather_cpu.h | 4 +- src/ops/gather/cuda/gather_cuda.cc | 49 ++++---- src/ops/gather/cuda/gather_cuda.cu | 179 ++++++++++++++++++++++++++++ src/ops/gather/cuda/gather_cuda.h | 15 ++- src/ops/gather/operator.cc | 2 +- src/ops/reduce/cpu/reduce_cpu.cc | 3 +- src/ops/reduce/cpu/reduce_cpu.h | 2 +- src/ops/reduce/cuda/reduce_cuda.cc | 2 - src/ops/reduce/cuda/reduce_cuda.cu | 62 +++++----- src/ops/reduce/cuda/reduce_cuda.h | 2 +- src/ops/reduce/operator.cc | 2 +- src/ops/reduce/reduce.h | 3 +- src/ops/reducemax/operator.cc | 2 +- src/ops/reducemean/operator.cc | 23 ++-- src/ops/where/cpu/where_cpu.cc | 18 +-- src/ops/where/cpu/where_cpu.h | 6 +- src/ops/where/cuda/where_cuda.cu | 36 +++--- src/ops/where/cuda/where_cuda.h | 6 +- src/ops/where/operator.cc | 2 +- 31 files changed, 362 insertions(+), 189 deletions(-) delete mode 100644 env.sh diff --git a/env.sh b/env.sh deleted file mode 100644 index b743e9e7..00000000 --- a/env.sh +++ /dev/null @@ -1 +0,0 @@ -export INFINI_ROOT=/home/zzw/workspace/operators-dev/build/linux/x86_64/release \ No newline at end of file diff --git a/include/ops/clip/clip.h b/include/ops/clip/clip.h index b5e0c19e..f7e1abeb 100644 --- a/include/ops/clip/clip.h +++ b/include/ops/clip/clip.h @@ -15,7 +15,7 @@ __C __export infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t hand infiniopTensorDescriptor_t y ); -__C __export infiniopStatus_t 
infiniopClip(infiniopClipDescriptor_t desc, void *x, float *min, float *max, void *y, void *stream); +__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, float *min, float *max, void *y, void *stream); __C __export infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc); diff --git a/include/ops/gather/gather.h b/include/ops/gather/gather.h index 7ad07f98..7d510621 100644 --- a/include/ops/gather/gather.h +++ b/include/ops/gather/gather.h @@ -17,7 +17,7 @@ __C __export infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t ha int64_t axis ); -__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void *x, void *indices, void *y, void *stream); +__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void const *x, void const *indices, void *y, void *stream); __C __export infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); diff --git a/include/ops/reducemax/reducemax.h b/include/ops/reducemax/reducemax.h index 8f0e6526..60da5030 100644 --- a/include/ops/reducemax/reducemax.h +++ b/include/ops/reducemax/reducemax.h @@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemaxDescriptor(infiniopHandle_t bool noop_with_empty_axes ); -__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); +__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, const void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); __C __export infiniopStatus_t infiniopDestroyReducemaxDescriptor(infiniopReducemaxDescriptor_t desc); #endif diff --git a/include/ops/reducemean/reducemean.h b/include/ops/reducemean/reducemean.h index c94c0e7d..cfb57913 100644 --- a/include/ops/reducemean/reducemean.h +++ b/include/ops/reducemean/reducemean.h @@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemeanDescriptor(infiniopHandle_ bool noop_with_empty_axes ); -__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); +__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, const void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); __C __export infiniopStatus_t infiniopDestroyReducemeanDescriptor(infiniopReducemeanDescriptor_t desc); #endif diff --git a/include/ops/reducemin/reducemin.h b/include/ops/reducemin/reducemin.h index c22e9ef7..51731d7e 100644 --- a/include/ops/reducemin/reducemin.h +++ b/include/ops/reducemin/reducemin.h @@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReduceminDescriptor(infiniopHandle_t bool noop_with_empty_axes ); -__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); +__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, const void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); __C __export infiniopStatus_t infiniopDestroyReduceminDescriptor(infiniopReduceminDescriptor_t desc); #endif diff --git a/include/ops/where/where.h b/include/ops/where/where.h index 373fc845..ba47d93f 100644 --- a/include/ops/where/where.h +++ b/include/ops/where/where.h @@ -17,7 +17,7 @@ __C __export infiniopStatus_t 
infiniopCreateWhereDescriptor(infiniopHandle_t han infiniopTensorDescriptor_t condition ); -__C __export infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, void *dst, void *src1, void *src2, void *condition, void *stream); +__C __export infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, void *dst, void const *src1, void const *src2, void const *condition, void *stream); __C __export infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); diff --git a/operatorspy/tests/gather.py b/operatorspy/tests/gather.py index a3e819ed..2c01cada 100644 --- a/operatorspy/tests/gather.py +++ b/operatorspy/tests/gather.py @@ -30,11 +30,10 @@ class GatherDescriptor(Structure): infiniopGatherDescriptor_t = POINTER(GatherDescriptor) -def gather(input, indices, axis): - np_input = input.numpy() - np_indices = indices.numpy() - np_output = np.take(np_input, np_indices, axis=axis) - return torch.from_numpy(np_output) +def gather(x, indices, axis = 0): + idx = [slice(None)] * x.ndim + idx[axis] = indices + return x[tuple(idx)] def tuple_to_void_p(py_tuple: Tuple): array = ctypes.c_int64 * len(py_tuple) @@ -55,16 +54,19 @@ def test( tensor_dtype=torch.float16 ): print( - f"Testing clip on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}" + f"Testing gather on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}" ) x = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device) - if len(x.shape) == 2: - indices = torch.tensor(2, dtype=torch.int64, device=torch_device) - elif len(x.shape) == 3: - indices = torch.tensor([[0, 1], [1, 2]], dtype=torch.int64, device=torch_device) + if isinstance(indices_shape, int): + indices_shape_tuple = (indices_shape,) + else: + indices_shape_tuple = tuple(indices_shape) + indices = torch.randint(0, x.shape[axis], indices_shape_tuple, + device=torch_device).type(torch.int64) dst = torch.randn(inferShape(x_shape, indices.shape, axis), dtype=tensor_dtype, device=torch_device) + ans = gather(x, indices, axis) - axis = axis + x_tensor = to_tensor(x, lib) indices_tensor = to_tensor(indices, lib) dst_tensor = to_tensor(dst, lib) @@ -106,25 +108,35 @@ def test( ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f"lib time: {elapsed :10f}") - print(f"pytorch ans: {ans}") - print(f"lib ans: {dst}") + ans = ans.to(torch_device) assert torch.allclose(dst, ans, atol=0, rtol=0) check_error(lib.infiniopDestroyGatherDescriptor(descriptor)) def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) - for x_shape, indices_shape, axis in test_cases: - test(lib, handle, "cpu", x_shape, indices_shape, axis, tensor_dtype=torch.float16) - print("\n") - #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32) + for x_shape, indices_shape, axis, tensor_dtype in test_cases: + test(lib, handle, "cpu", x_shape, indices_shape, axis, tensor_dtype=tensor_dtype) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, indices_shape, axis, tensor_dtype in test_cases: + test(lib, handle, "cuda", x_shape, indices_shape, axis, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) if __name__ == "__main__": test_cases = [ - ((3, 4), (2), 0), - ((2, 3, 4), (2, 2), 1), + ((3, 4), (2), 0, torch.float32), + ((64, 64), (64, 64), 0, torch.float32), + ((64, 64), (64, 64), 1, torch.float32), + ((2, 3, 4), (2, 2), 1, torch.float32), + ((64, 64), (64, 64), 0, torch.float16), + ((64, 64), (64, 64), 1, 
torch.float16), + ((8, 8, 8, 8, 8), (8, 8), 0, torch.float16), + ((8, 8, 8, 8, 8), (8, 8), 2, torch.float16), ] args = get_args() lib = open_lib() @@ -144,5 +156,8 @@ def test_cpu(lib, test_cases): ] lib.infiniopDestroyGatherDescriptor.restype = c_int32 lib.infiniopDestroyGatherDescriptor.argtypes = [infiniopGatherDescriptor_t] - test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.cpu: + test_cpu(lib, test_cases) print("All tests passed!") \ No newline at end of file diff --git a/src/ops/clip/cuda/clip_cuda.cu b/src/ops/clip/cuda/clip_cuda.cu index 327f6d7e..17f6a4fa 100644 --- a/src/ops/clip/cuda/clip_cuda.cu +++ b/src/ops/clip/cuda/clip_cuda.cu @@ -7,13 +7,16 @@ #define LDST128BITS(value) (reinterpret_cast(&(value))[0]) #define FLOAT4(value) (reinterpret_cast(&(value))[0]) -__global__ void clip_f32x4_kernel(float *a, float *b, float max_value, float min_value, int N){ +#define LDST128BITS_CONST(value) (reinterpret_cast(&(value))[0]) +#define FLOAT4_CONST(value) (reinterpret_cast(&(value))[0]) + +__global__ void clip_f32x4_kernel(const float *a, float *b, float max_value, float min_value, int N){ int idx = 4 * (blockDim.x * blockIdx.x + threadIdx.x); if (idx < N) { int remaining = N - idx; float4 reg_a, reg_b; if (remaining >= 4) { - reg_a = FLOAT4(a[idx]); + reg_a = FLOAT4_CONST(a[idx]); } else { reg_a.x = a[idx]; reg_a.y = (remaining >= 2) ? a[idx + 1] : 0; @@ -35,14 +38,14 @@ __global__ void clip_f32x4_kernel(float *a, float *b, float max_value, float min } -__global__ void clip_f16x8_pack_kernel(half *a, half *b, float max_value, float min_value, int N){ +__global__ void clip_f16x8_pack_kernel(const half *a, half *b, float max_value, float min_value, int N){ int idx = 8 * (blockDim.x * blockIdx.x + threadIdx.x); if (idx >= N) return; const half min_half = __float2half(min_value); const half max_half = __float2half(max_value); half pack_a[8], pack_b[8]; if (idx + 7 < N) { - LDST128BITS(pack_a[0]) = LDST128BITS(a[idx]); + LDST128BITS(pack_a[0]) = LDST128BITS_CONST(a[idx]); } else { for (int i = 0; i < 8 && (idx + i) < N; i++) { pack_a[i] = a[idx + i]; @@ -65,7 +68,7 @@ __global__ void clip_f16x8_pack_kernel(half *a, half *b, float max_value, float template infiniopStatus_t clip_nv_gpu( ClipCudaDescriptor_t desc, - void *x, + void const *x, void *y, float min_value, float max_value, @@ -75,45 +78,15 @@ infiniopStatus_t clip_nv_gpu( dim3 block(256 / per_thread_element); dim3 grid((N + 256 - 1) / 256); if constexpr(std::is_same::value){ - clip_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); - }else{ - clip_f16x8_pack_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); - } - /* - if (desc->ndim != 2){ - dim3 block(256 / per_thread_element); - dim3 grid((N + 256 - 1) / 256); - if constexpr(std::is_same::value){ - clip_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); - }else{ - clip_f16x8_pack_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); - } + clip_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); }else{ - if ((desc->K / per_thread_element) <= 1024){ - dim3 block(desc->K / (per_thread_element)); - dim3 grid(desc->S); - if constexpr(std::is_same::value){ - clip_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); - }else{ - clip_f16x8_pack_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); - } - } - else{ - dim3 block(256 / 
per_thread_element); - dim3 grid((N + 256 - 1) / 256); - if constexpr(std::is_same::value){ - clip_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), min_value, max_value, N); - }else{ - clip_f16x8_pack_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), min_value, max_value, N); - } - } + clip_f16x8_pack_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N); } - */ return STATUS_SUCCESS; } infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc, - void *x, + void const *x, void *y, float *min, float *max, diff --git a/src/ops/clip/cuda/clip_cuda.h b/src/ops/clip/cuda/clip_cuda.h index a805726a..76c6e7a5 100644 --- a/src/ops/clip/cuda/clip_cuda.h +++ b/src/ops/clip/cuda/clip_cuda.h @@ -22,7 +22,7 @@ infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t handle, infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc, - void *x, + void const *x, void *y, float *min, float *max, diff --git a/src/ops/clip/operator.cc b/src/ops/clip/operator.cc index 380cb561..e5e34964 100644 --- a/src/ops/clip/operator.cc +++ b/src/ops/clip/operator.cc @@ -32,7 +32,7 @@ __C infiniopStatus_t infiniopCreateClipDescriptor( } -__C infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void *x, float *min, float *max, void *y, void *stream) { +__C infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, float *min, float *max, void *y, void *stream) { switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: diff --git a/src/ops/gather/cpu/gather_cpu.cc b/src/ops/gather/cpu/gather_cpu.cc index 89765862..51264094 100644 --- a/src/ops/gather/cpu/gather_cpu.cc +++ b/src/ops/gather/cpu/gather_cpu.cc @@ -54,12 +54,12 @@ infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc){ template infiniopStatus_t gather_cpu(GatherCpuDescriptor_t desc, - void *x, - void *indices, + void const *x, + void const *indices, void *y) { - auto *src_data = reinterpret_cast(x); - auto *indices_data = reinterpret_cast(indices); // [!code ++] + auto *src_data = reinterpret_cast(x); + auto *indices_data = reinterpret_cast(indices); // [!code ++] auto *dst_data = reinterpret_cast(y); uint64_t indices_element_count = 1; for (uint64_t i = 0; i < desc->indices_ndim; ++i) { @@ -96,8 +96,8 @@ infiniopStatus_t gather_cpu(GatherCpuDescriptor_t desc, } infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc, - void *x, - void *indices, + void const *x, + void const *indices, void *y, void *stream){ if (desc->dtype == F16){ diff --git a/src/ops/gather/cpu/gather_cpu.h b/src/ops/gather/cpu/gather_cpu.h index 6d2d55d7..41243cae 100644 --- a/src/ops/gather/cpu/gather_cpu.h +++ b/src/ops/gather/cpu/gather_cpu.h @@ -26,8 +26,8 @@ infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t handle, ); infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc, - void *data, - void *indices, + void const *data, + void const *indices, void *dst, void *stream); diff --git a/src/ops/gather/cuda/gather_cuda.cc b/src/ops/gather/cuda/gather_cuda.cc index a63eaffd..a5f903af 100644 --- a/src/ops/gather/cuda/gather_cuda.cc +++ b/src/ops/gather/cuda/gather_cuda.cc @@ -20,31 +20,36 @@ infiniopStatus_t cudaCreateGatherDescriptor(CudaHandle_t handle, } if (axis < 0 || axis >= x->ndim){ return STATUS_BAD_PARAM; + } + int otherDims = 1; + for (int i = 0; i < axis; i++){ + otherDims *= static_cast(x->shape[i]); } -} - -infiniopStatus_t gather_nv_gpu( - GatherCudaDescriptor_t desc, - void *x, - void *indices, - void *y, - void *stream){ - return STATUS_SUCCESS; -} - - -infiniopStatus_t cudaGather(GatherCudaDescriptor_t 
desc, - void *x, - void *indices, - void *y, - void *stream){ - if (desc->dtype == F16){ - + for (int i = axis + 1; i < x->ndim; i++){ + otherDims *= static_cast(x->shape[i]); } - if (desc->dtype == F32){ - + int stride = 1; + for (int i = axis + 1; i < x->ndim; i++){ + stride *= static_cast(x->shape[i]); } - return STATUS_BAD_DEVICE; + int indices_size = 1; + for (int i = 0; i < indices->ndim; i++){ + indices_size *= static_cast(indices->shape[i]); + } + int dim_size = static_cast(x->shape[axis]); + *desc_ptr = new GatherCudaDescriptor{ + DevNvGpu, + x->dt, + indices->dt, + axis, + otherDims, + x->ndim, + y->ndim, + dim_size, + indices_size, + stride + }; + return STATUS_SUCCESS; } infiniopStatus_t cudaDestroyGatherDescriptor(GatherCudaDescriptor_t desc){ diff --git a/src/ops/gather/cuda/gather_cuda.cu b/src/ops/gather/cuda/gather_cuda.cu index df3a9702..9f794a51 100644 --- a/src/ops/gather/cuda/gather_cuda.cu +++ b/src/ops/gather/cuda/gather_cuda.cu @@ -2,3 +2,182 @@ #include "../../utils.h" #include "gather_cuda.h" #include + +#define LDST128BITS_CONST(value) (reinterpret_cast(&(value))[0]) +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) +template +__global__ void gather_kernel( + const T* input, + const IndexType* indices, + T* output, + int otherDims, + int dim_size, + int stride, + int indices_size +) { + extern __shared__ char shared_mem[]; + IndexType* sharedIndices = (IndexType*)shared_mem; + if constexpr(use_shared) { + for (int i = threadIdx.x; i < indices_size; i += blockDim.x) { + if (i < indices_size) { + sharedIndices[i] = indices[i]; + } + } + __syncthreads(); + } + constexpr int ITEMS_PER_THREAD = 4; + int tid_base = (blockIdx.x * blockDim.x + threadIdx.x) * ITEMS_PER_THREAD; + int total_elements = otherDims * indices_size; + int stride_indices = stride * indices_size; + + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + int tid = tid_base + i; + if (tid >= total_elements) break; + int outer_idx = tid / stride_indices; + int remaining = tid - outer_idx * stride_indices; + int indices_idx = remaining / stride; + int inner_idx = remaining - indices_idx * stride; + IndexType gather_idx; + if constexpr(use_shared) { + gather_idx = indices_size == 1 ? sharedIndices[0] : sharedIndices[indices_idx]; + } else { + gather_idx = indices_size == 1 ? 
indices[0] : indices[indices_idx]; + } + if (gather_idx >= 0 && gather_idx < dim_size) { + int outer_offset = outer_idx * stride_indices; + int indices_offset = indices_idx * stride; + int input_offset = outer_idx * stride * dim_size + gather_idx * stride; + output[outer_offset + indices_offset + inner_idx] = input[input_offset + inner_idx]; + } + } +} +template +__global__ void gather_vectorized_kernel( + const T* input, + const IndexType* indices, + T* output, + int otherDims, + int dim_size, + int stride, + int indices_size +) { + extern __shared__ char shared_mem[]; + IndexType* sharedIndices = (IndexType*)shared_mem; + + if constexpr(use_shared) { + for (int i = threadIdx.x; i < indices_size; i += blockDim.x) { + if (i < indices_size) { + sharedIndices[i] = indices[i]; + } + } + __syncthreads(); + } + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride_in_vec4; + int total_vectors; + if constexpr(std::is_same::value){ + stride_in_vec4 = stride / 4; + total_vectors = otherDims * indices_size / 4; + } else { + stride_in_vec4 = stride / 8; + total_vectors = otherDims * indices_size / 8; + } + if (tid < total_vectors) { + int outer_idx = tid / (stride_in_vec4 * indices_size); + int remaining = tid - (outer_idx * stride_in_vec4 * indices_size); + int indices_idx = remaining / stride_in_vec4; + int inner_idx = remaining - (indices_idx * stride_in_vec4); + IndexType gather_idx; + if constexpr(use_shared) { + gather_idx = indices_size == 1 ? sharedIndices[0] : sharedIndices[indices_idx]; + } else { + gather_idx = indices_size == 1 ? indices[0] : indices[indices_idx]; + } + if (gather_idx >= 0 && gather_idx < dim_size) { + int input_idx = (outer_idx * stride_in_vec4 * dim_size + gather_idx * stride_in_vec4 + inner_idx) * 4; + int output_idx = (outer_idx * stride_in_vec4 * indices_size + indices_idx * stride_in_vec4 + inner_idx) * 4; + LDST128BITS(output[output_idx]) = LDST128BITS_CONST(input[input_idx]); + } + } +} + +template +infiniopStatus_t gather_nv_gpu(GatherCudaDescriptor_t desc, + void const *x, + void *y, + void const *indices, + void *stream){ + + const size_t sharedMemSize = desc->indices_size <= 1024 ? 
desc->indices_size * sizeof(IndexType) : 0; + if (desc->stride % 4 == 0 && desc->stride >= 4){ + const int block_size = 128; + const int total_vectors = desc->otherDims * desc->indices_size / 4; + const int gridSize = (total_vectors + block_size - 1) / block_size; + if (sharedMemSize == 0) gather_vectorized_kernel<<>>( + reinterpret_cast(x), + reinterpret_cast(indices), + reinterpret_cast(y), + desc->otherDims, + desc->dim_size, + desc->stride, + desc->indices_size + ); + else gather_vectorized_kernel<<>>( + reinterpret_cast(x), + reinterpret_cast(indices), + reinterpret_cast(y), + desc->otherDims, + desc->dim_size, + desc->stride, + desc->indices_size + ); + } else { + int block_size = 128; + const int total_elements = desc->otherDims * desc->indices_size; + const int gridSize = (total_elements + block_size * 4 - 1) / (block_size * 4); + if (sharedMemSize == 0) gather_kernel<<>>( + reinterpret_cast(x), + reinterpret_cast(indices), + reinterpret_cast(y), + desc->otherDims, + desc->dim_size, + desc->stride, + desc->indices_size + ); + else gather_kernel<<>>( + reinterpret_cast(x), + reinterpret_cast(indices), + reinterpret_cast(y), + desc->otherDims, + desc->dim_size, + desc->stride, + desc->indices_size + ); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGather(GatherCudaDescriptor_t desc, + const void *x, + const void *indices, + void *y, + void *stream){ + if (desc->dtype == F32){ + if (desc->indices_dtype == I32){ + return gather_nv_gpu(desc, x, y, indices, stream); + } + else if (desc->indices_dtype == I64){ + return gather_nv_gpu(desc, x, y, indices, stream); + } + } + if (desc->dtype == F16){ + if (desc->indices_dtype == I32){ + return gather_nv_gpu(desc, x, y, indices, stream); + } + else if (desc->indices_dtype == I64){ + return gather_nv_gpu(desc, x, y, indices, stream); + } + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/gather/cuda/gather_cuda.h b/src/ops/gather/cuda/gather_cuda.h index 0996ae64..562c5bbc 100644 --- a/src/ops/gather/cuda/gather_cuda.h +++ b/src/ops/gather/cuda/gather_cuda.h @@ -8,8 +8,15 @@ typedef struct GatherCudaDescriptor { Device device; DT dtype; - uint64_t ndim; - uint64_t element_num; + DT indices_dtype; + int64_t axis; + int otherDims; + uint64_t input_ndim; + uint64_t output_ndim; + int dim_size; + int indices_size; + int stride; + } GatherCudaDescriptor; typedef struct GatherCudaDescriptor *GatherCudaDescriptor_t; @@ -24,8 +31,8 @@ infiniopStatus_t cudaCreateGatherDescriptor(CudaHandle_t handle, infiniopStatus_t cudaGather(GatherCudaDescriptor_t desc, - void *x, - void *indices, + void const *x, + void const *indices, void *y, void *stream); diff --git a/src/ops/gather/operator.cc b/src/ops/gather/operator.cc index 58855c00..8d51c43c 100644 --- a/src/ops/gather/operator.cc +++ b/src/ops/gather/operator.cc @@ -31,7 +31,7 @@ __C infiniopStatus_t infiniopCreateGatherDescriptor( } return STATUS_BAD_DEVICE; } -__C infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void *x, void *indices, void *y, void *stream) { +__C infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void const *x, void const *indices, void *y, void *stream) { switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: diff --git a/src/ops/reduce/cpu/reduce_cpu.cc b/src/ops/reduce/cpu/reduce_cpu.cc index d2a25917..74ad723c 100644 --- a/src/ops/reduce/cpu/reduce_cpu.cc +++ b/src/ops/reduce/cpu/reduce_cpu.cc @@ -134,7 +134,6 @@ infiniopStatus_t reduce_cpu(ReduceCpuDescriptor_t desc, auto y_strides = desc->y_strides; 
int i, j; int indices[desc->x_ndim]; - //#pragma omp parallel for std::vector non_reduce_axes; for (int j = 0; j < desc->x_ndim; ++j) { if (!std::binary_search(axes, axes + num_axes, j)) { @@ -214,7 +213,7 @@ infiniopStatus_t reduce_cpu(ReduceCpuDescriptor_t desc, infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, void *y, - void *x, + void const *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream){ diff --git a/src/ops/reduce/cpu/reduce_cpu.h b/src/ops/reduce/cpu/reduce_cpu.h index 3111c371..9b79dec0 100644 --- a/src/ops/reduce/cpu/reduce_cpu.h +++ b/src/ops/reduce/cpu/reduce_cpu.h @@ -44,7 +44,7 @@ infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, void *y, - void *x, + void const *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); diff --git a/src/ops/reduce/cuda/reduce_cuda.cc b/src/ops/reduce/cuda/reduce_cuda.cc index 461a9ca6..60e057ed 100644 --- a/src/ops/reduce/cuda/reduce_cuda.cc +++ b/src/ops/reduce/cuda/reduce_cuda.cc @@ -47,8 +47,6 @@ infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle, for (int i = axes_size - 2; i >= 0; i--){ reduce_axes_stride[i] = reduce_axes_stride[i + 1] * x->shape[axes[i + 1]]; } - - memcpy(h_axes, axes, axes_size * sizeof(int64_t)); memcpy(input_shape, x->shape, x->ndim * sizeof(uint64_t)); memcpy(output_shape, y->shape, y->ndim * sizeof(uint64_t)); diff --git a/src/ops/reduce/cuda/reduce_cuda.cu b/src/ops/reduce/cuda/reduce_cuda.cu index a6ec6963..b603a13f 100644 --- a/src/ops/reduce/cuda/reduce_cuda.cu +++ b/src/ops/reduce/cuda/reduce_cuda.cu @@ -8,6 +8,8 @@ #define FLOAT4(value) (reinterpret_cast(&(value))[0]) #define LDST128BITS(value) (reinterpret_cast(&(value))[0]) +#define FLOAT4_CONST(value) (reinterpret_cast(&(value))[0]) +#define LDST128BITS_CONST(value) (reinterpret_cast(&(value))[0]) enum class ReduceOp { SUM, @@ -207,7 +209,7 @@ __global__ void block_final_reduce_kernel(half *temp_in, half *y, int num_blocks } template -__global__ void blockall_reduce_f32x4_kernel(float *x, float *y, uint64_t N){ +__global__ void blockall_reduce_f32x4_kernel(const float *x, float *y, uint64_t N){ int tid = threadIdx.x; int base_idx = (tid + 64 * blockIdx.x) * 4; constexpr int NUM_WARPS = (64 + WARP_SIZE - 1) / WARP_SIZE; @@ -215,7 +217,7 @@ __global__ void blockall_reduce_f32x4_kernel(float *x, float *y, uint64_t N){ float result = init_value(); if (base_idx < N){ if (base_idx + 3 < N){ - float4 reg_x = FLOAT4(x[base_idx]); + float4 reg_x = FLOAT4_CONST(x[base_idx]); if constexpr (Op == ReduceOp::SUM || Op == ReduceOp::MEAN) { result += reg_x.x + reg_x.y + reg_x.z + reg_x.w; } else if constexpr (Op == ReduceOp::MAX) { @@ -251,7 +253,7 @@ __global__ void blockall_reduce_f32x4_kernel(float *x, float *y, uint64_t N){ // blockallreduce kernel for half(f16x8) using template operations template -__global__ void blockall_reduce_f16x8_kernel(half *x, T *y, uint64_t N){ +__global__ void blockall_reduce_f16x8_kernel(const half *x, T *y, uint64_t N){ int tid = threadIdx.x; int base_idx = (tid + 32 * blockIdx.x) * 8; constexpr int NUM_WARPS = (32 + WARP_SIZE - 1) / WARP_SIZE; @@ -260,7 +262,7 @@ __global__ void blockall_reduce_f16x8_kernel(half *x, T *y, uint64_t N){ if (base_idx < N){ if (base_idx + 7 < N){ half pack_x[8]; - LDST128BITS(pack_x[0]) = LDST128BITS(x[base_idx]); + LDST128BITS(pack_x[0]) = LDST128BITS_CONST(x[base_idx]); #pragma unroll for (int i = 0; i < 8; i++){ if constexpr (std::is_same::value) { @@ -305,7 +307,7 @@ __global__ void 
blockall_reduce_f16x8_kernel(half *x, T *y, uint64_t N){ template __global__ void reduce_f32x4_contigous_kernel( - float *x, float *y, + const float *x, float *y, int prefix_size, int suffix_size, uint64_t output_size, uint64_t reduce_size){ @@ -344,7 +346,7 @@ __global__ void reduce_f32x4_contigous_kernel( template __global__ void reduce_f16x8_contigous_kernel( - half *x, half *y, + const half *x, half *y, int prefix_size, int suffix_size, uint64_t output_size, uint64_t reduce_size){ extern __shared__ float shared_mem_half[]; @@ -385,7 +387,7 @@ output_idx = 0 */ template __global__ void reduce_f32_kernel( - float *x, float *y, + const float *x, float *y, const int64_t *x_strides, const int64_t *y_strides, const uint64_t reduce_size, const uint64_t output_size, const int64_t *non_reduce_axes, @@ -424,9 +426,7 @@ __global__ void reduce_f32_kernel( offset += coord * x_strides[axis]; remaining %= reduce_axes_stride[j]; } - result = reduce_op(result, x[offset]); - } shared_mem_float[tid] = result; __syncthreads(); @@ -449,8 +449,8 @@ __global__ void reduce_f32_kernel( template infiniopStatus_t reduce_nv_gpu( ReduceCudaDescriptor_t desc, - void *x, void *y, + void const *x, void *stream){ uint64_t N = desc->element_num; if (desc->reduce_mode == 0){ @@ -458,16 +458,16 @@ infiniopStatus_t reduce_nv_gpu( dim3 block(256 / 4); dim3 grid((N + block.x * 4 - 1) / (block.x * 4)); if (desc->reduce_op_type == 0) { - blockall_reduce_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), N); + blockall_reduce_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), N); divide_by_n_kernel<<<1, 1, 0, (cudaStream_t)stream>>>( reinterpret_cast(y), N); } else if (desc->reduce_op_type == 1 || desc->reduce_op_type == 2) { float* temp_buffer; cudaMalloc(&temp_buffer, grid.x * sizeof(float)); if (desc->reduce_op_type == 1) { - blockall_reduce_f32x4_kernel<<>>(reinterpret_cast(x), temp_buffer, N); + blockall_reduce_f32x4_kernel<<>>(reinterpret_cast(x), temp_buffer, N); } else if (desc->reduce_op_type == 2) { - blockall_reduce_f32x4_kernel<<>>(reinterpret_cast(x), temp_buffer, N); + blockall_reduce_f32x4_kernel<<>>(reinterpret_cast(x), temp_buffer, N); } int num_blocks = grid.x; if (num_blocks <= 32) { @@ -495,16 +495,16 @@ infiniopStatus_t reduce_nv_gpu( if (desc->reduce_op_type == 0) { float* temp_buffer; cudaMalloc(&temp_buffer, sizeof(float)); - blockall_reduce_f16x8_kernel<<>>(reinterpret_cast(x), temp_buffer, N); + blockall_reduce_f16x8_kernel<<>>(reinterpret_cast(x), temp_buffer, N); divide_by_n_kernel<<<1, 1, 0, (cudaStream_t)stream>>>( reinterpret_cast(y), temp_buffer, N); } else if (desc->reduce_op_type == 1 || desc->reduce_op_type == 2) { half *temp_buffer; cudaMalloc(&temp_buffer, grid.x * sizeof(half)); if (desc->reduce_op_type == 1) { - blockall_reduce_f16x8_kernel<<>>(reinterpret_cast(x), reinterpret_cast(temp_buffer), N); + blockall_reduce_f16x8_kernel<<>>(reinterpret_cast(x), reinterpret_cast(temp_buffer), N); } else if (desc->reduce_op_type == 2) { - blockall_reduce_f16x8_kernel<<>>(reinterpret_cast(x), reinterpret_cast(temp_buffer), N); + blockall_reduce_f16x8_kernel<<>>(reinterpret_cast(x), reinterpret_cast(temp_buffer), N); } int num_blocks = grid.x; if (num_blocks <= 32) { @@ -536,13 +536,13 @@ infiniopStatus_t reduce_nv_gpu( size_t shared_mem_size = block.x * sizeof(float); if (desc->reduce_op_type == 0) { reduce_f32x4_contigous_kernel<<>>( - reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); + reinterpret_cast(x), 
reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); } else if (desc->reduce_op_type == 1) { reduce_f32x4_contigous_kernel<<>>( - reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); + reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); } else if (desc->reduce_op_type == 2) { reduce_f32x4_contigous_kernel<<>>( - reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); + reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); } } else { dim3 block(128); @@ -550,13 +550,13 @@ infiniopStatus_t reduce_nv_gpu( size_t shared_mem_size = block.x * sizeof(half); if (desc->reduce_op_type == 0) { reduce_f16x8_contigous_kernel<<>>( - reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); + reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); }else if (desc->reduce_op_type == 1) { reduce_f16x8_contigous_kernel<<>>( - reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); + reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); } else if (desc->reduce_op_type == 2) { reduce_f16x8_contigous_kernel<<>>( - reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); + reinterpret_cast(x), reinterpret_cast(y), desc->prefix_size, desc->suffix_size, desc->output_size, desc->reduce_size); } } } @@ -569,38 +569,38 @@ infiniopStatus_t reduce_nv_gpu( if (desc->keepdims){ if (desc->reduce_op_type == 0) { reduce_f32_kernel<<>>( - reinterpret_cast(x), reinterpret_cast(y), + reinterpret_cast(x), reinterpret_cast(y), desc->input_strides, desc->output_strides, desc->reduce_size, desc->output_size, desc->non_reduce_axes, desc->axes_size, desc->reduce_axes_stride, desc->reduce_axes, desc->input_ndim, desc->element_num); } else if (desc->reduce_op_type == 1) { reduce_f32_kernel<<>>( - reinterpret_cast(x), reinterpret_cast(y), + reinterpret_cast(x), reinterpret_cast(y), desc->input_strides, desc->output_strides, desc->reduce_size, desc->output_size, desc->non_reduce_axes, desc->axes_size, desc->reduce_axes_stride, desc->reduce_axes, desc->input_ndim, desc->element_num); } else if (desc->reduce_op_type == 2) { reduce_f32_kernel<<>>( - reinterpret_cast(x), reinterpret_cast(y), + reinterpret_cast(x), reinterpret_cast(y), desc->input_strides, desc->output_strides, desc->reduce_size, desc->output_size, desc->non_reduce_axes, desc->axes_size, desc->reduce_axes_stride, desc->reduce_axes, desc->input_ndim, desc->element_num); } }else{ if (desc->reduce_op_type == 0) { reduce_f32_kernel<<>>( - reinterpret_cast(x), reinterpret_cast(y), + reinterpret_cast(x), reinterpret_cast(y), desc->input_strides, desc->output_strides, desc->reduce_size, desc->output_size, desc->non_reduce_axes, desc->axes_size, desc->reduce_axes_stride, desc->reduce_axes, desc->input_ndim, desc->element_num); } else if (desc->reduce_op_type == 1) { reduce_f32_kernel<<>>( - reinterpret_cast(x), reinterpret_cast(y), + reinterpret_cast(x), reinterpret_cast(y), desc->input_strides, desc->output_strides, desc->reduce_size, desc->output_size, 
desc->non_reduce_axes, desc->axes_size, desc->reduce_axes_stride, desc->reduce_axes, desc->input_ndim, desc->element_num); } else if (desc->reduce_op_type == 2) { reduce_f32_kernel<<>>( - reinterpret_cast(x), reinterpret_cast(y), + reinterpret_cast(x), reinterpret_cast(y), desc->input_strides, desc->output_strides, desc->reduce_size, desc->output_size, desc->non_reduce_axes, desc->axes_size, desc->reduce_axes_stride, desc->reduce_axes, desc->input_ndim, desc->element_num); } @@ -613,13 +613,13 @@ infiniopStatus_t reduce_nv_gpu( infiniopStatus_t cudaReduce( ReduceCudaDescriptor_t desc, void *y, - void *x, + void const *x, void *stream){ if (desc->dtype == F16) { - return reduce_nv_gpu(desc, x, y, stream); + return reduce_nv_gpu(desc, y, x, stream); } if (desc->dtype == F32) { - return reduce_nv_gpu(desc, x, y, stream); + return reduce_nv_gpu(desc, y, x, stream); } return STATUS_BAD_TENSOR_DTYPE; } \ No newline at end of file diff --git a/src/ops/reduce/cuda/reduce_cuda.h b/src/ops/reduce/cuda/reduce_cuda.h index 3f433da2..9b0b2741 100644 --- a/src/ops/reduce/cuda/reduce_cuda.h +++ b/src/ops/reduce/cuda/reduce_cuda.h @@ -53,7 +53,7 @@ infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle, infiniopStatus_t cudaReduce(ReduceCudaDescriptor_t desc, void *y, - void *x, + void const *x, void *stream); infiniopStatus_t cudaDestroyReduceDescriptor(ReduceCudaDescriptor_t desc); diff --git a/src/ops/reduce/operator.cc b/src/ops/reduce/operator.cc index d5c70703..fcd38395 100644 --- a/src/ops/reduce/operator.cc +++ b/src/ops/reduce/operator.cc @@ -34,7 +34,7 @@ __C infiniopStatus_t infiniopCreateReduceDescriptor( return STATUS_BAD_DEVICE; } -__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, void *y, const void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: diff --git a/src/ops/reduce/reduce.h b/src/ops/reduce/reduce.h index af5e1964..47e49aa5 100644 --- a/src/ops/reduce/reduce.h +++ b/src/ops/reduce/reduce.h @@ -3,7 +3,6 @@ #include "export.h" #include "operators.h" -//#include "status.h" typedef struct ReduceDescriptor { Device device; @@ -20,7 +19,7 @@ __C infiniopStatus_t infiniopCreateReduceDescriptor(infiniopHandle_t handle, bool noop_with_empty_axes, int reduce_type); -__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, void *y, const void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream); __C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc); #endif \ No newline at end of file diff --git a/src/ops/reducemax/operator.cc b/src/ops/reducemax/operator.cc index 53951265..4b6c14ed 100644 --- a/src/ops/reducemax/operator.cc +++ b/src/ops/reducemax/operator.cc @@ -28,7 +28,7 @@ __C __export infiniopStatus_t infiniopCreateReducemaxDescriptor( return STATUS_SUCCESS; } -__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { +__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, const void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { auto _desc = (_ReducemaxDescriptor_t) 
desc; CHECK_STATUS(infiniopReduce(_desc->reduce_desc, y, x, dynamic_axes, dynamic_axes_size, stream), STATUS_SUCCESS); return STATUS_SUCCESS; diff --git a/src/ops/reducemean/operator.cc b/src/ops/reducemean/operator.cc index aaeeb9b9..5ea0121a 100644 --- a/src/ops/reducemean/operator.cc +++ b/src/ops/reducemean/operator.cc @@ -1,17 +1,17 @@ #include "../utils.h" #include "../reduce/reduce.h" -#include "ops/reducemean/reducemean.h" +#include "ops/reducemin/reducemin.h" -struct _ReducemeanDescriptor { +struct _ReduceminDescriptor { Device device; infiniopReduceDescriptor_t reduce_desc; }; -typedef struct _ReducemeanDescriptor *_ReducemeanDescriptor_t; +typedef struct _ReduceminDescriptor *_ReduceminDescriptor_t; -__C __export infiniopStatus_t infiniopCreateReducemeanDescriptor( +__C __export infiniopStatus_t infiniopCreateReduceminDescriptor( infiniopHandle_t handle, - infiniopReducemeanDescriptor_t *desc_ptr, + infiniopReduceminDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y, infiniopTensorDescriptor_t x, int64_t const *axes, @@ -20,22 +20,21 @@ __C __export infiniopStatus_t infiniopCreateReducemeanDescriptor( bool noop_with_empty_axes ) { infiniopReduceDescriptor_t reduce_desc; - CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, y, x, axes, n, keepdims, noop_with_empty_axes, 0), STATUS_SUCCESS); - *(_ReducemeanDescriptor_t *) desc_ptr = new _ReducemeanDescriptor{ + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, y, x, axes, n, keepdims, noop_with_empty_axes,2), STATUS_SUCCESS); + *(_ReduceminDescriptor_t *) desc_ptr = new _ReduceminDescriptor{ handle->device, reduce_desc }; return STATUS_SUCCESS; } - -__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { - auto _desc = (_ReducemeanDescriptor_t) desc; +__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *y, const void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) { + auto _desc = (_ReduceminDescriptor_t) desc; CHECK_STATUS(infiniopReduce(_desc->reduce_desc, y, x, dynamic_axes, dynamic_axes_size, stream), STATUS_SUCCESS); return STATUS_SUCCESS; } -__C __export infiniopStatus_t infiniopDestroyReducemeanDescriptor(infiniopReducemeanDescriptor_t desc) { - CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReducemeanDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); +__C __export infiniopStatus_t infiniopDestroyReduceminDescriptor(infiniopReduceminDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceminDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); delete desc; return STATUS_SUCCESS; } \ No newline at end of file diff --git a/src/ops/where/cpu/where_cpu.cc b/src/ops/where/cpu/where_cpu.cc index 30a35688..5fd834c4 100644 --- a/src/ops/where/cpu/where_cpu.cc +++ b/src/ops/where/cpu/where_cpu.cc @@ -111,14 +111,14 @@ inline uint64_t broadcast_map( template infiniopStatus_t where_cpu(WhereCpuDescriptor_t desc, void *dst, - void *src1, - void *src2, - void *condition, + void const *src1, + void const *src2, + void const *condition, void *stream){ auto dst_ = reinterpret_cast(dst); - auto src1_ = reinterpret_cast(src1); - auto src2_ = reinterpret_cast(src2); - auto condition_ = reinterpret_cast(condition); + auto src1_ = reinterpret_cast(src1); + auto src2_ = reinterpret_cast(src2); + auto condition_ = reinterpret_cast(condition); #pragma omp parallel for for (uint64_t i = 0; i < desc->element_num; i++) { uint64_t 
condition_index = broadcast_map(i, desc->dst_shape, desc->dst_ndim, desc->condition_shape, desc->condition_strides, desc->condition_ndim); @@ -131,9 +131,9 @@ infiniopStatus_t where_cpu(WhereCpuDescriptor_t desc, infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc, void *dst, - void *src1, - void *src2, - void *condition, + void const *src1, + void const *src2, + void const *condition, void *stream){ if (desc->dtype == F16) { return where_cpu(desc, dst, src1, src2, condition, stream); diff --git a/src/ops/where/cpu/where_cpu.h b/src/ops/where/cpu/where_cpu.h index 4e647b4f..f422980c 100644 --- a/src/ops/where/cpu/where_cpu.h +++ b/src/ops/where/cpu/where_cpu.h @@ -32,9 +32,9 @@ infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t handle, infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc, void *dst, - void *src1, - void *src2, - void *condition, + void const *src1, + void const *src2, + void const *condition, void *stream); infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc); diff --git a/src/ops/where/cuda/where_cuda.cu b/src/ops/where/cuda/where_cuda.cu index df2d3277..d1873803 100644 --- a/src/ops/where/cuda/where_cuda.cu +++ b/src/ops/where/cuda/where_cuda.cu @@ -29,10 +29,10 @@ __device__ __forceinline__ uint64_t broadcast_map( -__global__ void where_f32x4_kernel(float* src1, uint64_t const *src1_shape, - float* src2, uint64_t const *src2_shape, +__global__ void where_f32x4_kernel(const float* src1, uint64_t const *src1_shape, + const float* src2, uint64_t const *src2_shape, float* dst, uint64_t const *dst_shape, - uint8_t* condition, uint64_t const *condition_shape, + const uint8_t* condition, uint64_t const *condition_shape, uint64_t dst_ndim, uint64_t src1_ndim, uint64_t src2_ndim, uint64_t condition_ndim, int64_t const *condition_strides, @@ -76,10 +76,10 @@ __global__ void where_f32x4_kernel(float* src1, uint64_t const *src1_shape, } } -__global__ void where_f16x8_kernel(half* src1, uint64_t const *src1_shape, - half* src2, uint64_t const *src2_shape, +__global__ void where_f16x8_kernel(const half* src1, uint64_t const *src1_shape, + const half* src2, uint64_t const *src2_shape, half* dst, uint64_t const *dst_shape, - uint8_t* condition, uint64_t const *condition_shape, + const uint8_t* condition, uint64_t const *condition_shape, uint64_t dst_ndim, uint64_t src1_ndim, uint64_t src2_ndim, uint64_t condition_ndim, int64_t const *condition_strides, @@ -129,9 +129,9 @@ template infiniopStatus_t where_nv_gpu( WhereCudaDescriptor_t desc, void *dst, - void *src1, - void *src2, - void *condition, + void const *src1, + void const *src2, + void const *condition, int per_thread_element, void *stream){ uint64_t N = desc->element_num; @@ -139,10 +139,10 @@ infiniopStatus_t where_nv_gpu( dim3 grid((N + 256 - 1) / 256); if constexpr(std::is_same::value){ where_f32x4_kernel<<>>( - reinterpret_cast(src1), desc->src1_shape, - reinterpret_cast(src2), desc->src2_shape, + reinterpret_cast(src1), desc->src1_shape, + reinterpret_cast(src2), desc->src2_shape, reinterpret_cast(dst), desc->dst_shape, - reinterpret_cast(condition), desc->condition_shape, + reinterpret_cast(condition), desc->condition_shape, desc->dst_ndim, desc->src1_ndim, desc->src2_ndim, desc->condition_ndim, desc->condition_strides, @@ -152,10 +152,10 @@ infiniopStatus_t where_nv_gpu( N); } else if constexpr(std::is_same::value){ where_f16x8_kernel<<>>( - reinterpret_cast(src1), desc->src1_shape, - reinterpret_cast(src2), desc->src2_shape, + reinterpret_cast(src1), desc->src1_shape, + reinterpret_cast(src2), 
desc->src2_shape, reinterpret_cast(dst), desc->dst_shape, - reinterpret_cast(condition), desc->condition_shape, + reinterpret_cast(condition), desc->condition_shape, desc->dst_ndim, desc->src1_ndim, desc->src2_ndim, desc->condition_ndim, desc->condition_strides, @@ -170,9 +170,9 @@ infiniopStatus_t where_nv_gpu( infiniopStatus_t cudaWhere(WhereCudaDescriptor_t desc, void *dst, - void *src1, - void *src2, - void *condition, + void const *src1, + void const *src2, + void const *condition, void *stream){ if (desc->dtype == F16){ return where_nv_gpu(desc, dst, src1, src2, condition, 8, stream); diff --git a/src/ops/where/cuda/where_cuda.h b/src/ops/where/cuda/where_cuda.h index 38c62734..5c2f6d8f 100644 --- a/src/ops/where/cuda/where_cuda.h +++ b/src/ops/where/cuda/where_cuda.h @@ -36,9 +36,9 @@ infiniopStatus_t cudaCreateWhereDescriptor(CudaHandle_t handle, infiniopStatus_t cudaWhere(WhereCudaDescriptor_t desc, void *dst, - void *src1, - void *src2, - void *condition, + void const *src1, + void const *src2, + void const *condition, void *stream); infiniopStatus_t cudaDestroyWhereDescriptor(WhereCudaDescriptor_t desc); diff --git a/src/ops/where/operator.cc b/src/ops/where/operator.cc index 5483803c..512638d6 100644 --- a/src/ops/where/operator.cc +++ b/src/ops/where/operator.cc @@ -34,7 +34,7 @@ __C infiniopStatus_t infiniopCreateWhereDescriptor( } -__C infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, void *dst, void *src1, void *src2, void *condition, void *stream) { +__C infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, void *dst, void const *src1, void const *src2, void const *condition, void *stream) { switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: From 4bb67ae5c91ae97b07d2a79dd7dbe4cab404c30e Mon Sep 17 00:00:00 2001 From: Graylatzhou <1391087899@qq.com> Date: Thu, 20 Mar 2025 08:21:55 +0800 Subject: [PATCH 4/4] =?UTF-8?q?=E5=B0=86clip=E7=AE=97=E5=AD=90=E7=9A=84low?= =?UTF-8?q?er=20bound=E5=92=8Cupp=5Fbound=E4=BF=AE=E6=94=B9=E4=B8=BA?= =?UTF-8?q?=E7=AE=97=E5=AD=90=E5=B1=9E=E6=80=A7=E8=80=8C=E9=9D=9E=E8=BE=93?= =?UTF-8?q?=E5=85=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/ops/clip/clip.h | 6 +++-- operatorspy/tests/clip.py | 16 +++++--------- src/ops/clip/cpu/clip_cpu.cc | 40 +++++++++++++++------------------- src/ops/clip/cpu/clip_cpu.h | 10 ++++++--- src/ops/clip/cuda/clip_cuda.cc | 12 +++++++++- src/ops/clip/cuda/clip_cuda.cu | 23 ++++--------------- src/ops/clip/cuda/clip_cuda.h | 10 ++++++--- src/ops/clip/operator.cc | 14 +++++++----- src/ops/reducemean/operator.cc | 23 +++++++++---------- src/ops/reducemin/operator.cc | 2 +- 10 files changed, 77 insertions(+), 79 deletions(-) diff --git a/include/ops/clip/clip.h b/include/ops/clip/clip.h index f7e1abeb..a33c587c 100644 --- a/include/ops/clip/clip.h +++ b/include/ops/clip/clip.h @@ -12,10 +12,12 @@ typedef ClipDescriptor *infiniopClipDescriptor_t; __C __export infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle, infiniopClipDescriptor_t *desc_ptr, infiniopTensorDescriptor_t x, - infiniopTensorDescriptor_t y + infiniopTensorDescriptor_t y, + float* min, + float* max ); -__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, float *min, float *max, void *y, void *stream); +__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, void *y, void *stream); __C __export infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc); diff --git 
a/operatorspy/tests/clip.py b/operatorspy/tests/clip.py index a2f3f20d..0874abe4 100644 --- a/operatorspy/tests/clip.py +++ b/operatorspy/tests/clip.py @@ -81,6 +81,8 @@ def test( ctypes.byref(descriptor), x_tensor.descriptor, y_tensor.descriptor, + ctypes.byref(c_float(min)) if min != None else None, + ctypes.byref(c_float(max)) if max != None else None, ) ) #Ss = [1024, 2048, 4096] @@ -91,8 +93,6 @@ def test( lib.infiniopClip( descriptor, x_tensor.data, - ctypes.byref(c_float(min)) if min != None else None, - ctypes.byref(c_float(max)) if max != None else None, y_tensor.data, None, ) @@ -104,8 +104,6 @@ def test( lib.infiniopClip( descriptor, x_tensor.data, - ctypes.byref(c_float(min)) if min != None else None, - ctypes.byref(c_float(max)) if max != None else None, y_tensor.data, None, ) @@ -137,8 +135,8 @@ def test_cuda(lib, test_cases): ((3, 4), -1, None, torch.float32), ((3, 4), None, None, torch.float32), ((16), -1, 1, torch.float32), - ((1024, 1024), -1, 1, torch.float32), - ((4096, 4096), -1, 1, torch.float32), + # ((1024, 1024), -1, 1, torch.float32), + # ((4096, 4096), -1, 1, torch.float32), ((13), -1, 1, torch.float32), ((3, 4), -1, 1, torch.float16), @@ -146,8 +144,8 @@ def test_cuda(lib, test_cases): ((3, 4), -1, None, torch.float16), ((3, 4), None, None, torch.float16), ((16), -1, 1, torch.float16), - ((1024, 1024), -1, 1, torch.float16), - ((4096, 4096), -1, 1, torch.float16), + # ((1024, 1024), -1, 1, torch.float16), + # ((4096, 4096), -1, 1, torch.float16), ] args = get_args() lib = open_lib() @@ -163,8 +161,6 @@ def test_cuda(lib, test_cases): infiniopClipDescriptor_t, c_void_p, c_void_p, - c_void_p, - c_void_p, c_void_p ] lib.infiniopDestroyClipDescriptor.restype = c_int32 diff --git a/src/ops/clip/cpu/clip_cpu.cc b/src/ops/clip/cpu/clip_cpu.cc index cfdecc02..558f3ead 100644 --- a/src/ops/clip/cpu/clip_cpu.cc +++ b/src/ops/clip/cpu/clip_cpu.cc @@ -5,7 +5,9 @@ infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t handle, ClipCpuDescriptor_t *desc_ptr, infiniopTensorDescriptor_t x, - infiniopTensorDescriptor_t y + infiniopTensorDescriptor_t y, + float* min, + float* max ){ if (x->dt != F16 && x->dt != F32) { @@ -24,9 +26,17 @@ infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t handle, for (uint64_t i = 0; i < x->ndim; i++) { element_num *= x->shape[i]; } + bool has_min = min != nullptr; + bool has_max = max != nullptr; + float min_ = has_min ? *min : std::numeric_limits::lowest(); + float max_ = has_max ? *max : std::numeric_limits::max(); *desc_ptr = new ClipCpuDescriptor{ DevCpu, x->dt, + min_, + max_, + has_min, + has_max, element_num }; return STATUS_SUCCESS; @@ -35,22 +45,18 @@ infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t handle, template infiniopStatus_t clip_cpu(ClipCpuDescriptor_t desc, void const *x, - float min, - float max, - bool has_min, - bool has_max, void *y){ auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); for (uint64_t i = 0; i < desc->element_num; i++) { if constexpr (std::is_same::value){ float x_f = f16_to_f32(x_[i]); - x_f = has_min ? std::max(x_f, min) : x_f; - x_f = has_max ? std::min(x_f, max) : x_f; + x_f = desc->has_min ? std::max(x_f, desc->min) : x_f; + x_f = desc->has_max ? 
std::min(x_f, desc->max) : x_f;
             y_[i] = f32_to_f16(x_f);
         } else{
-            y_[i] = std::min(std::max(x_[i], min), max);
+            y_[i] = std::min(std::max(x_[i], desc->min), desc->max);
         }
     }
     return STATUS_SUCCESS;
@@ -58,29 +64,17 @@ infiniopStatus_t clip_cpu(ClipCpuDescriptor_t desc,
 infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc,
                          void const*x,
-                         float *min,
-                         float *max,
                          void *y,
                          void *stream){
-    bool has_min = true;
-    bool has_max = true;
-    if (min == nullptr){
-        has_min = false;
-    }
-    if (max == nullptr){
-        has_max = false;
-    }
-    float min_value = has_min ? *min : -std::numeric_limits<float>::infinity();
-    float max_value = has_max ? *max : std::numeric_limits<float>::infinity();
     if (desc->dtype == F16) {
-        return clip_cpu(desc, x, min_value, max_value, has_min, has_max, y);
+        return clip_cpu(desc, x, y);
     }
     if (desc->dtype == F32) {
-        return clip_cpu(desc, x, min_value, max_value, has_min, has_max, y);
+        return clip_cpu(desc, x, y);
     }
     return STATUS_BAD_TENSOR_DTYPE;
 }
 
 infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc){
     delete desc;
     return STATUS_SUCCESS;
-}
\ No newline at end of file
+}
diff --git a/src/ops/clip/cpu/clip_cpu.h b/src/ops/clip/cpu/clip_cpu.h
index 7006a351..9bc23753 100644
--- a/src/ops/clip/cpu/clip_cpu.h
+++ b/src/ops/clip/cpu/clip_cpu.h
@@ -5,6 +5,10 @@
 struct ClipCpuDescriptor {
     Device device;
     DT dtype;
+    float min;
+    float max;
+    bool has_min;
+    bool has_max;
     uint64_t element_num;
 };
 
@@ -13,13 +17,13 @@ typedef struct ClipCpuDescriptor *ClipCpuDescriptor_t;
 infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t handle,
                                          ClipCpuDescriptor_t *desc_ptr,
                                          infiniopTensorDescriptor_t x,
-                                         infiniopTensorDescriptor_t y
+                                         infiniopTensorDescriptor_t y,
+                                         float* min,
+                                         float* max
                                          );
 
 infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc,
                          void const *x,
-                         float *min,
-                         float *max,
                          void *y,
                          void *stream);
 
diff --git a/src/ops/clip/cuda/clip_cuda.cc b/src/ops/clip/cuda/clip_cuda.cc
index f8f7ca9b..cd3a48c3 100644
--- a/src/ops/clip/cuda/clip_cuda.cc
+++ b/src/ops/clip/cuda/clip_cuda.cc
@@ -5,7 +5,9 @@
 infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t handle,
                                           ClipCudaDescriptor_t *desc_ptr,
                                           infiniopTensorDescriptor_t x,
-                                          infiniopTensorDescriptor_t y
+                                          infiniopTensorDescriptor_t y,
+                                          float *min,
+                                          float *max
                                           ) {
     if (x->dt != F16 && x->dt != F32) {
         return STATUS_BAD_TENSOR_DTYPE;
@@ -24,11 +26,19 @@ infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t handle,
         element_num *= x->shape[i];
     }
     uint64_t ndim = y->ndim;
+    bool has_min = min != nullptr;
+    bool has_max = max != nullptr;
+    float min_ = has_min ? *min : std::numeric_limits<float>::lowest();
+    float max_ = has_max ? *max : std::numeric_limits<float>::max();
     *desc_ptr = new ClipCudaDescriptor{
         DevNvGpu,
         x->dt,
         ndim,
         element_num,
+        min_,
+        max_,
+        has_min,
+        has_max
     };
     return STATUS_SUCCESS;
 }
diff --git a/src/ops/clip/cuda/clip_cuda.cu b/src/ops/clip/cuda/clip_cuda.cu
index 17f6a4fa..ceb8663f 100644
--- a/src/ops/clip/cuda/clip_cuda.cu
+++ b/src/ops/clip/cuda/clip_cuda.cu
@@ -70,17 +70,15 @@ infiniopStatus_t clip_nv_gpu(
     ClipCudaDescriptor_t desc,
     void const *x,
     void *y,
-    float min_value,
-    float max_value,
     int per_thread_element,
     void* stream) {
     uint64_t N = desc->element_num;
     dim3 block(256 / per_thread_element);
     dim3 grid((N + 256 - 1) / 256);
     if constexpr(std::is_same::value){
-        clip_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N);
+        clip_f32x4_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), desc->max, desc->min, N);
     }else{
-        clip_f16x8_pack_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), max_value, min_value, N);
+        clip_f16x8_pack_kernel<<>>(reinterpret_cast(x), reinterpret_cast(y), desc->max, desc->min, N);
     }
     return STATUS_SUCCESS;
 }
@@ -88,25 +86,12 @@ infiniopStatus_t clip_nv_gpu(
 infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc,
                           void const *x,
                           void *y,
-                          float *min,
-                          float *max,
                           void *stream){
-    bool has_min = true;
-    bool has_max = true;
-    if (min == nullptr){
-        has_min = false;
-    }
-    if (max == nullptr){
-        has_max = false;
-    }
-    float min_value = has_min ? *min : -std::numeric_limits<float>::infinity();
-    float max_value = has_max ? *max : std::numeric_limits<float>::infinity();
-
     if (desc->dtype == F16) {
-        return clip_nv_gpu(desc, x, y, min_value, max_value, 8, stream);
+        return clip_nv_gpu(desc, x, y, 8, stream);
     }
     if (desc->dtype == F32) {
-        return clip_nv_gpu(desc, x, y, min_value, max_value, 4, stream);
+        return clip_nv_gpu(desc, x, y, 4, stream);
     }
     return STATUS_BAD_TENSOR_DTYPE;
 }
diff --git a/src/ops/clip/cuda/clip_cuda.h b/src/ops/clip/cuda/clip_cuda.h
index 76c6e7a5..b56911ed 100644
--- a/src/ops/clip/cuda/clip_cuda.h
+++ b/src/ops/clip/cuda/clip_cuda.h
@@ -10,6 +10,10 @@ typedef struct ClipCudaDescriptor {
     DT dtype;
     uint64_t ndim;
     uint64_t element_num;
+    float min;
+    float max;
+    bool has_min;
+    bool hax_max;
 } ClipCudaDescriptor;
 
 typedef struct ClipCudaDescriptor *ClipCudaDescriptor_t;
@@ -17,15 +21,15 @@ typedef struct ClipCudaDescriptor *ClipCudaDescriptor_t;
 infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t handle,
                                           ClipCudaDescriptor_t *desc_ptr,
                                           infiniopTensorDescriptor_t x,
-                                          infiniopTensorDescriptor_t y
+                                          infiniopTensorDescriptor_t y,
+                                          float* min,
+                                          float* max
                                           );
 
 infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc,
                           void const *x,
                           void *y,
-                          float *min,
-                          float *max,
                           void *stream);
 
 infiniopStatus_t cudaDestroyClipDescriptor(ClipCudaDescriptor_t desc);
 
diff --git a/src/ops/clip/operator.cc b/src/ops/clip/operator.cc
index e5e34964..271b9b96 100644
--- a/src/ops/clip/operator.cc
+++ b/src/ops/clip/operator.cc
@@ -15,16 +15,18 @@ __C infiniopStatus_t infiniopCreateClipDescriptor(
     infiniopHandle_t handle,
     infiniopClipDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t x,
-    infiniopTensorDescriptor_t y
+    infiniopTensorDescriptor_t y,
+    float* min,
+    float* max
     ) {
     switch (handle->device) {
 #ifdef ENABLE_CPU
         case DevCpu:
-            return cpuCreateClipDescriptor(handle, (ClipCpuDescriptor_t *) desc_ptr, x, y);
+            return cpuCreateClipDescriptor(handle, (ClipCpuDescriptor_t *) desc_ptr, x, y, min, max);
 #endif
 #ifdef ENABLE_NV_GPU
         case DevNvGpu: {
-            return cudaCreateClipDescriptor((CudaHandle_t) handle, (ClipCudaDescriptor_t *) desc_ptr, x, y);
+            return cudaCreateClipDescriptor((CudaHandle_t) handle, (ClipCudaDescriptor_t *) desc_ptr, x, y, min, max);
         }
 #endif
     }
@@ -32,15 +34,15 @@ __C infiniopStatus_t infiniopCreateClipDescriptor(
 }
 
 
-__C infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, float *min, float *max, void *y, void *stream) {
+__C infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, void *y, void *stream) {
     switch (desc->device) {
 #ifdef ENABLE_CPU
         case DevCpu:
-            return cpuClip((ClipCpuDescriptor_t) desc, x, min, max, y, stream);
+            return cpuClip((ClipCpuDescriptor_t) desc, x, y, stream);
 #endif
 #ifdef ENABLE_NV_GPU
         case DevNvGpu:
-            return cudaClip((ClipCudaDescriptor_t) desc, x, y, min, max, stream);
+            return cudaClip((ClipCudaDescriptor_t) desc, x, y, stream);
 #endif
     }
     return STATUS_BAD_DEVICE;
diff --git a/src/ops/reducemean/operator.cc b/src/ops/reducemean/operator.cc
index 5ea0121a..00b62374 100644
--- a/src/ops/reducemean/operator.cc
+++ b/src/ops/reducemean/operator.cc
@@ -1,17 +1,17 @@
 #include "../utils.h"
 #include "../reduce/reduce.h"
-#include "ops/reducemin/reducemin.h"
+#include "ops/reducemean/reducemean.h"
 
-struct _ReduceminDescriptor {
+struct _ReducemeanDescriptor {
     Device device;
     infiniopReduceDescriptor_t reduce_desc;
 };
 
-typedef struct _ReduceminDescriptor *_ReduceminDescriptor_t;
+typedef struct _ReducemeanDescriptor *_ReducemeanDescriptor_t;
 
-__C __export infiniopStatus_t infiniopCreateReduceminDescriptor(
+__C __export infiniopStatus_t infiniopCreateReducemeanDescriptor(
     infiniopHandle_t handle,
-    infiniopReduceminDescriptor_t *desc_ptr,
+    infiniopReducemeanDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t y,
     infiniopTensorDescriptor_t x,
     int64_t const *axes,
@@ -20,21 +20,22 @@ __C __export infiniopStatus_t infiniopCreateReduceminDescriptor(
     bool noop_with_empty_axes
     ) {
     infiniopReduceDescriptor_t reduce_desc;
-    CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, y, x, axes, n, keepdims, noop_with_empty_axes,2), STATUS_SUCCESS);
-    *(_ReduceminDescriptor_t *) desc_ptr = new _ReduceminDescriptor{
+    CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, y, x, axes, n, keepdims, noop_with_empty_axes, 0), STATUS_SUCCESS);
+    *(_ReducemeanDescriptor_t *) desc_ptr = new _ReducemeanDescriptor{
         handle->device,
         reduce_desc
     };
     return STATUS_SUCCESS;
 }
-__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *y, const void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) {
-    auto _desc = (_ReduceminDescriptor_t) desc;
+
+__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *y, void const *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) {
+    auto _desc = (_ReducemeanDescriptor_t) desc;
     CHECK_STATUS(infiniopReduce(_desc->reduce_desc, y, x, dynamic_axes, dynamic_axes_size, stream), STATUS_SUCCESS);
     return STATUS_SUCCESS;
 }
 
-__C __export infiniopStatus_t infiniopDestroyReduceminDescriptor(infiniopReduceminDescriptor_t desc) {
-    CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceminDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS);
+__C __export infiniopStatus_t infiniopDestroyReducemeanDescriptor(infiniopReducemeanDescriptor_t desc) {
+    CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReducemeanDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS);
     delete desc;
     return STATUS_SUCCESS;
 }
\ No newline at end of file
diff --git a/src/ops/reducemin/operator.cc b/src/ops/reducemin/operator.cc
index 3a66a218..b17ff2b8 100644
--- a/src/ops/reducemin/operator.cc
+++ b/src/ops/reducemin/operator.cc
@@ -27,7 +27,7 @@ __C __export infiniopStatus_t infiniopCreateReduceminDescriptor(
     };
     return STATUS_SUCCESS;
 }
-__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) {
+__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *y, void const *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream) {
     auto _desc = (_ReduceminDescriptor_t) desc;
     CHECK_STATUS(infiniopReduce(_desc->reduce_desc, y, x, dynamic_axes, dynamic_axes_size, stream), STATUS_SUCCESS);
     return STATUS_SUCCESS;
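
With this series the clip bounds are captured once, when the descriptor is created, and a null min or max is recorded through the has_min/has_max fields as an open bound; infiniopClip itself no longer takes the bound pointers. A minimal calling sketch against the revised API (handle, x_desc, y_desc, x_data, y_data, and stream are placeholder names assumed to be set up elsewhere; return statuses are left unchecked for brevity):

    float min = -1.0f, max = 1.0f;
    infiniopClipDescriptor_t clip_desc;
    /* bounds travel with the descriptor from creation time on;
       pass NULL for min or max to leave that side unbounded */
    infiniopCreateClipDescriptor(handle, &clip_desc, x_desc, y_desc, &min, &max);
    /* the per-call signature is now just tensors plus stream */
    infiniopClip(clip_desc, x_data, y_data, stream);
    infiniopDestroyClipDescriptor(clip_desc);
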