From 532eaa6ec9211390d62933cacaef72347976142a Mon Sep 17 00:00:00 2001
From: stever178 <2874146120@qq.com>
Date: Fri, 21 Mar 2025 14:11:40 +0800
Subject: [PATCH 1/5] add codes for clip, where, gather, reduce

---
 include/ops/clip/clip.h | 27 ++
 include/ops/gather/gather.h | 28 ++
 include/ops/reduce_max/reduce_max.h | 30 ++
 include/ops/reduce_mean/reduce_mean.h | 30 ++
 include/ops/reduce_min/reduce_min.h | 30 ++
 include/ops/where/where.h | 29 ++
 operatorspy/tests/clip.py | 160 ++++++++++
 operatorspy/tests/gather.py | 172 +++++++++++
 operatorspy/tests/reduce_max.py | 224 ++++++++++++++
 operatorspy/tests/reduce_mean.py | 224 ++++++++++++++
 operatorspy/tests/reduce_min.py | 224 ++++++++++++++
 operatorspy/tests/where.py | 165 ++++++++++
 src/ops/clip/cpu/clip_cpu.cc | 110 +++++++
 src/ops/clip/cpu/clip_cpu.h | 38 +++
 src/ops/clip/operator.cc | 85 +++++
 src/ops/gather/cpu/gather_cpu.cc | 160 ++++++++++
 src/ops/gather/cpu/gather_cpu.h | 41 +++
 src/ops/gather/operator.cc | 81 +++++
 src/ops/reduce/cpu/reduce_cpu.cc | 428 ++++++++++++++++++++++++++
 src/ops/reduce/cpu/reduce_cpu.h | 55 ++++
 src/ops/reduce/operator.cc | 105 +++++++
 src/ops/reduce/reduce.h | 30 ++
 src/ops/reduce_max/operator.cc | 60 ++++
 src/ops/reduce_mean/operator.cc | 59 ++++
 src/ops/reduce_min/operator.cc | 59 ++++
 src/ops/where/cpu/where_cpu.cc | 121 ++++++++
 src/ops/where/cpu/where_cpu.h | 37 +++
 src/ops/where/operator.cc | 82 +++++
 28 files changed, 2894 insertions(+)
 create mode 100644 include/ops/clip/clip.h
 create mode 100644 include/ops/gather/gather.h
 create mode 100644 include/ops/reduce_max/reduce_max.h
 create mode 100644 include/ops/reduce_mean/reduce_mean.h
 create mode 100644 include/ops/reduce_min/reduce_min.h
 create mode 100644 include/ops/where/where.h
 create mode 100644 operatorspy/tests/clip.py
 create mode 100644 operatorspy/tests/gather.py
 create mode 100644 operatorspy/tests/reduce_max.py
 create mode 100644 operatorspy/tests/reduce_mean.py
 create mode 100644 operatorspy/tests/reduce_min.py
 create mode 100644 operatorspy/tests/where.py
 create mode 100644 src/ops/clip/cpu/clip_cpu.cc
 create mode 100644 src/ops/clip/cpu/clip_cpu.h
 create mode 100644 src/ops/clip/operator.cc
 create mode 100644 src/ops/gather/cpu/gather_cpu.cc
 create mode 100644 src/ops/gather/cpu/gather_cpu.h
 create mode 100644 src/ops/gather/operator.cc
 create mode 100644 src/ops/reduce/cpu/reduce_cpu.cc
 create mode 100644 src/ops/reduce/cpu/reduce_cpu.h
 create mode 100644 src/ops/reduce/operator.cc
 create mode 100644 src/ops/reduce/reduce.h
 create mode 100644 src/ops/reduce_max/operator.cc
 create mode 100644 src/ops/reduce_mean/operator.cc
 create mode 100644 src/ops/reduce_min/operator.cc
 create mode 100644 src/ops/where/cpu/where_cpu.cc
 create mode 100644 src/ops/where/cpu/where_cpu.h
 create mode 100644 src/ops/where/operator.cc

diff --git a/include/ops/clip/clip.h b/include/ops/clip/clip.h
new file mode 100644
index 00000000..180bb3ea
--- /dev/null
+++ b/include/ops/clip/clip.h
@@ -0,0 +1,27 @@
+#ifndef CLIP_H
+#define CLIP_H
+
+#include "../../export.h"
+#include "../../operators.h"
+
+typedef struct ClipDescriptor {
+    Device device;
+} ClipDescriptor;
+
+typedef ClipDescriptor *infiniopClipDescriptor_t;
+
+__C __export infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle,
+                                                           infiniopClipDescriptor_t *desc_ptr,
+                                                           infiniopTensorDescriptor_t output,
+                                                           infiniopTensorDescriptor_t input,
+                                                           float min_value,
+                                                           float max_value);
+
+__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc,
+                                           void *output,
+                                           void const *input,
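+                                           // note: the stream argument that follows is an execution-stream handle; the CPU backend in this patch ignores it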
+ void *stream); + +__C __export infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc); + +#endif diff --git a/include/ops/gather/gather.h b/include/ops/gather/gather.h new file mode 100644 index 00000000..7c5e4c38 --- /dev/null +++ b/include/ops/gather/gather.h @@ -0,0 +1,28 @@ +#ifndef GATHER_H +#define GATHER_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct GatherDescriptor { + Device device; +} GatherDescriptor; + +typedef GatherDescriptor *infiniopGatherDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int64_t axis); + +__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, + void *output, + void const *data, + void const *indices, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); + +#endif diff --git a/include/ops/reduce_max/reduce_max.h b/include/ops/reduce_max/reduce_max.h new file mode 100644 index 00000000..6ef30477 --- /dev/null +++ b/include/ops/reduce_max/reduce_max.h @@ -0,0 +1,30 @@ +#ifndef REDUCE_MAX_H +#define REDUCE_MAX_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMaxDescriptor { + Device device; +} ReduceMaxDescriptor; + +typedef ReduceMaxDescriptor *infiniopReduceMaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMaxDescriptor(infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes); + +__C __export infiniopStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMax(infiniopReduceMaxDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *reduced, void const *data, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc); + +#endif diff --git a/include/ops/reduce_mean/reduce_mean.h b/include/ops/reduce_mean/reduce_mean.h new file mode 100644 index 00000000..5629687e --- /dev/null +++ b/include/ops/reduce_mean/reduce_mean.h @@ -0,0 +1,30 @@ +#ifndef REDUCE_MEAN_H +#define REDUCE_MEAN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMeanDescriptor { + Device device; +} ReduceMeanDescriptor; + +typedef ReduceMeanDescriptor *infiniopReduceMeanDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMeanDescriptor(infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes); + +__C __export infiniopStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMean(infiniopReduceMeanDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *reduced, void const *data, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc); + +#endif diff --git a/include/ops/reduce_min/reduce_min.h b/include/ops/reduce_min/reduce_min.h new file mode 100644 index 00000000..4a1459d6 --- /dev/null +++ 
b/include/ops/reduce_min/reduce_min.h @@ -0,0 +1,30 @@ +#ifndef REDUCE_MIN_H +#define REDUCE_MIN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMinDescriptor { + Device device; +} ReduceMinDescriptor; + +typedef ReduceMinDescriptor *infiniopReduceMinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMinDescriptor(infiniopHandle_t handle, + infiniopReduceMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes); + +__C __export infiniopStatus_t infiniopGetReduceMinWorkspaceSize(infiniopReduceMinDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMin(infiniopReduceMinDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *reduced, void const *data, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMinDescriptor(infiniopReduceMinDescriptor_t desc); + +#endif diff --git a/include/ops/where/where.h b/include/ops/where/where.h new file mode 100644 index 00000000..0cec3c5e --- /dev/null +++ b/include/ops/where/where.h @@ -0,0 +1,29 @@ +#ifndef WHERE_H +#define WHERE_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct WhereDescriptor { + Device device; +} WhereDescriptor; + +typedef WhereDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t condition, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y); + +__C __export infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *output, + void const *condition, + void const *x, + void const *y, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); + +#endif diff --git a/operatorspy/tests/clip.py b/operatorspy/tests/clip.py new file mode 100644 index 00000000..10a1ee6f --- /dev/null +++ b/operatorspy/tests/clip.py @@ -0,0 +1,160 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_float +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_ = auto() + + +class ClipDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopClipDescriptor_t = POINTER(ClipDescriptor) + +def clip(x, min_value, max_value): + return torch.clip(x, min=min_value, max=max_value) + + +def test( + lib, + handle, + torch_device, + o_shape, + i_shape, + min_value=None, + max_value=None, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Clip on {torch_device} with o_shape:{o_shape} i_shape:{i_shape} \ + min_value:{min_value} max_value:{max_value} \ + dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if o_shape != i_shape: + print("Unsupported test: unmatched shapes for input and output") + return + + input = torch.rand(i_shape, dtype=tensor_dtype).to(torch_device) + output = torch.rand(o_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else input + + if min_value != None: 
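+        # convert the given bound to a tensor of the test dtype; the else branches substitute -inf/+inf when a bound is None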
+ min_value = torch.tensor(min_value, dtype=tensor_dtype, device=torch_device) + else: + min_value = torch.tensor(float("-inf"), dtype=tensor_dtype, device=torch_device) + if max_value != None: + max_value = torch.tensor(max_value, dtype=tensor_dtype, device=torch_device) + else: + max_value = torch.tensor(float("inf"), dtype=tensor_dtype, device=torch_device) + + descriptor = infiniopClipDescriptor_t() + i_tensor = to_tensor(input, lib) + o_tensor = to_tensor(output, lib) if inplace == Inplace.OUT_OF_PLACE else (i_tensor) + + ans = clip(input, min_value, max_value) + + check_error( + lib.infiniopCreateClipDescriptor( + handle, + ctypes.byref(descriptor), + o_tensor.descriptor, + i_tensor.descriptor, + min_value.item(), + max_value.item(), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + o_tensor.descriptor.contents.invalidate() + i_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopClip(descriptor, + o_tensor.data, + i_tensor.data, + None) + ) + # print(f" min:{min_value}, max:{max_value}, input:{input}, ans:{ans}, output:{output},") + assert torch.allclose(output, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyClipDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for o_shape, i_shape, min_value, max_value, inplace in test_cases: + test(lib, handle, "cpu", o_shape, i_shape, min_value, max_value, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", o_shape, i_shape, min_value, max_value, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # o_shape, i_shape, min, max, inplace + ((1, 3), (1, 3), None, None, Inplace.OUT_OF_PLACE), + ((3, 3), (3, 3), 1.0, 2.0, Inplace.OUT_OF_PLACE), + ((), (), None, None, Inplace.OUT_OF_PLACE), + ((2, 20, 3), (2, 20, 3), 0., -2.0, Inplace.INPLACE_), + + ((32, 20, 512), (32, 20, 512), -0.3, 0.3, Inplace.INPLACE_), + ((32, 256, 112, 112), (32, 256, 112, 112), -0.1, 0.1, Inplace.OUT_OF_PLACE), + # ((32, 150, 5120), (32, 150, 5120), None, None, Inplace.OUT_OF_PLACE), + + ((2, 4, 3), (2, 1, 3), None, None, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), -0.1, 0.1, Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), None, None, Inplace.OUT_OF_PLACE), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateClipDescriptor.restype = c_int32 + lib.infiniopCreateClipDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopClipDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + c_float, + ] + lib.infiniopClip.restype = c_int32 + lib.infiniopClip.argtypes = [ + infiniopClipDescriptor_t, + c_void_p, + c_void_p, + # + c_void_p, + ] + lib.infiniopDestroyClipDescriptor.restype = c_int32 + lib.infiniopDestroyClipDescriptor.argtypes = [ + infiniopClipDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + # if args.cuda: + # test_cuda(lib, test_cases) + # if args.bang: + # test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/gather.py b/operatorspy/tests/gather.py new file mode 100644 index 00000000..118ac342 --- /dev/null +++ b/operatorspy/tests/gather.py @@ -0,0 +1,172 @@ +from ctypes import POINTER, Structure, c_int32, c_int64, c_void_p +import ctypes +import sys +import os + +sys.path.insert(0, 
os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch +import numpy as np + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + + +class GatherDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopGatherDescriptor_t = POINTER(GatherDescriptor) + + +def gather(data, indices, axis): + np_input = data.numpy() + np_indices = indices.numpy() + np_output = np.take(np_input, np_indices, axis=axis) + return torch.from_numpy(np_output) + + +def test( + lib, + handle, + torch_device, + o_shape, + data_shape, + indices_shape, + axis=0, + tensor_dtype=torch.float16, + indices_dtype=torch.int32, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Gather on {torch_device} with o_shape:{o_shape} data_shape:{data_shape} indices_shape:{indices_shape} \ + dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: calculatin in-place is not supported") + return + if data_shape != indices_shape: + print("Unsupported test: broadcasting does not support in-place") + return + # q+r-1 + if len(o_shape) != len(indices_shape) + len(data_shape) - 1: + print("Unsupported test: output-shape is not valid") + return + + # indices = torch.rand(indices_shape, dtype=indices_dtype).to(torch_device) + indices = torch.randint(0, 3, indices_shape, dtype=indices_dtype).to(torch_device) + data = torch.rand(data_shape, dtype=tensor_dtype).to(torch_device) + output = torch.rand(o_shape, dtype=tensor_dtype).to(torch_device) + + descriptor = infiniopGatherDescriptor_t() + data_tensor = to_tensor(data, lib) + indices_tensor = to_tensor(indices, lib) + o_tensor = to_tensor(output, lib) + + ans = gather(data, indices, axis) + + check_error( + lib.infiniopCreateGatherDescriptor( + handle, + ctypes.byref(descriptor), + o_tensor.descriptor, + data_tensor.descriptor, + indices_tensor.descriptor, + axis, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + o_tensor.descriptor.contents.invalidate() + data_tensor.descriptor.contents.invalidate() + indices_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopGather(descriptor, + o_tensor.data, + data_tensor.data, + indices_tensor.data, + None) + ) + assert torch.allclose(output, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyGatherDescriptor(descriptor)) + + +def get_o_shape(input_shape, indices_shape, axis): + output_shape = input_shape[:axis] + tuple(indices_shape) + input_shape[axis + 1:] + return output_shape + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for data_shape, indices_shape, axis, inplace in test_cases: + o_shape = get_o_shape(data_shape, indices_shape, axis) + test(lib, handle, "cpu", o_shape, data_shape, indices_shape, axis, tensor_dtype=torch.float16, indices_dtype=torch.int32, inplace=inplace) + test(lib, handle, "cpu", o_shape, data_shape, indices_shape, axis, tensor_dtype=torch.float32, indices_dtype=torch.int64, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # q+r-1 + # data_shape, indices_shape, axis, inplace + ((3, 3), (2, ), 0, Inplace.OUT_OF_PLACE), + ((4, 3, 4), (2, 2), 1, Inplace.OUT_OF_PLACE), + ((5, 3, 4, 
5), (3, 2), 2, Inplace.OUT_OF_PLACE), + + # ((), (), (), Inplace.OUT_OF_PLACE), + # ((32, 20, 512), (32, 20, 512), 0, Inplace.OUT_OF_PLACE), + # ((3, 2, 4, 5), (4, 5), 0, Inplace.OUT_OF_PLACE), + # ((32, 256, 112, 112), (32, 256, 112, 1), 0, Inplace.OUT_OF_PLACE), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateGatherDescriptor.restype = c_int32 + lib.infiniopCreateGatherDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopGatherDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int64, + ] + lib.infiniopGather.restype = c_int32 + lib.infiniopGather.argtypes = [ + infiniopGatherDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + # + c_void_p, + ] + lib.infiniopDestroyGatherDescriptor.restype = c_int32 + lib.infiniopDestroyGatherDescriptor.argtypes = [ + infiniopGatherDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + # if args.cuda: + # test_cuda(lib, test_cases) + # if args.bang: + # test_bang(lib, test_cases) + # if args.musa: + # test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/reduce_max.py b/operatorspy/tests/reduce_max.py new file mode 100644 index 00000000..38da1bce --- /dev/null +++ b/operatorspy/tests/reduce_max.py @@ -0,0 +1,224 @@ +from ctypes import POINTER, Structure, c_void_p, c_int32, c_uint64, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch +import numpy as np + +from typing import Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + + +class ReduceMaxDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopReduceMaxDescriptor_t = POINTER(ReduceMaxDescriptor) + + +def reduce_max(input, axis, keepdims=True, noop_with_empty_axes=False): + if axis == (): + if noop_with_empty_axes: + return input + else: + return torch.amax(input, dim=axis, keepdim=keepdims) + return torch.amax(input, dim=axis, keepdim=keepdims) + +# convert a python tuple to a ctype void pointer +def tuple_to_int64_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.POINTER(ctypes.c_int64)) + +def test( + lib, + handle, + torch_device, + data_shape, + axes, + keepdims=True, + noop_with_empty_axes=False, + tensor_dtype=torch.float16, +): + print( + f"Testing ReduceMax on {torch_device} with data_shape:{data_shape} axes:{axes} keepdims:{keepdims} noop_with_empty_axes:{noop_with_empty_axes} dtype:{tensor_dtype}" + ) + + # if inplace != Inplace.OUT_OF_PLACE: + # print("Unsupported test: calculatin in-place is not supported") + # return + + data = torch.rand(data_shape, dtype=tensor_dtype).to(torch_device) + ans = reduce_max(data, axes, keepdims, noop_with_empty_axes) + reduced = torch.zeros(ans.shape, dtype=tensor_dtype).to(torch_device) + # print(f"in pytorch, data.size:{data.size()}, 
reduced.size:{reduced.size()}") + + for _ in range(NUM_PRERUN if PROFILE else 0): + ans = reduce_max(data, axes, keepdims, noop_with_empty_axes) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduce_max(data, axes, keepdims, noop_with_empty_axes) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + reduced_tensor = to_tensor(reduced, lib) + data_tensor = to_tensor(data, lib) + descriptor = infiniopReduceMaxDescriptor_t() + + check_error( + lib.infiniopCreateReduceMaxDescriptor( + handle, + ctypes.byref(descriptor), + reduced_tensor.descriptor, + data_tensor.descriptor, + tuple_to_int64_p(axes), + c_uint64(len(axes)), + c_bool(keepdims), + c_bool(noop_with_empty_axes), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + reduced_tensor.descriptor.contents.invalidate() + data_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMaxWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + # print("!!") + + for _ in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopReduceMax( + descriptor, + workspace_ptr, + workspaceSize, + reduced_tensor.data, + data_tensor.data, + None) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMax( + descriptor, + workspace_ptr, + workspaceSize, + reduced_tensor.data, + data_tensor.data, + None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + # print(f" reduced: {reduced}\n ans: {ans}") + assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyReduceMaxDescriptor(descriptor)) + # print("!!!!") + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for data_shape, axes, keepdims, noop_with_empty_axes in test_cases: + test(lib, handle, "cpu", data_shape, axes, keepdims, noop_with_empty_axes, tensor_dtype=torch.float16) + test(lib, handle, "cpu", data_shape, axes, keepdims, noop_with_empty_axes, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # data_shape, axes, keepdims, noop_with_empty_axes + + ((1, 3), (), True, True), + ((1, 3), (), False, True), + + ((2, 3), (0,), True, False), + ((32, 20, 512), (2,), True, False), + ((2, 3, 4, 5), (1, 3), True, False), + ((3, 2, 5, 4), (0, 1, 2, 3), True, False), + ((32, 56, 112, 112), (0, 1, 2), True, False), + + ((2, 3, 5), (0,), False, False), + ((2, 3, 4, 5), (1, 3), False, False), + ((32, 20, 512), (1, 2), False, False), + ((3, 2, 5, 4), (0, 1, 2, 3), False, False), + ((64, 64, 64, 64), (0, 2, 3), False, False), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMaxDescriptor.restype = c_int32 + lib.infiniopCreateReduceMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMaxDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + POINTER(ctypes.c_int64), + c_uint64, + c_bool, + c_bool, + ] + lib.infiniopGetReduceMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMaxWorkspaceSize.argtypes = [ + infiniopReduceMaxDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceMax.restype = c_int32 + 
lib.infiniopReduceMax.argtypes = [ + infiniopReduceMaxDescriptor_t, + c_void_p, + c_uint64, + # + c_void_p, + c_void_p, + # + c_void_p, + ] + lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMaxDescriptor.argtypes = [ + infiniopReduceMaxDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + # if args.cuda: + # test_cuda(lib, test_cases) + # if args.bang: + # test_bang(lib, test_cases) + # if args.musa: + # test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/reduce_mean.py b/operatorspy/tests/reduce_mean.py new file mode 100644 index 00000000..c3a714fd --- /dev/null +++ b/operatorspy/tests/reduce_mean.py @@ -0,0 +1,224 @@ +from ctypes import POINTER, Structure, c_void_p, c_int32, c_uint64, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch +import numpy as np + +from typing import Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + + +class ReduceMeanDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopReduceMeanDescriptor_t = POINTER(ReduceMeanDescriptor) + + +def reduce_mean(input, axis, keepdims=True, noop_with_empty_axes=False): + if axis == (): + if noop_with_empty_axes: + return input + else: + return torch.mean(input, dim=axis, keepdim=keepdims) + return torch.mean(input, dim=axis, keepdim=keepdims) + +# convert a python tuple to a ctype void pointer +def tuple_to_int64_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.POINTER(ctypes.c_int64)) + +def test( + lib, + handle, + torch_device, + data_shape, + axes, + keepdims=True, + noop_with_empty_axes=False, + tensor_dtype=torch.float16, +): + print( + f"Testing ReduceMean on {torch_device} with data_shape:{data_shape} axes:{axes} keepdims:{keepdims} noop_with_empty_axes:{noop_with_empty_axes} dtype:{tensor_dtype}" + ) + + # if inplace != Inplace.OUT_OF_PLACE: + # print("Unsupported test: calculatin in-place is not supported") + # return + + data = torch.rand(data_shape, dtype=tensor_dtype).to(torch_device) + ans = reduce_mean(data, axes, keepdims, noop_with_empty_axes) + reduced = torch.zeros(ans.shape, dtype=tensor_dtype).to(torch_device) + # print(f"in pytorch, data.size:{data.size()}, reduced.size:{reduced.size()}") + + for _ in range(NUM_PRERUN if PROFILE else 0): + ans = reduce_mean(data, axes, keepdims, noop_with_empty_axes) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduce_mean(data, axes, keepdims, noop_with_empty_axes) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + reduced_tensor = to_tensor(reduced, lib) + data_tensor = to_tensor(data, lib) + descriptor = infiniopReduceMeanDescriptor_t() + + check_error( + lib.infiniopCreateReduceMeanDescriptor( + handle, + 
ctypes.byref(descriptor), + reduced_tensor.descriptor, + data_tensor.descriptor, + tuple_to_int64_p(axes), + c_uint64(len(axes)), + c_bool(keepdims), + c_bool(noop_with_empty_axes), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + reduced_tensor.descriptor.contents.invalidate() + data_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMeanWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + # print("!!") + + for _ in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopReduceMean( + descriptor, + workspace_ptr, + workspaceSize, + reduced_tensor.data, + data_tensor.data, + None) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMean( + descriptor, + workspace_ptr, + workspaceSize, + reduced_tensor.data, + data_tensor.data, + None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + # print(f"reduced: {reduced}\nans: {ans}") + assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyReduceMeanDescriptor(descriptor)) + # print("!!!!") + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for data_shape, axes, keepdims, noop_with_empty_axes in test_cases: + test(lib, handle, "cpu", data_shape, axes, keepdims, noop_with_empty_axes, tensor_dtype=torch.float16) + test(lib, handle, "cpu", data_shape, axes, keepdims, noop_with_empty_axes, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # data_shape, axes, keepdims, noop_with_empty_axes + + ((1, 3), (), True, True), + ((1, 3), (), False, True), + + ((2, 3), (0,), True, False), + ((32, 20, 512), (2,), True, False), + ((2, 3, 4, 5), (1, 3), True, False), + ((3, 2, 5, 4), (0, 1, 2, 3), True, False), + ((32, 56, 112, 112), (0, 1, 2), True, False), + + ((2, 3, 5), (0,), False, False), + ((2, 3, 4, 5), (1, 3), False, False), + ((32, 20, 512), (1, 2), False, False), + ((3, 2, 5, 4), (0, 1, 2, 3), False, False), + ((64, 64, 64, 64), (0, 2, 3), False, False), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 + lib.infiniopCreateReduceMeanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMeanDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + POINTER(ctypes.c_int64), + c_uint64, + c_bool, + c_bool, + ] + lib.infiniopGetReduceMeanWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMeanWorkspaceSize.argtypes = [ + infiniopReduceMeanDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceMean.restype = c_int32 + lib.infiniopReduceMean.argtypes = [ + infiniopReduceMeanDescriptor_t, + c_void_p, + c_uint64, + # + c_void_p, + c_void_p, + # + c_void_p, + ] + lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMeanDescriptor.argtypes = [ + infiniopReduceMeanDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + # if args.cuda: + # test_cuda(lib, test_cases) + # if args.bang: + # test_bang(lib, test_cases) + # if args.musa: + # test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + 
print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/reduce_min.py b/operatorspy/tests/reduce_min.py new file mode 100644 index 00000000..2db240d0 --- /dev/null +++ b/operatorspy/tests/reduce_min.py @@ -0,0 +1,224 @@ +from ctypes import POINTER, Structure, c_void_p, c_int32, c_uint64, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch +import numpy as np + +from typing import Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + + +class ReduceMinDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopReduceMinDescriptor_t = POINTER(ReduceMinDescriptor) + + +def reduce_min(input, axis, keepdims=True, noop_with_empty_axes=False): + if axis == (): + if noop_with_empty_axes: + return input + else: + return torch.amin(input, dim=axis, keepdim=keepdims) + return torch.amin(input, dim=axis, keepdim=keepdims) + +# convert a python tuple to a ctype void pointer +def tuple_to_int64_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.POINTER(ctypes.c_int64)) + +def test( + lib, + handle, + torch_device, + data_shape, + axes, + keepdims=True, + noop_with_empty_axes=False, + tensor_dtype=torch.float16, +): + print( + f"Testing ReduceMin on {torch_device} with data_shape:{data_shape} axes:{axes} keepdims:{keepdims} noop_with_empty_axes:{noop_with_empty_axes} dtype:{tensor_dtype}" + ) + + # if inplace != Inplace.OUT_OF_PLACE: + # print("Unsupported test: calculatin in-place is not supported") + # return + + data = torch.rand(data_shape, dtype=tensor_dtype).to(torch_device) + ans = reduce_min(data, axes, keepdims, noop_with_empty_axes) + reduced = torch.zeros(ans.shape, dtype=tensor_dtype).to(torch_device) + # print(f"in pytorch, data.size:{data.size()}, reduced.size:{reduced.size()}") + + for _ in range(NUM_PRERUN if PROFILE else 0): + ans = reduce_min(data, axes, keepdims, noop_with_empty_axes) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduce_min(data, axes, keepdims, noop_with_empty_axes) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + reduced_tensor = to_tensor(reduced, lib) + data_tensor = to_tensor(data, lib) + descriptor = infiniopReduceMinDescriptor_t() + + check_error( + lib.infiniopCreateReduceMinDescriptor( + handle, + ctypes.byref(descriptor), + reduced_tensor.descriptor, + data_tensor.descriptor, + tuple_to_int64_p(axes), + c_uint64(len(axes)), + c_bool(keepdims), + c_bool(noop_with_empty_axes), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + reduced_tensor.descriptor.contents.invalidate() + data_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMinWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = 
torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + # print("!!") + + for _ in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopReduceMin( + descriptor, + workspace_ptr, + workspaceSize, + reduced_tensor.data, + data_tensor.data, + None) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMin( + descriptor, + workspace_ptr, + workspaceSize, + reduced_tensor.data, + data_tensor.data, + None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + # print(f"reduced: {reduced}\nans: {ans}") + assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyReduceMinDescriptor(descriptor)) + # print("!!!!") + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for data_shape, axes, keepdims, noop_with_empty_axes in test_cases: + test(lib, handle, "cpu", data_shape, axes, keepdims, noop_with_empty_axes, tensor_dtype=torch.float16) + test(lib, handle, "cpu", data_shape, axes, keepdims, noop_with_empty_axes, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # data_shape, axes, keepdims, noop_with_empty_axes + + ((1, 3), (), True, True), + ((1, 3), (), False, True), + + ((2, 3), (0,), True, False), + ((32, 20, 512), (2,), True, False), + ((2, 3, 4, 5), (1, 3), True, False), + ((3, 2, 5, 4), (0, 1, 2, 3), True, False), + ((32, 56, 112, 112), (0, 1, 2), True, False), + + ((2, 3, 5), (0,), False, False), + ((2, 3, 4, 5), (1, 3), False, False), + ((32, 20, 512), (1, 2), False, False), + ((3, 2, 5, 4), (0, 1, 2, 3), False, False), + ((64, 64, 64, 64), (0, 2, 3), False, False), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMinDescriptor.restype = c_int32 + lib.infiniopCreateReduceMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMinDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + POINTER(ctypes.c_int64), + c_uint64, + c_bool, + c_bool, + ] + lib.infiniopGetReduceMinWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMinWorkspaceSize.argtypes = [ + infiniopReduceMinDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceMin.restype = c_int32 + lib.infiniopReduceMin.argtypes = [ + infiniopReduceMinDescriptor_t, + c_void_p, + c_uint64, + # + c_void_p, + c_void_p, + # + c_void_p, + ] + lib.infiniopDestroyReduceMinDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMinDescriptor.argtypes = [ + infiniopReduceMinDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + # if args.cuda: + # test_cuda(lib, test_cases) + # if args.bang: + # test_bang(lib, test_cases) + # if args.musa: + # test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/where.py b/operatorspy/tests/where.py new file mode 100644 index 00000000..46dbacc8 --- /dev/null +++ b/operatorspy/tests/where.py @@ -0,0 +1,165 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_float +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + 
check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + INPLACE_Y = auto() + + +class WhereDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopWhereDescriptor_t = POINTER(WhereDescriptor) + +def where(condition, x, y): + return torch.where(condition, x, y) + + +def test( + lib, + handle, + torch_device, + o_shape, + condition_shape, + x_shape, + y_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Where on {torch_device} with o_shape:{o_shape} x_shape:{x_shape} y_shape:{y_shape}\ + dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if x_shape != y_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device) + output = torch.rand(o_shape, dtype=tensor_dtype).to(torch_device) \ + if inplace == Inplace.OUT_OF_PLACE else (x if inplace == Inplace.INPLACE_X else y) + condition = torch.randint(0, 2, condition_shape, dtype=torch.uint8).to(torch_device) + + descriptor = infiniopWhereDescriptor_t() + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + o_tensor = to_tensor(output, lib) \ + if inplace == Inplace.OUT_OF_PLACE else (x_tensor if inplace == Inplace.INPLACE_X else y_tensor) + condition_tensor = to_tensor(condition, lib) + + ans = where(condition, x, y) + + check_error( + lib.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + o_tensor.descriptor, + condition_tensor.descriptor, + x_tensor.descriptor, + y_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + o_tensor.descriptor.contents.invalidate() + condition_tensor.descriptor.contents.invalidate() + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopWhere(descriptor, + o_tensor.data, + condition_tensor.data, + x_tensor.data, + y_tensor.data, + None) + ) + + # print(f"--ans:{ans}, output:{output}") + assert torch.allclose(output, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyWhereDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for o_shape, condition_shape, x_shape, y_shape, inplace in test_cases: + test(lib, handle, "cpu", o_shape, condition_shape, x_shape, y_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", o_shape, condition_shape, x_shape, y_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # o_shape, condition_shape, x_shape, y_shape, inplace + ((1, 3), (1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((3, 3), (3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE), + ((), (), (), (), Inplace.OUT_OF_PLACE), + ((2, 20, 3), (2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE), + + ((32, 20, 512), (32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_X), + ((32, 20, 512), (32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_Y), + ((32, 150, 512), (32, 150, 512), (32, 150, 512), (32, 150, 1), Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + + ((2, 4, 3), (2, 4, 3), (2, 1, 3), (4, 3), 
Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopWhereDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopWhereDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + # + c_void_p, + ] + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopWhereDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + # if args.cuda: + # test_cuda(lib, test_cases) + # if args.bang: + # test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/src/ops/clip/cpu/clip_cpu.cc b/src/ops/clip/cpu/clip_cpu.cc new file mode 100644 index 00000000..12d86f3c --- /dev/null +++ b/src/ops/clip/cpu/clip_cpu.cc @@ -0,0 +1,110 @@ +#include "clip_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" +#include + +inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) { + for (int64_t i = ndim - 1; i >= 0; --i) { + if (++indices[i] != shape[i]) { + return; + } + indices[i] = 0; + } +} + +inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) { + return std::inner_product(indices, indices + ndim, strides, uint64_t(0)); +} + +infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t, + ClipCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float min_value = std::numeric_limits::lowest(), + float max_value = std::numeric_limits::max()) { + uint64_t ndim = output->ndim; + + if (!is_contiguous(output) || !is_contiguous(input)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (output->dt != F16 && output->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (output->dt != input->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t o_data_size = std::accumulate( + output->shape, output->shape + output->ndim, 1ULL, std::multiplies()); + + uint64_t *i_strides = new uint64_t[ndim]; + std::copy(input->strides, input->strides + ndim, i_strides); + + uint64_t *o_indices = new uint64_t[ndim]; + std::fill(o_indices, o_indices + ndim, 0); + + uint64_t *o_shape = new uint64_t[ndim]; + std::copy(output->shape, output->shape + ndim, o_shape); + + *desc_ptr = new ClipCpuDescriptor{ + DevCpu, + output->dt, + ndim, + o_data_size, + o_shape, + i_strides, + o_indices, + min_value, + max_value, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc) { + delete[] desc->o_shape; + delete[] desc->i_strides; + delete[] desc->o_indices; + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t clip_cpu(ClipCpuDescriptor_t desc, + void *output, + void const *input) { + auto i_ = reinterpret_cast(input); + auto o_ = reinterpret_cast(output); + const auto &indices = desc->o_indices; + + float min_value = desc->min_value; + float max_value = desc->max_value; + for (uint64_t i = 0; i < desc->o_data_size; ++i, incrementOne(indices, desc->o_shape, desc->ndim)) { + auto i_index = compactToFlat(indices, desc->i_strides, 
desc->ndim); + if constexpr (std::is_same::value) { + o_[i] = f32_to_f16(std::fmin( + max_value, + std::fmax(f16_to_f32(i_[i_index]), min_value))); + } else { + o_[i] = std::fmin( + max_value, + std::fmax(i_[i_index], min_value)); + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc, + void *output, + void const *input, + // float min_value = std::numeric_limits::lowest(), + // float max_value = std::numeric_limits::max(), + void *stream = nullptr) { + if (desc->dtype == F16) { + return clip_cpu(desc, output, input); + } + if (desc->dtype == F32) { + return clip_cpu(desc, output, input); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/clip/cpu/clip_cpu.h b/src/ops/clip/cpu/clip_cpu.h new file mode 100644 index 00000000..c9c45add --- /dev/null +++ b/src/ops/clip/cpu/clip_cpu.h @@ -0,0 +1,38 @@ +#ifndef __CPU_CLIP_H__ +#define __CPU_CLIP_H__ + +#include "operators.h" +#include +#include + +struct ClipCpuDescriptor { + Device device; + DT dtype; + uint64_t ndim; + uint64_t o_data_size; + uint64_t const *o_shape; + uint64_t const *i_strides; + uint64_t *o_indices; + float min_value; + float max_value; +}; + +typedef struct ClipCpuDescriptor *ClipCpuDescriptor_t; + +infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t, + ClipCpuDescriptor_t *, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float min_value, + float max_value); + +infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc, + void *output, + void const *input, + // float min_value, + // float max_value, + void *stream); + +infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc); + +#endif diff --git a/src/ops/clip/operator.cc b/src/ops/clip/operator.cc new file mode 100644 index 00000000..18a00169 --- /dev/null +++ b/src/ops/clip/operator.cc @@ -0,0 +1,85 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/clip/clip.h" + +#ifdef ENABLE_CPU +#include "cpu/clip_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +// #include "../../devices/cuda/cuda_handle.h" +// #include "cuda/clip.cuh" +#endif +#ifdef ENABLE_MTHREADS_GPU +// #include "musa/clip_musa.h" +#endif + +__C infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle, + infiniopClipDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float min, + float max) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateClipDescriptor(handle, (ClipCpuDescriptor_t *) desc_ptr, output, input, min, max); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, + void *output, + void const *input, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuClip((ClipCpuDescriptor_t) desc, output, input, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyClipDescriptor((ClipCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case 
DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/gather/cpu/gather_cpu.cc b/src/ops/gather/cpu/gather_cpu.cc new file mode 100644 index 00000000..8e276235 --- /dev/null +++ b/src/ops/gather/cpu/gather_cpu.cc @@ -0,0 +1,160 @@ +#include "gather_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +// inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) { +// for (int64_t i = ndim - 1; i >= 0; --i) { +// if (++indices[i] != shape[i]) { +// return; +// } +// indices[i] = 0; +// } +// } + +inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) { + return std::inner_product(indices, indices + ndim, strides, uint64_t(0)); +} + +infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t, + GatherCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int64_t axis = 0) { + uint64_t indices_ndim = indices->ndim;// rank q + uint64_t data_ndim = data->ndim; // rank r + uint64_t ndim = output->ndim; // q+r-1 + + if (!is_contiguous(data) || !is_contiguous(indices) || !is_contiguous(output)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (output->dt != F16 && output->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (output->dt != data->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (indices->dt != I32 && indices->dt != I64) { + return STATUS_BAD_TENSOR_DTYPE; + } + // q+r-1 + if (ndim != indices_ndim + data_ndim - 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + // [-r, r-1] + if (axis < -data_ndim || axis > data_ndim - 1) { + return STATUS_BAD_PARAM; + } + axis = (axis + data_ndim) % data_ndim; + + // uint64_t o_data_size = std::accumulate(output->shape, output->shape + output->ndim, 1ULL, std::multiplies()); + + // uint64_t *o_shape = new uint64_t[ndim]; + // std::copy(output->shape, output->shape + ndim, o_shape); + + uint64_t *data_shape = new uint64_t[data->ndim]; + memcpy(data_shape, data->shape, data->ndim * sizeof(uint64_t)); + + uint64_t *indices_shape = new uint64_t[indices->ndim]; + memcpy(indices_shape, indices->shape, indices->ndim * sizeof(uint64_t)); + + uint64_t *data_strides = new uint64_t[ndim]; + memcpy(data_strides, data->strides, data->ndim * sizeof(uint64_t)); + + uint64_t *indices_strides = new uint64_t[ndim]; + memcpy(indices_strides, indices->strides, indices->ndim * sizeof(uint64_t)); + + // uint64_t *o_indices = new uint64_t[ndim]; + // std::fill(o_indices, o_indices + ndim, 0); + + *desc_ptr = new GatherCpuDescriptor{ + DevCpu, + output->dt, + indices->dt, + data_ndim, + indices_ndim, + // o_data_size, + // o_shape, + data_shape, + indices_shape, + data_strides, + indices_strides, + // o_indices, + axis, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc) { + // delete[] desc->o_shape; + delete[] desc->data_shape; + delete[] desc->indices_shape; + delete[] desc->data_strides; + delete[] desc->indices_strides; + // delete[] desc->o_indices; + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t gather_cpu(GatherCpuDescriptor_t desc, + void *output, + void const *data, + void const *indices) { + auto data_ = reinterpret_cast(data); + auto output_ = reinterpret_cast(output); + auto indices_ = reinterpret_cast(indices); + const uint64_t axis = desc->axis; + + uint64_t outer_ = 1; + for (uint64_t i = 0; i < axis; ++i) { + outer_ *= desc->data_shape[i]; + } + uint64_t 
indices_element_count = 1; + for (uint64_t i = 0; i < desc->indices_ndim; ++i) { + indices_element_count *= desc->indices_shape[i]; + } + uint64_t cpy_len = 1; + for (uint64_t i = axis + 1; i < desc->data_ndim; ++i) { + cpy_len *= desc->data_shape[i]; + } + + for (uint64_t outer = 0; outer < outer_; ++outer) { + for (uint64_t idx = 0; idx < indices_element_count; ++idx) { + Tindices index_val = indices_[idx];// bounds [-s, s-1] + index_val = (index_val + desc->data_shape[axis]) % desc->data_shape[axis]; + + const uint64_t data_offset = outer * desc->data_shape[axis] * cpy_len + + index_val * cpy_len; + const uint64_t output_offset = outer * indices_element_count * cpy_len + + idx * cpy_len; + memcpy( + output_ + output_offset, + data_ + data_offset, + sizeof(Tdata) * cpy_len); + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc, + void *output, + void const *data, + void const *indices, + void *stream) { + if (desc->dtype == F16 && desc->indices_dtype == I32) { + return gather_cpu(desc, output, data, indices); + } + if (desc->dtype == F32 && desc->indices_dtype == I32) { + return gather_cpu(desc, output, data, indices); + } + if (desc->dtype == F16 && desc->indices_dtype == I64) { + return gather_cpu(desc, output, data, indices); + } + if (desc->dtype == F32 && desc->indices_dtype == I64) { + return gather_cpu(desc, output, data, indices); + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/gather/cpu/gather_cpu.h b/src/ops/gather/cpu/gather_cpu.h new file mode 100644 index 00000000..7bb5d346 --- /dev/null +++ b/src/ops/gather/cpu/gather_cpu.h @@ -0,0 +1,41 @@ +#ifndef __CPU_GATHER_H__ +#define __CPU_GATHER_H__ + +#include "operators.h" +#include +#include + +struct GatherCpuDescriptor { + Device device; + DT dtype; + DT indices_dtype; + uint64_t data_ndim; + uint64_t indices_ndim; + // uint64_t o_data_size; + // uint64_t const *o_shape; + uint64_t const *data_shape; + uint64_t const *indices_shape; + uint64_t const *data_strides; + uint64_t const *indices_strides; + // uint64_t *o_indices; + int64_t axis; +}; + +typedef struct GatherCpuDescriptor *GatherCpuDescriptor_t; + +infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t, + GatherCpuDescriptor_t *, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int64_t axis); + +infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc, + void *output, + void const *data, + void const *indices, + void *stream); + +infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc); + +#endif diff --git a/src/ops/gather/operator.cc b/src/ops/gather/operator.cc new file mode 100644 index 00000000..a6ddbc03 --- /dev/null +++ b/src/ops/gather/operator.cc @@ -0,0 +1,81 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/gather/gather.h" + +#ifdef ENABLE_CPU +#include "cpu/gather_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#endif +#ifdef ENABLE_MTHREADS_GPU +#endif + +__C infiniopStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int64_t axis) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateGatherDescriptor(handle, (GatherCpuDescriptor_t *) desc_ptr, output, data, indices, axis); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case 
DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void *output, void const *data, void const *indices, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGather((GatherCpuDescriptor_t) desc, output, data, indices, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyGatherDescriptor((GatherCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/reduce/cpu/reduce_cpu.cc b/src/ops/reduce/cpu/reduce_cpu.cc new file mode 100644 index 00000000..4f56aaa0 --- /dev/null +++ b/src/ops/reduce/cpu/reduce_cpu.cc @@ -0,0 +1,428 @@ +#include "reduce_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" +#include + +inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) { + for (int64_t i = ndim - 1; i >= 0; --i) { + if (++indices[i] != shape[i]) { + return; + } + indices[i] = 0; + } +} + +inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) { + return std::inner_product(indices, indices + ndim, strides, uint64_t(0)); +} + +infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, + ReduceCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes, + int reduce_type) { + if (!is_contiguous(reduced) || !is_contiguous(data)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (reduce_type > 2){ + return STATUS_BAD_PARAM; + } + if (reduced->dt != data->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (reduced->dt != F16 && reduced->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t data_ndim = data->ndim; + uint64_t reduced_ndim = data_ndim - axes_ndim; + + uint64_t data_size = std::accumulate(data->shape, data->shape + data->ndim, 1ULL, std::multiplies()); + // + uint64_t axes_size = 1; + uint64_t reduced_size = 1; + uint64_t reduced_size_ = std::accumulate(reduced->shape, reduced->shape + reduced->ndim, 1ULL, std::multiplies()); + + int64_t *reduced_axes; + int64_t *axes_axes; + + int64_t *data_strides = new int64_t[data_ndim]; + memcpy(data_strides, data->strides, data_ndim * sizeof(int64_t)); + int64_t *reduced_strides; + int64_t *axes_strides; + + // axes is empty + if (axes_ndim == 0) { + if (noop_with_empty_axes == false) { + // reduce all axes + axes_ndim = data_ndim; + axes_size = data_size; + axes_axes = new int64_t[axes_ndim]; + for (int i = 0; i < axes_ndim; i ++) { + axes_axes[i] = i; + } + axes_strides = new int64_t[axes_ndim]; + memcpy(axes_strides, data_strides, axes_ndim * sizeof(int64_t)); + + reduced_ndim = 1; + reduced_size = 1; + reduced_axes = nullptr; + reduced_strides = new int64_t[reduced_ndim]; + reduced_strides[0] = 1; + } else { + // input tensor will not be reduced + axes_ndim = 0; + axes_size = 0; + axes_axes = nullptr; + axes_strides = new int64_t[axes_ndim]; + + reduced_ndim = data_ndim; + 
reduced_size = data_size; + reduced_axes = new int64_t[reduced_ndim]; + for (int i = 0; i < reduced_ndim; i ++) { + reduced_axes[i] = i; + } + reduced_strides = new int64_t[reduced_ndim]; + memcpy(reduced_strides, data_strides, reduced_ndim * sizeof(int64_t)); + } + } else { + // if (axes != nullptr && axes_ndim > 0) + if (keepdims == true) { + // keep the original rank (reduced dims stay as size 1) + // reduced_ndim = reduced->ndim; + } else { + // drop the reduced dimensions from the output rank + reduced_ndim = data_ndim - axes_ndim; + } + // reduced_shape = new uint64_t[reduced_ndim]; + + axes_size = 1; + reduced_size = 1; + std::vector axes_vec, reduced_axes_vec; + for(uint64_t i = 0; i < data_ndim; i++) { + bool is_axis = false; + for(uint64_t j = 0; j < axes_ndim; j++) { + // printf("\t\tdata-dim %lu, axes-dim %lu, axes is %lu\n", i, j, axes[j]); + if (axes[j] == i) { + is_axis = true; + break; + } + } + + if (is_axis) { + axes_size *= data->shape[i]; + axes_vec.emplace_back(i); + if (keepdims == true) { + // reduced_axes_vec.emplace_back(i); + } + } else { + reduced_size *= data->shape[i]; + // printf("\t\tdim %lu, data_shape is %lu, reduced_size now is %lu\n", i, data->shape[i], reduced_size); + reduced_axes_vec.emplace_back(i); + } + } + // ? + // sort(axes_.begin(), axes_.end()); + // sort(reduced_axes_.begin(), reduced_axes_.end()); + axes_axes = new int64_t[axes_ndim]; + for (int i = 0; i < axes_ndim; i ++) { + axes_axes[i] = axes_vec[i]; + } + reduced_axes = new int64_t[reduced_ndim]; + for (int i = 0; i < reduced_ndim; i ++) { + reduced_axes[i] = reduced_axes_vec[i]; + } + + // + axes_strides = new int64_t[axes_ndim]; + for(uint64_t i = 0; i < axes_ndim; i++) { + axes_strides[i] = 1; + for(uint64_t j = i + 1; j < axes_ndim; j++) { + axes_strides[i] *= data->shape[axes[j]]; + } + } + + // + reduced_strides = new int64_t[reduced_ndim]; + for(uint64_t i = 0; i < reduced_ndim; i++) { + reduced_strides[i] = 1; + for (uint64_t j = i + 1; j < reduced_ndim; j++) { + uint64_t shape = data->shape[reduced_axes_vec[j]]; + if (keepdims == true) { + // shape = 1; + } + reduced_strides[i] *= shape; + } + } + } + // printf("\tdata_ndim: %lu, reduced_ndim: %lu\n", data_ndim, reduced_ndim); + assert(reduced_size == reduced_size_); + + *desc_ptr = new ReduceCpuDescriptor{ + DevCpu, + reduced->dt, + + reduced_ndim, + data_ndim, + axes_ndim, + + reduced_size, + data_size, + axes_size, + + reduced_axes, + axes_axes, + + reduced_strides, + data_strides, + axes_strides, + + keepdims, + reduce_type + }; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc) { + // printf("\t--cpuDestroyReduceDescriptor\n"); + // for (int i = 0; i < desc->reduced_ndim; i ++) + // printf("\treduced_axes[%d]=%lu\n", i, desc->reduced_axes[i]); + + delete[] (desc->reduced_axes); + delete[] (desc->axes); + delete[] (desc->reduced_strides); + delete[] (desc->data_strides); + delete[] (desc->axes_strides); + delete desc; + // printf("\t--delete desc\n"); + return STATUS_SUCCESS; +} + +template +infiniopStatus_t reduce_cpu(ReduceCpuDescriptor_t desc, + void *reduced, + void const *data) { + auto reduced_ = reinterpret_cast(reduced); + auto data_ = reinterpret_cast(data); + + auto reduced_ndim_ = desc->reduced_ndim; + auto data_ndim_ = desc->data_ndim; + auto axes_ndim_ = desc->axes_ndim; + + auto reduced_size_ = desc->reduced_size; + auto data_size_ = desc->data_size; + auto axes_size_ = desc->axes_size; + // printf("\tdata_size: %lu, reduced_size: %lu, axes_size: %lu\n", data_size_, reduced_size_, axes_size_); + + auto reduced_axes_ = desc->reduced_axes; + auto axes_
= desc->axes; + + auto reduced_strides_ = desc->reduced_strides; + auto data_strides_ = desc->data_strides; + auto axes_strides_ = desc->axes_strides; + + auto reduce_mode_ = desc->reduce_mode; + + // input tensor will not be reduced + if (axes_ == nullptr) { + for (uint64_t i = 0; i < data_size_; i++) { + reduced_[i] = data_[i]; + } + return STATUS_SUCCESS; + } + + // reduce all axes + if (reduced_axes_ == nullptr) { + float result; + // intitialzation + switch (reduce_mode_) { + case 0: { // ReduceMin + result = std::numeric_limits::max(); + } + break; + case 1: { // ReduceMax + result = std::numeric_limits::lowest(); + } + break; + case 2: { // ReduceMean + result = 0; + } + break; + } + for (int i = 0; i < data_size_; i++) { + if constexpr (std::is_same::value) { + switch (reduce_mode_) { + case 0: { // ReduceMin + result = std::min(result, f16_to_f32(data_[i])); + } + break; + case 1: { // ReduceMax + result = std::max(result, f16_to_f32(data_[i])); + } + break; + case 2: { // ReduceMean + result += f16_to_f32(data_[i]); + } + break; + } + } else { + switch (reduce_mode_) { + case 0: { // ReduceMin + result = std::min(result, data_[i]); + } + break; + case 1: { // ReduceMax + result = std::max(result, data_[i]); + } + break; + case 2: { // ReduceMean + result += data_[i]; + } + break; + } + } + } + reduced_[0] = result; + return STATUS_SUCCESS; + } + + // printf("reduce_cpu handling\n"); +// #pragma omp parallel for + for(uint64_t i = 0; i < reduced_size_; i++) { + float result; + // intitialzation + switch (reduce_mode_) { + case 0: { // ReduceMin + result = std::numeric_limits::max(); + } + break; + case 1: { // ReduceMax + result = std::numeric_limits::lowest(); + } + break; + case 2: { // ReduceMean + result = 0; + } + break; + } + + uint64_t idx = 0; + uint64_t temp_i = i; + for(uint64_t j = 0; j < reduced_ndim_; j++) { + idx += temp_i / reduced_strides_[j] * data_strides_[reduced_axes_[j]]; + temp_i %= reduced_strides_[j]; + } + for(uint64_t j = 0; j < axes_size_; j++) { + uint64_t data_idx_ = idx; + uint64_t temp_j = j; + for(uint64_t k = 0; k < axes_ndim_; k++) { + data_idx_ += temp_j / axes_strides_[k] * data_strides_[axes_[k]]; + temp_j %= axes_strides_[k]; + } + + if constexpr (std::is_same::value) { + switch (reduce_mode_) { + case 0: { // ReduceMin + result = std::min(result, f16_to_f32(data_[data_idx_])); + } + break; + case 1: { // ReduceMax + result = std::max(result, f16_to_f32(data_[data_idx_])); + } + break; + case 2: { // ReduceMean + result += f16_to_f32(data_[data_idx_]); + } + break; + } + } else { + switch (reduce_mode_) { + case 0: { // ReduceMin + result = std::min(result, data_[data_idx_]); + } + break; + case 1: { // ReduceMax + result = std::max(result, data_[data_idx_]); + } + break; + case 2: { // ReduceMean + result += data_[data_idx_]; + } + break; + } + } + } + + // final + if constexpr (std::is_same::value) { + switch (reduce_mode_) { + case 0: { // ReduceMin + reduced_[i] = f32_to_f16(result); + } + break; + case 1: { // ReduceMax + reduced_[i] = f32_to_f16(result); + } + break; + case 2: { // ReduceMean + reduced_[i] = f32_to_f16(result / axes_size_); + } + break; + } + } else { + switch (reduce_mode_) { + case 0: { // ReduceMin + reduced_[i] = result; + } + break; + case 1: { // ReduceMax + reduced_[i] = result; + } + break; + case 2: { // ReduceMean + reduced_[i] = result / axes_size_; + } + break; + } + } + } + + if (desc->keepdims == true && false) { + auto new_reduced_strides = new int64_t[reduced_ndim_]; + uint64_t j = 0; + for(uint64_t i = 
0, j = 0; i < data_ndim_; i++) { + new_reduced_strides[i] = 1; + + for (; j < reduced_ndim_; j++) { + if (i == reduced_axes_[j]) { + new_reduced_strides[i] = reduced_strides_[j]; + break; + } + } + } + + desc->reduced_ndim = data_ndim_; + delete[] desc->reduced_strides; + printf("\tdelete reduced_strides\n"); + desc->reduced_strides = new_reduced_strides; + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, + void *reduced, + void const *data, + void *stream) { + if (desc->dt == F16) { + return reduce_cpu(desc, reduced, data); + } + if (desc->dt == F32) { + return reduce_cpu(desc, reduced, data); + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/reduce/cpu/reduce_cpu.h b/src/ops/reduce/cpu/reduce_cpu.h new file mode 100644 index 00000000..2dd13327 --- /dev/null +++ b/src/ops/reduce/cpu/reduce_cpu.h @@ -0,0 +1,55 @@ +#ifndef __CPU_REDUCE_H__ +#define __CPU_REDUCE_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "operators.h" +#include +#include +#include +#include +#include + +struct ReduceCpuDescriptor { + Device device; + DataLayout dt; + + uint64_t reduced_ndim; + uint64_t data_ndim; + uint64_t axes_ndim; + + uint64_t reduced_size; + uint64_t data_size; + uint64_t axes_size; + + int64_t const *reduced_axes; + int64_t const *axes; + + int64_t const *reduced_strides; + int64_t const *data_strides; + int64_t const *axes_strides; + + // int reduce_element_num; + bool keepdims; + int reduce_mode; +}; + +typedef struct ReduceCpuDescriptor *ReduceCpuDescriptor_t; + +infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, + ReduceCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes, + int reduce_type); + +infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, + void *reduced, + void const *data, + void *stream); + +infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc); + +#endif diff --git a/src/ops/reduce/operator.cc b/src/ops/reduce/operator.cc new file mode 100644 index 00000000..8b3333fd --- /dev/null +++ b/src/ops/reduce/operator.cc @@ -0,0 +1,105 @@ +#include "../utils.h" +#include "operators.h" +#include "reduce.h" + +#ifdef ENABLE_CPU +#include "cpu/reduce_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#endif +#ifdef ENABLE_MTHREADS_GPU +#endif + +__C infiniopStatus_t infiniopCreateReduceDescriptor(infiniopHandle_t handle, + infiniopReduceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes, + int reduce_type) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateReduceDescriptor(handle, (ReduceCpuDescriptor_t *) desc_ptr, + reduced, data, axes, axes_ndim, keepdims, noop_with_empty_axes, reduce_type); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGetReduceWorkspaceSize(infiniopReduceDescriptor_t desc, uint64_t *size) { + // std::cout << desc->device << std::endl; + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: { + return STATUS_SUCCESS; + } +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif + default: + return STATUS_BAD_DEVICE; + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t 
infiniopReduce(infiniopReduceDescriptor_t desc, + void *reduced, + void const *data, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuReduce((ReduceCpuDescriptor_t) desc, reduced, data, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyReduceDescriptor((ReduceCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/reduce/reduce.h b/src/ops/reduce/reduce.h new file mode 100644 index 00000000..62307127 --- /dev/null +++ b/src/ops/reduce/reduce.h @@ -0,0 +1,30 @@ +#ifndef REDUCE_H +#define REDUCE_H + +#include "export.h" +#include "operators.h" + +typedef struct ReduceDescriptor { + Device device; +} ReduceDescriptor; +typedef ReduceDescriptor *infiniopReduceDescriptor_t; + +__C infiniopStatus_t infiniopCreateReduceDescriptor(infiniopHandle_t handle, + infiniopReduceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes, + int reduce_type); + +__C infiniopStatus_t infiniopGetReduceWorkspaceSize(infiniopReduceDescriptor_t desc, uint64_t *size); + +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, + void *reduced, + void const *data, + void *stream); + +__C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc); +#endif diff --git a/src/ops/reduce_max/operator.cc b/src/ops/reduce_max/operator.cc new file mode 100644 index 00000000..91798ab5 --- /dev/null +++ b/src/ops/reduce_max/operator.cc @@ -0,0 +1,60 @@ +#include "../reduce/reduce.h" +#include "../utils.h" +#include "ops/reduce_max/reduce_max.h" + +struct _ReduceMaxDescriptor { + Device device; + infiniopReduceDescriptor_t reduce_desc; + uint64_t workspace_size; +}; + +typedef struct _ReduceMaxDescriptor *_ReduceMaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMaxDescriptor(infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes) { + infiniopReduceDescriptor_t reduce_desc; + // case 0: { // ReduceMin + // case 1: { // ReduceMax + // case 2: { // ReduceMean + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, reduced, data, axes, axes_ndim, + keepdims, noop_with_empty_axes, 1), STATUS_SUCCESS); + + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS); + + *(_ReduceMaxDescriptor_t *) desc_ptr = new _ReduceMaxDescriptor{ + handle->device, + reduce_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, uint64_t *size) { + *size = ((_ReduceMaxDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopReduceMax(infiniopReduceMaxDescriptor_t desc, void *workspace, uint64_t workspace_size, + void 
*reduced, void const *data, void *stream) { + auto _desc = (_ReduceMaxDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopReduce(_desc->reduce_desc, reduced, data, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMaxDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/reduce_mean/operator.cc b/src/ops/reduce_mean/operator.cc new file mode 100644 index 00000000..31460382 --- /dev/null +++ b/src/ops/reduce_mean/operator.cc @@ -0,0 +1,59 @@ +#include "../reduce/reduce.h" +#include "../utils.h" +#include "ops/reduce_mean/reduce_mean.h" + +struct _ReduceMeanDescriptor { + Device device; + infiniopReduceDescriptor_t reduce_desc; + uint64_t workspace_size; +}; + +typedef struct _ReduceMeanDescriptor *_ReduceMeanDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMeanDescriptor(infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes) { + infiniopReduceDescriptor_t reduce_desc; + + // case 0: { // ReduceMin + // case 1: { // ReduceMax + // case 2: { // ReduceMean + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, reduced, data, axes, axes_ndim, + keepdims, noop_with_empty_axes, 2), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS); + + *(_ReduceMeanDescriptor_t *) desc_ptr = new _ReduceMeanDescriptor{ + handle->device, + reduce_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, uint64_t *size) { + *size = ((_ReduceMeanDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopReduceMean(infiniopReduceMeanDescriptor_t desc, void *workspace, uint64_t workspace_size, void *reduced, void const *data, void *stream) { + auto _desc = (_ReduceMeanDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopReduce(_desc->reduce_desc, reduced, data, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMeanDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/reduce_min/operator.cc b/src/ops/reduce_min/operator.cc new file mode 100644 index 00000000..b3d1303e --- /dev/null +++ b/src/ops/reduce_min/operator.cc @@ -0,0 +1,59 @@ +#include "../reduce/reduce.h" +#include "../utils.h" +#include "ops/reduce_min/reduce_min.h" + +struct _ReduceMinDescriptor { + Device device; + infiniopReduceDescriptor_t reduce_desc; + uint64_t workspace_size; +}; + +typedef struct _ReduceMinDescriptor *_ReduceMinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMinDescriptor(infiniopHandle_t handle, + infiniopReduceMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + 
bool noop_with_empty_axes) { + infiniopReduceDescriptor_t reduce_desc; + + // case 0: { // ReduceMin + // case 1: { // ReduceMax + // case 2: { // ReduceMean + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, reduced, data, axes, axes_ndim, + keepdims, noop_with_empty_axes, 0), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS); + + *(_ReduceMinDescriptor_t *) desc_ptr = new _ReduceMinDescriptor{ + handle->device, + reduce_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetReduceMinWorkspaceSize(infiniopReduceMinDescriptor_t desc, uint64_t *size) { + *size = ((_ReduceMinDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopReduceMin(infiniopReduceMinDescriptor_t desc, void *workspace, uint64_t workspace_size, void *reduced, void const *data, void *stream) { + auto _desc = (_ReduceMinDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopReduce(_desc->reduce_desc, reduced, data, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyReduceMinDescriptor(infiniopReduceMinDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMinDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/where/cpu/where_cpu.cc b/src/ops/where/cpu/where_cpu.cc new file mode 100644 index 00000000..5da759a4 --- /dev/null +++ b/src/ops/where/cpu/where_cpu.cc @@ -0,0 +1,121 @@ +#include "where_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" +#include + +inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) { + for (int64_t i = ndim - 1; i >= 0; --i) { + if (++indices[i] != shape[i]) { + return; + } + indices[i] = 0; + } +} + +inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) { + return std::inner_product(indices, indices + ndim, strides, uint64_t(0)); +} + +infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t, + WhereCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t condition, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y) { + uint64_t ndim = output->ndim; + + if (!isValidBroadcastShape(x, y, output)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(x) || !is_contiguous(y) || !is_contiguous(output)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (output->dt != F16 && output->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (output->dt != x->dt || output->dt != y->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (condition->dt != U8) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t o_data_size = std::accumulate( + output->shape, output->shape + output->ndim, 1ULL, std::multiplies()); + + uint64_t *o_shape = new uint64_t[ndim]; + std::copy(output->shape, output->shape + ndim, o_shape); + + uint64_t *x_strides = new uint64_t[ndim]; + uint64_t *y_strides = new uint64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + x_strides[i] = (i < ndim - x->ndim || output->shape[i] != x->shape[i + x->ndim - ndim]) ? 0 : x->strides[i + x->ndim - ndim]; + y_strides[i] = (i < ndim - y->ndim || output->shape[i] != y->shape[i + y->ndim - ndim]) ? 
0 : y->strides[i + y->ndim - ndim]; + } + + uint64_t *o_indices = new uint64_t[ndim]; + std::fill(o_indices, o_indices + ndim, 0); + + *desc_ptr = new WhereCpuDescriptor{ + DevCpu, + output->dt, + ndim, + o_data_size, + o_shape, + x_strides, + y_strides, + o_indices, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc) { + delete[] desc->o_shape; + delete[] desc->x_strides; + delete[] desc->y_strides; + delete[] desc->o_indices; + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t where_cpu(WhereCpuDescriptor_t desc, + void *output, + void const *condition, + void const *x, + void const *y) { + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + auto condition_ = reinterpret_cast(condition); + + auto o_ = reinterpret_cast(output); + const auto &o_indices = desc->o_indices; + + for (uint64_t i = 0; i < desc->o_data_size; ++i, incrementOne(o_indices, desc->o_shape, desc->ndim)) { + auto x_index = compactToFlat(o_indices, desc->x_strides, desc->ndim); + auto y_index = compactToFlat(o_indices, desc->y_strides, desc->ndim); + + if (condition_[i]) { + o_[i] = x_[x_index]; + } else { + o_[i] = y_[y_index]; + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc, + void *output, + void const *condition, + void const *x, + void const *y, + void *stream) { + if (desc->dtype == F16) { + return where_cpu(desc, output, condition, x, y); + } + if (desc->dtype == F32) { + return where_cpu(desc, output, condition, x, y); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/where/cpu/where_cpu.h b/src/ops/where/cpu/where_cpu.h new file mode 100644 index 00000000..9c9b66aa --- /dev/null +++ b/src/ops/where/cpu/where_cpu.h @@ -0,0 +1,37 @@ +#ifndef __CPU_WHERE_H__ +#define __CPU_WHERE_H__ + +#include "operators.h" +#include +#include + +struct WhereCpuDescriptor { + Device device; + DT dtype; + uint64_t ndim; + uint64_t o_data_size; + uint64_t const *o_shape; + uint64_t const *x_strides; + uint64_t const *y_strides; + uint64_t *o_indices; +}; + +typedef struct WhereCpuDescriptor *WhereCpuDescriptor_t; + +infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t, + WhereCpuDescriptor_t *, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t condition, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y); + +infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc, + void *output, + void const *condition, + void const *x, + void const *y, + void *stream); + +infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc); + +#endif diff --git a/src/ops/where/operator.cc b/src/ops/where/operator.cc new file mode 100644 index 00000000..7bdfbbd1 --- /dev/null +++ b/src/ops/where/operator.cc @@ -0,0 +1,82 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/where/where.h" + +#ifdef ENABLE_CPU +#include "cpu/where_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#endif + +__C infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t condition, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateWhereDescriptor(handle, (WhereCpuDescriptor_t *) desc_ptr, output, condition, x, y); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } 
+ return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *output, + void const *condition, + void const *x, + void const *y, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuWhere((WhereCpuDescriptor_t) desc, output, condition, x, y, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyWhereDescriptor((WhereCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} From df485242a2c2fe750812344f6d2230ae13317959 Mon Sep 17 00:00:00 2001 From: stever178 <2874146120@qq.com> Date: Fri, 21 Mar 2025 15:25:09 +0800 Subject: [PATCH 2/5] modify ops: reduce --- operatorspy/tests/clip.py | 2 +- operatorspy/tests/gather.py | 7 +++---- operatorspy/tests/reduce_max.py | 12 +++++++++--- operatorspy/tests/reduce_mean.py | 14 ++++++++++---- operatorspy/tests/reduce_min.py | 12 ++++++++---- operatorspy/tests/where.py | 2 +- src/ops/gather/cpu/gather_cpu.cc | 3 ++- src/ops/gather/operator.cc | 13 ++++++------- src/ops/reduce/cpu/reduce_cpu.cc | 7 ++++++- 9 files changed, 46 insertions(+), 26 deletions(-) diff --git a/operatorspy/tests/clip.py b/operatorspy/tests/clip.py index 10a1ee6f..b0ae8e5c 100644 --- a/operatorspy/tests/clip.py +++ b/operatorspy/tests/clip.py @@ -95,7 +95,7 @@ def test( None) ) # print(f" min:{min_value}, max:{max_value}, input:{input}, ans:{ans}, output:{output},") - assert torch.allclose(output, ans, atol=0, rtol=1e-3) + assert torch.allclose(output, ans, atol=0, rtol=0) check_error(lib.infiniopDestroyClipDescriptor(descriptor)) diff --git a/operatorspy/tests/gather.py b/operatorspy/tests/gather.py index 118ac342..0f8179a8 100644 --- a/operatorspy/tests/gather.py +++ b/operatorspy/tests/gather.py @@ -101,7 +101,7 @@ def test( indices_tensor.data, None) ) - assert torch.allclose(output, ans, atol=0, rtol=1e-3) + assert torch.allclose(output, ans, atol=0, rtol=0) check_error(lib.infiniopDestroyGatherDescriptor(descriptor)) @@ -127,11 +127,10 @@ def test_cpu(lib, test_cases): ((3, 3), (2, ), 0, Inplace.OUT_OF_PLACE), ((4, 3, 4), (2, 2), 1, Inplace.OUT_OF_PLACE), ((5, 3, 4, 5), (3, 2), 2, Inplace.OUT_OF_PLACE), + ((32, 20, 512), (2, 16), 0, Inplace.OUT_OF_PLACE), + # ((32, 256, 112, 112), (, 112, 1), 0, Inplace.OUT_OF_PLACE), # ((), (), (), Inplace.OUT_OF_PLACE), - # ((32, 20, 512), (32, 20, 512), 0, Inplace.OUT_OF_PLACE), - # ((3, 2, 4, 5), (4, 5), 0, Inplace.OUT_OF_PLACE), - # ((32, 256, 112, 112), (32, 256, 112, 1), 0, Inplace.OUT_OF_PLACE), ] args = get_args() diff --git a/operatorspy/tests/reduce_max.py b/operatorspy/tests/reduce_max.py index 38da1bce..50e43def 100644 --- a/operatorspy/tests/reduce_max.py +++ b/operatorspy/tests/reduce_max.py @@ -143,7 +143,10 @@ def test( print(f" lib time: {elapsed :6f}") # print(f" reduced: {reduced}\n ans: {ans}") - assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) + if tensor_dtype == torch.float16: + assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) + elif tensor_dtype == torch.float32: + assert torch.allclose(reduced, ans, atol=0, rtol=1e-5) 
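+        # float16 only carries ~3 significant decimal digits (float32 ~7), hence the looser rtol for the fp16 case above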
check_error(lib.infiniopDestroyReduceMaxDescriptor(descriptor)) # print("!!!!") @@ -161,17 +164,20 @@ def test_cpu(lib, test_cases): test_cases = [ # data_shape, axes, keepdims, noop_with_empty_axes + ((2, 3, 2, 5), (1, 3), True, False), + ((2, 3, 2, 5), (-2, -1), False, False), + # ((2, 3, 2, 5), (-3, -2), True, False), #Floating point exception + # ((3, 2, 5, 4), (-4, -3, -2, -1), True, False), + ((1, 3), (), True, True), ((1, 3), (), False, True), ((2, 3), (0,), True, False), ((32, 20, 512), (2,), True, False), - ((2, 3, 4, 5), (1, 3), True, False), ((3, 2, 5, 4), (0, 1, 2, 3), True, False), ((32, 56, 112, 112), (0, 1, 2), True, False), ((2, 3, 5), (0,), False, False), - ((2, 3, 4, 5), (1, 3), False, False), ((32, 20, 512), (1, 2), False, False), ((3, 2, 5, 4), (0, 1, 2, 3), False, False), ((64, 64, 64, 64), (0, 2, 3), False, False), diff --git a/operatorspy/tests/reduce_mean.py b/operatorspy/tests/reduce_mean.py index c3a714fd..8844ac0e 100644 --- a/operatorspy/tests/reduce_mean.py +++ b/operatorspy/tests/reduce_mean.py @@ -142,8 +142,11 @@ def test( elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: {elapsed :6f}") - # print(f"reduced: {reduced}\nans: {ans}") - assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) + if tensor_dtype == torch.float16: + assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) + elif tensor_dtype == torch.float32: + # assert torch.allclose(reduced, ans, atol=0, rtol=1e-5) + assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyReduceMeanDescriptor(descriptor)) # print("!!!!") @@ -161,17 +164,20 @@ def test_cpu(lib, test_cases): test_cases = [ # data_shape, axes, keepdims, noop_with_empty_axes + ((2, 3, 2, 5), (1, 3), True, False), + ((2, 3, 2, 5), (-2, -1), False, False), + ((3, 2, 5, 4), (-3, -2, -1), True, False), #Floating point exception + # ((3, 2, 5, 4), (-4, -3, -2, -1), True, False), #AssertionError + ((1, 3), (), True, True), ((1, 3), (), False, True), ((2, 3), (0,), True, False), ((32, 20, 512), (2,), True, False), - ((2, 3, 4, 5), (1, 3), True, False), ((3, 2, 5, 4), (0, 1, 2, 3), True, False), ((32, 56, 112, 112), (0, 1, 2), True, False), ((2, 3, 5), (0,), False, False), - ((2, 3, 4, 5), (1, 3), False, False), ((32, 20, 512), (1, 2), False, False), ((3, 2, 5, 4), (0, 1, 2, 3), False, False), ((64, 64, 64, 64), (0, 2, 3), False, False), diff --git a/operatorspy/tests/reduce_min.py b/operatorspy/tests/reduce_min.py index 2db240d0..a7c08a11 100644 --- a/operatorspy/tests/reduce_min.py +++ b/operatorspy/tests/reduce_min.py @@ -142,8 +142,10 @@ def test( elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: {elapsed :6f}") - # print(f"reduced: {reduced}\nans: {ans}") - assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) + if tensor_dtype == torch.float16: + assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) + elif tensor_dtype == torch.float32: + assert torch.allclose(reduced, ans, atol=0, rtol=1e-5) check_error(lib.infiniopDestroyReduceMinDescriptor(descriptor)) # print("!!!!") @@ -161,17 +163,19 @@ def test_cpu(lib, test_cases): test_cases = [ # data_shape, axes, keepdims, noop_with_empty_axes + ((2, 3, 2, 5), (1, 3), True, False), + ((2, 3, 2, 5), (-2, -1), False, False), + ((3, 2, 5, 4), (-3, -2, -1), True, False), + ((1, 3), (), True, True), ((1, 3), (), False, True), ((2, 3), (0,), True, False), ((32, 20, 512), (2,), True, False), - ((2, 3, 4, 5), (1, 3), True, False), ((3, 2, 5, 4), (0, 1, 2, 3), True, False), ((32, 56, 112, 112), (0, 1, 2), True, False), ((2, 3, 
5), (0,), False, False), - ((2, 3, 4, 5), (1, 3), False, False), ((32, 20, 512), (1, 2), False, False), ((3, 2, 5, 4), (0, 1, 2, 3), False, False), ((64, 64, 64, 64), (0, 2, 3), False, False), diff --git a/operatorspy/tests/where.py b/operatorspy/tests/where.py index 46dbacc8..7088620f 100644 --- a/operatorspy/tests/where.py +++ b/operatorspy/tests/where.py @@ -97,7 +97,7 @@ def test( ) # print(f"--ans:{ans}, output:{output}") - assert torch.allclose(output, ans, atol=0, rtol=1e-3) + assert torch.allclose(output, ans, atol=0, rtol=0) check_error(lib.infiniopDestroyWhereDescriptor(descriptor)) diff --git a/src/ops/gather/cpu/gather_cpu.cc b/src/ops/gather/cpu/gather_cpu.cc index 8e276235..d391da74 100644 --- a/src/ops/gather/cpu/gather_cpu.cc +++ b/src/ops/gather/cpu/gather_cpu.cc @@ -122,7 +122,8 @@ infiniopStatus_t gather_cpu(GatherCpuDescriptor_t desc, for (uint64_t outer = 0; outer < outer_; ++outer) { for (uint64_t idx = 0; idx < indices_element_count; ++idx) { - Tindices index_val = indices_[idx];// bounds [-s, s-1] + Tindices index_val = indices_[idx]; + // bounds [-s, s-1] index_val = (index_val + desc->data_shape[axis]) % desc->data_shape[axis]; const uint64_t data_offset = outer * desc->data_shape[axis] * cpy_len + diff --git a/src/ops/gather/operator.cc b/src/ops/gather/operator.cc index a6ddbc03..6b77a3d3 100644 --- a/src/ops/gather/operator.cc +++ b/src/ops/gather/operator.cc @@ -10,13 +10,12 @@ #ifdef ENABLE_MTHREADS_GPU #endif -__C infiniopStatus_t infiniopCreateGatherDescriptor( - infiniopHandle_t handle, - infiniopGatherDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t data, - infiniopTensorDescriptor_t indices, - int64_t axis) { +__C infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int64_t axis) { switch (handle->device) { #ifdef ENABLE_CPU case DevCpu: diff --git a/src/ops/reduce/cpu/reduce_cpu.cc b/src/ops/reduce/cpu/reduce_cpu.cc index 4f56aaa0..476e8d12 100644 --- a/src/ops/reduce/cpu/reduce_cpu.cc +++ b/src/ops/reduce/cpu/reduce_cpu.cc @@ -107,7 +107,8 @@ infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, bool is_axis = false; for(uint64_t j = 0; j < axes_ndim; j++) { // printf("\t\tdata-dim %lu, axes-dim %lu, axes is %lu\n", i, j, axes[j]); - if (axes[j] == i) { + uint64_t axe = (axes[j] + data_ndim) % data_ndim; + if (axe == i) { is_axis = true; break; } @@ -128,13 +129,17 @@ infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, // ? 
// sort(axes_.begin(), axes_.end()); // sort(reduced_axes_.begin(), reduced_axes_.end()); + + // [-r,r-1], r = rank(data) axes_axes = new int64_t[axes_ndim]; for (int i = 0; i < axes_ndim; i ++) { axes_axes[i] = axes_vec[i]; + // axes_axes[i] = (axes_vec[i] + data_ndim) % data_ndim; } reduced_axes = new int64_t[reduced_ndim]; for (int i = 0; i < reduced_ndim; i ++) { reduced_axes[i] = reduced_axes_vec[i]; + // reduced_axes[i] = (reduced_axes_vec[i] + data_ndim) % data_ndim; } // From 1a12a179e592b741c5dbfaf908e658567d14fdd8 Mon Sep 17 00:00:00 2001 From: stever178 <2874146120@qq.com> Date: Fri, 21 Mar 2025 16:08:37 +0800 Subject: [PATCH 3/5] little change --- include/infini_operators.h | 6 ++++++ src/ops/reduce/cpu/reduce_cpu.cc | 34 ++++++++++++++++++-------------- src/ops/reduce/cpu/reduce_cpu.h | 6 +++--- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/include/infini_operators.h b/include/infini_operators.h index 9a5a2555..0ebeafc4 100644 --- a/include/infini_operators.h +++ b/include/infini_operators.h @@ -3,8 +3,10 @@ #include "ops/attention/attention.h" #include "ops/avg_pool/avg_pool.h" #include "ops/causal_softmax/causal_softmax.h" +#include "ops/clip/clip.h" #include "ops/global_avg_pool/global_avg_pool.h" #include "ops/expand/expand.h" +#include "ops/gather/gather.h" #include "ops/gemm/gemm.h" #include "ops/conv/conv.h" #include "ops/matmul/matmul.h" @@ -12,8 +14,12 @@ #include "ops/mlp/mlp.h" #include "ops/random_sample/random_sample.h" #include "ops/rearrange/rearrange.h" +#include "ops/reduce_max/reduce_max.h" +#include "ops/reduce_mean/reduce_mean.h" +#include "ops/reduce_min/reduce_min.h" #include "ops/relu/relu.h" #include "ops/rms_norm/rms_norm.h" #include "ops/rotary_embedding/rotary_embedding.h" #include "ops/swiglu/swiglu.h" +#include "ops/where/where.h" #include "tensor/tensor_descriptor.h" diff --git a/src/ops/reduce/cpu/reduce_cpu.cc b/src/ops/reduce/cpu/reduce_cpu.cc index 476e8d12..eeaae862 100644 --- a/src/ops/reduce/cpu/reduce_cpu.cc +++ b/src/ops/reduce/cpu/reduce_cpu.cc @@ -53,7 +53,7 @@ infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, int64_t *data_strides = new int64_t[data_ndim]; memcpy(data_strides, data->strides, data_ndim * sizeof(int64_t)); int64_t *reduced_strides; - int64_t *axes_strides; + int64_t *anti_out_strides; // axes is empty if (axes_ndim == 0) { @@ -65,8 +65,8 @@ infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, for (int i = 0; i < axes_ndim; i ++) { axes_axes[i] = i; } - axes_strides = new int64_t[axes_ndim]; - memcpy(axes_strides, data_strides, axes_ndim * sizeof(int64_t)); + anti_out_strides = new int64_t[axes_ndim]; + memcpy(anti_out_strides, data_strides, axes_ndim * sizeof(int64_t)); reduced_ndim = 1; reduced_size = 1; @@ -78,7 +78,7 @@ infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, axes_ndim = 0; axes_size = 0; axes_axes = nullptr; - axes_strides = new int64_t[axes_ndim]; + anti_out_strides = new int64_t[axes_ndim]; reduced_ndim = data_ndim; reduced_size = data_size; @@ -143,11 +143,11 @@ infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, } // - axes_strides = new int64_t[axes_ndim]; + anti_out_strides = new int64_t[axes_ndim]; for(uint64_t i = 0; i < axes_ndim; i++) { - axes_strides[i] = 1; + anti_out_strides[i] = 1; for(uint64_t j = i + 1; j < axes_ndim; j++) { - axes_strides[i] *= data->shape[axes[j]]; + anti_out_strides[i] *= data->shape[axes[j]]; } } @@ -182,9 +182,9 @@ infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, 
reduced_axes, + axes_axes, - reduced_strides, + data_strides, - axes_strides, + reduced_strides, + anti_out_strides, keepdims, reduce_type @@ -201,7 +201,7 @@ infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc) { delete[] (desc->axes); delete[] (desc->reduced_strides); delete[] (desc->data_strides); - delete[] (desc->axes_strides); + delete[] (desc->anti_out_strides); delete desc; // printf("\t--delete desc\n"); return STATUS_SUCCESS; @@ -226,9 +226,9 @@ infiniopStatus_t reduce_cpu(ReduceCpuDescriptor_t desc, auto reduced_axes_ = desc->reduced_axes; auto axes_ = desc->axes; - auto reduced_strides_ = desc->reduced_strides; auto data_strides_ = desc->data_strides; - auto axes_strides_ = desc->axes_strides; + auto reduced_strides_ = desc->reduced_strides; + auto anti_out_strides_ = desc->anti_out_strides; auto reduce_mode_ = desc->reduce_mode; @@ -318,15 +318,19 @@ infiniopStatus_t reduce_cpu(ReduceCpuDescriptor_t desc, uint64_t idx = 0; uint64_t temp_i = i; for(uint64_t j = 0; j < reduced_ndim_; j++) { + // map the reduced tensor's multi-dimensional index to a linear index into data idx += temp_i / reduced_strides_[j] * data_strides_[reduced_axes_[j]]; temp_i %= reduced_strides_[j]; } for(uint64_t j = 0; j < axes_size_; j++) { uint64_t data_idx_ = idx; uint64_t temp_j = j; - for(uint64_t k = 0; k < axes_ndim_; k++) { - data_idx_ += temp_j / axes_strides_[k] * data_strides_[axes_[k]]; - temp_j %= axes_strides_[k]; + + // reduced_ndim + axes_ndim = data_ndim + for (uint64_t k = 0; k < axes_ndim_; k++) { + // continue the mapping along the reduced axes + data_idx_ += temp_j / anti_out_strides_[k] * data_strides_[axes_[k]]; + temp_j %= anti_out_strides_[k]; } if constexpr (std::is_same::value) { diff --git a/src/ops/reduce/cpu/reduce_cpu.h b/src/ops/reduce/cpu/reduce_cpu.h index 2dd13327..01ff5cce 100644 --- a/src/ops/reduce/cpu/reduce_cpu.h +++ b/src/ops/reduce/cpu/reduce_cpu.h @@ -21,12 +21,12 @@ struct ReduceCpuDescriptor { uint64_t data_size; uint64_t axes_size; - int64_t const *reduced_axes; + int64_t const *reduced_axes; // dimensions of data that are kept (not reduced) int64_t const *axes; - int64_t const *reduced_strides; int64_t const *data_strides; - int64_t const *axes_strides; + int64_t const *reduced_strides; + int64_t const *anti_out_strides; // int reduce_element_num; bool keepdims; From 6e90edccfb63fe8250f3fd7437c435ab6596ebc3 Mon Sep 17 00:00:00 2001 From: stever178 <2874146120@qq.com> Date: Fri, 21 Mar 2025 20:24:15 +0800 Subject: [PATCH 4/5] modify tests --- operatorspy/tests/gather.py | 65 +++++++++++++++++--------------- operatorspy/tests/reduce_mean.py | 2 +- operatorspy/tests/where.py | 5 ++- src/ops/gather/cpu/gather_cpu.cc | 15 +++++++- 4 files changed, 53 insertions(+), 34 deletions(-) diff --git a/operatorspy/tests/gather.py b/operatorspy/tests/gather.py index 0f8179a8..73eec566 100644 --- a/operatorspy/tests/gather.py +++ b/operatorspy/tests/gather.py @@ -38,6 +38,12 @@ def gather(data, indices, axis): np_output = np.take(np_input, np_indices, axis=axis) return torch.from_numpy(np_output) +def gather(rank, axis, inputTensor, indexTensor): + indices = [slice(None)] * rank + indices[axis] = indexTensor + outTensor = inputTensor[tuple(indices)] + return outTensor + def test( lib, @@ -48,36 +54,35 @@ def test( indices_shape, axis=0, tensor_dtype=torch.float16, - indices_dtype=torch.int32, - inplace=Inplace.OUT_OF_PLACE, ): - print( - f"Testing Gather on {torch_device} with o_shape:{o_shape} data_shape:{data_shape} indices_shape:{indices_shape} \ - dtype:{tensor_dtype} inplace: {inplace.name}" - ) - if inplace != Inplace.OUT_OF_PLACE: - print("Unsupported test: 
calculatin in-place is not supported") - return - if data_shape != indices_shape: - print("Unsupported test: broadcasting does not support in-place") - return + # if inplace != Inplace.OUT_OF_PLACE: + # print("Unsupported test: calculatin in-place is not supported") + # return + # if data_shape != indices_shape: + # print("Unsupported test: broadcasting does not support in-place") + # return # q+r-1 - if len(o_shape) != len(indices_shape) + len(data_shape) - 1: - print("Unsupported test: output-shape is not valid") - return + # if len(o_shape) != len(indices_shape) + len(data_shape) - 1: + # print("Unsupported test: output-shape is not valid") + # return - # indices = torch.rand(indices_shape, dtype=indices_dtype).to(torch_device) - indices = torch.randint(0, 3, indices_shape, dtype=indices_dtype).to(torch_device) data = torch.rand(data_shape, dtype=tensor_dtype).to(torch_device) - output = torch.rand(o_shape, dtype=tensor_dtype).to(torch_device) + indices = torch.randint(0, data.shape[axis], data_shape, device=torch_device).to(torch.int32) + # ans = gather(data, indices, axis) + ans = gather(len(data_shape), axis, data, indices) + # output = torch.zeros(o_shape, dtype=tensor_dtype).to(torch_device) + output = torch.zeros(ans.shape, dtype=tensor_dtype, device=torch_device) + + print( + f"Testing Gather on {torch_device} with output_shape:{ans.shape} data_shape:{data_shape} indices_shape:{indices_shape} axis:{axis} dtype:{tensor_dtype}" + ) + # print(ans.shape, indices_shape, data_shape) descriptor = infiniopGatherDescriptor_t() data_tensor = to_tensor(data, lib) indices_tensor = to_tensor(indices, lib) o_tensor = to_tensor(output, lib) - ans = gather(data, indices, axis) - check_error( lib.infiniopCreateGatherDescriptor( handle, @@ -85,7 +90,7 @@ def test( o_tensor.descriptor, data_tensor.descriptor, indices_tensor.descriptor, - axis, + c_int64(axis), ) ) @@ -113,10 +118,10 @@ def get_o_shape(input_shape, indices_shape, axis): def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) - for data_shape, indices_shape, axis, inplace in test_cases: + for data_shape, indices_shape, axis, _ in test_cases: o_shape = get_o_shape(data_shape, indices_shape, axis) - test(lib, handle, "cpu", o_shape, data_shape, indices_shape, axis, tensor_dtype=torch.float16, indices_dtype=torch.int32, inplace=inplace) - test(lib, handle, "cpu", o_shape, data_shape, indices_shape, axis, tensor_dtype=torch.float32, indices_dtype=torch.int64, inplace=inplace) + test(lib, handle, "cpu", o_shape, data_shape, indices_shape, axis, tensor_dtype=torch.float16) + test(lib, handle, "cpu", o_shape, data_shape, indices_shape, axis, tensor_dtype=torch.float32) destroy_handle(lib, handle) @@ -124,13 +129,14 @@ def test_cpu(lib, test_cases): test_cases = [ # q+r-1 # data_shape, indices_shape, axis, inplace + # ((), (), (), Inplace.OUT_OF_PLACE), ((3, 3), (2, ), 0, Inplace.OUT_OF_PLACE), + ((32, 64), (10, 4), 1, Inplace.OUT_OF_PLACE), ((4, 3, 4), (2, 2), 1, Inplace.OUT_OF_PLACE), - ((5, 3, 4, 5), (3, 2), 2, Inplace.OUT_OF_PLACE), - ((32, 20, 512), (2, 16), 0, Inplace.OUT_OF_PLACE), - - # ((32, 256, 112, 112), (, 112, 1), 0, Inplace.OUT_OF_PLACE), - # ((), (), (), Inplace.OUT_OF_PLACE), + ((32, 20, 64), (2, 3), 2, Inplace.OUT_OF_PLACE), + ((1, 3, 4, 5), (1, 2), 3, Inplace.OUT_OF_PLACE), + # ((32, 64, 16, 8), (12, 2), 0, Inplace.OUT_OF_PLACE), + #Cannot allocate memory ] args = get_args() @@ -150,7 +156,6 @@ def test_cpu(lib, test_cases): c_void_p, c_void_p, c_void_p, - # c_void_p, ] 
lib.infiniopDestroyGatherDescriptor.restype = c_int32 diff --git a/operatorspy/tests/reduce_mean.py b/operatorspy/tests/reduce_mean.py index 8844ac0e..8e135522 100644 --- a/operatorspy/tests/reduce_mean.py +++ b/operatorspy/tests/reduce_mean.py @@ -166,7 +166,7 @@ def test_cpu(lib, test_cases): ((2, 3, 2, 5), (1, 3), True, False), ((2, 3, 2, 5), (-2, -1), False, False), - ((3, 2, 5, 4), (-3, -2, -1), True, False), #Floating point exception + # ((3, 2, 5, 4), (-3, -2, -1), True, False), #Floating point exception # ((3, 2, 5, 4), (-4, -3, -2, -1), True, False), #AssertionError ((1, 3), (), True, True), diff --git a/operatorspy/tests/where.py b/operatorspy/tests/where.py index 7088620f..12c849c0 100644 --- a/operatorspy/tests/where.py +++ b/operatorspy/tests/where.py @@ -59,7 +59,8 @@ def test( y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device) output = torch.rand(o_shape, dtype=tensor_dtype).to(torch_device) \ if inplace == Inplace.OUT_OF_PLACE else (x if inplace == Inplace.INPLACE_X else y) - condition = torch.randint(0, 2, condition_shape, dtype=torch.uint8).to(torch_device) + condition_ = torch.randint(0, 2, condition_shape, dtype=torch.bool).to(torch_device) + condition = condition_.to(dtype=torch.uint8).to(torch_device) descriptor = infiniopWhereDescriptor_t() x_tensor = to_tensor(x, lib) @@ -68,7 +69,7 @@ def test( if inplace == Inplace.OUT_OF_PLACE else (x_tensor if inplace == Inplace.INPLACE_X else y_tensor) condition_tensor = to_tensor(condition, lib) - ans = where(condition, x, y) + ans = where(condition_, x, y) check_error( lib.infiniopCreateWhereDescriptor( diff --git a/src/ops/gather/cpu/gather_cpu.cc b/src/ops/gather/cpu/gather_cpu.cc index d391da74..2a609999 100644 --- a/src/ops/gather/cpu/gather_cpu.cc +++ b/src/ops/gather/cpu/gather_cpu.cc @@ -21,6 +21,7 @@ infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t, infiniopTensorDescriptor_t data, infiniopTensorDescriptor_t indices, int64_t axis = 0) { + // printf("\tcpuCreateGatherDescriptor\n"); uint64_t indices_ndim = indices->ndim;// rank q uint64_t data_ndim = data->ndim; // rank r uint64_t ndim = output->ndim; // q+r-1 @@ -39,13 +40,24 @@ infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t, } // q+r-1 if (ndim != indices_ndim + data_ndim - 1) { + printf("\tseems wrong shape\n"); return STATUS_BAD_TENSOR_SHAPE; } // [-r, r-1] - if (axis < -data_ndim || axis > data_ndim - 1) { + // 0 < -2 || 0 > 1 + if (axis < (int64_t)(-data_ndim) || axis > (int64_t)(data_ndim - 1)) { + // axis:0, data_ndim:2 + printf("\tseems wrong dim, axis:%ld, data_ndim:%ld\n", axis, data_ndim); + if (axis > data_ndim - 1) { + printf("\t%ld > %ld\n", axis, data_ndim - 1); + } + if (axis < 0-data_ndim) { + printf("\t%ld < %ld\n", axis, -data_ndim); + } return STATUS_BAD_PARAM; } axis = (axis + data_ndim) % data_ndim; + // printf("\tcheck over\n"); // uint64_t o_data_size = std::accumulate(output->shape, output->shape + output->ndim, 1ULL, std::multiplies()); @@ -102,6 +114,7 @@ infiniopStatus_t gather_cpu(GatherCpuDescriptor_t desc, void *output, void const *data, void const *indices) { + // printf("\tgather_cpu\n"); auto data_ = reinterpret_cast(data); auto output_ = reinterpret_cast(output); auto indices_ = reinterpret_cast(indices); From 3100656abbbccf8de3e6a7b1594c446f23b94500 Mon Sep 17 00:00:00 2001 From: stever178 <2874146120@qq.com> Date: Fri, 21 Mar 2025 20:53:42 +0800 Subject: [PATCH 5/5] little change --- operatorspy/tests/gather.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/operatorspy/tests/gather.py b/operatorspy/tests/gather.py index 73eec566..98d496e7 100644 --- a/operatorspy/tests/gather.py +++ b/operatorspy/tests/gather.py @@ -67,7 +67,8 @@ def test( # return data = torch.rand(data_shape, dtype=tensor_dtype).to(torch_device) - indices = torch.randint(0, data.shape[axis], data_shape, device=torch_device).to(torch.int32) + # indices = torch.randint(0, data.shape[axis], data_shape, device=torch_device).to(torch.int32) + indices = torch.randint(-data.shape[axis], data.shape[axis], data_shape, device=torch_device).to(torch.int32) # ans = gather(data, indices, axis) ans = gather(len(data_shape), axis, data, indices) # output = torch.zeros(o_shape, dtype=tensor_dtype).to(torch_device)
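
Note on the final hunk above: drawing indices from the full ONNX range [-s, s-1] (s = data.shape[axis]) exercises the negative-index path in gather_cpu.cc, which normalizes each index with (index_val + s) % s before copying. A minimal standalone sketch of that behavior, assuming plain NumPy and names of my own choosing (illustrative only, not part of the patch):

import numpy as np

# Negative gather indices in [-s, s-1] address the axis from the end;
# (idx + s) % s maps them onto the equivalent non-negative indices,
# which is what the CPU kernel does before its memcpy.
data = np.arange(12, dtype=np.float32).reshape(3, 4)
axis = 0
s = data.shape[axis]
idx = np.array([[-1, 0], [1, -3]], dtype=np.int64)

ref = np.take(data, idx, axis=axis)                 # np.take accepts negative indices
emulated = np.take(data, (idx + s) % s, axis=axis)  # kernel-style normalization

assert np.array_equal(ref, emulated)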