From 75b78d967823959ca961e8f4fb36aa41c428003d Mon Sep 17 00:00:00 2001 From: Zimin Li Date: Thu, 21 Nov 2024 18:08:27 +0800 Subject: [PATCH 1/5] Refactor add into a binary elementwise operation and add sub, mul, div, pow, mod, min, max --- include/ops/div/div.h | 27 +++ include/ops/max/max.h | 27 +++ include/ops/min/min.h | 27 +++ include/ops/mod/mod.h | 27 +++ include/ops/mul/mul.h | 27 +++ include/ops/pow/pow.h | 27 +++ include/ops/sub/sub.h | 27 +++ operatorspy/tests/add.py | 53 ++++-- operatorspy/tests/div.py | 208 ++++++++++++++++++++++ operatorspy/tests/max.py | 190 +++++++++++++++++++++ operatorspy/tests/min.py | 189 ++++++++++++++++++++ operatorspy/tests/mod.py | 208 ++++++++++++++++++++++ operatorspy/tests/mul.py | 189 ++++++++++++++++++++ operatorspy/tests/pow.py | 187 ++++++++++++++++++++ operatorspy/tests/sub.py | 189 ++++++++++++++++++++ src/devices/cpu/common_cpu.cc | 25 ++- src/ops/add/cpu/add_cpu.cc | 101 ----------- src/ops/add/cpu/add_cpu.h | 33 ---- src/ops/add/cuda/add.cc | 81 --------- src/ops/add/cuda/add.cu | 116 ------------- src/ops/add/cuda/add.cuh | 37 ---- src/ops/add/operator.cc | 95 ++++------- src/ops/binary/binary.h | 72 ++++++++ src/ops/binary/cpu/binary_cpu.cc | 176 +++++++++++++++++++ src/ops/binary/cpu/binary_cpu.h | 37 ++++ src/ops/binary/cuda/binary.cc | 82 +++++++++ src/ops/binary/cuda/binary.cu | 285 +++++++++++++++++++++++++++++++ src/ops/binary/cuda/binary.cuh | 38 +++++ src/ops/binary/operator.cc | 74 ++++++++ src/ops/div/operator.cc | 44 +++++ src/ops/max/operator.cc | 44 +++++ src/ops/min/operator.cc | 44 +++++ src/ops/mod/operator.cc | 44 +++++ src/ops/mul/operator.cc | 44 +++++ src/ops/pow/operator.cc | 44 +++++ src/ops/sub/operator.cc | 44 +++++ 36 files changed, 2705 insertions(+), 457 deletions(-) create mode 100644 include/ops/div/div.h create mode 100644 include/ops/max/max.h create mode 100644 include/ops/min/min.h create mode 100644 include/ops/mod/mod.h create mode 100644 include/ops/mul/mul.h create mode 100644 include/ops/pow/pow.h create mode 100644 include/ops/sub/sub.h create mode 100644 operatorspy/tests/div.py create mode 100644 operatorspy/tests/max.py create mode 100644 operatorspy/tests/min.py create mode 100644 operatorspy/tests/mod.py create mode 100644 operatorspy/tests/mul.py create mode 100644 operatorspy/tests/pow.py create mode 100644 operatorspy/tests/sub.py delete mode 100644 src/ops/add/cpu/add_cpu.cc delete mode 100644 src/ops/add/cpu/add_cpu.h delete mode 100644 src/ops/add/cuda/add.cc delete mode 100644 src/ops/add/cuda/add.cu delete mode 100644 src/ops/add/cuda/add.cuh create mode 100644 src/ops/binary/binary.h create mode 100644 src/ops/binary/cpu/binary_cpu.cc create mode 100644 src/ops/binary/cpu/binary_cpu.h create mode 100644 src/ops/binary/cuda/binary.cc create mode 100644 src/ops/binary/cuda/binary.cu create mode 100644 src/ops/binary/cuda/binary.cuh create mode 100644 src/ops/binary/operator.cc create mode 100644 src/ops/div/operator.cc create mode 100644 src/ops/max/operator.cc create mode 100644 src/ops/min/operator.cc create mode 100644 src/ops/mod/operator.cc create mode 100644 src/ops/mul/operator.cc create mode 100644 src/ops/pow/operator.cc create mode 100644 src/ops/sub/operator.cc diff --git a/include/ops/div/div.h b/include/ops/div/div.h new file mode 100644 index 00000000..a749272d --- /dev/null +++ b/include/ops/div/div.h @@ -0,0 +1,27 @@ +#ifndef DIV_H +#define DIV_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct DivDescriptor { + Device device; +} DivDescriptor; + 
+typedef DivDescriptor *infiniopDivDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/ops/max/max.h b/include/ops/max/max.h new file mode 100644 index 00000000..446dcb2e --- /dev/null +++ b/include/ops/max/max.h @@ -0,0 +1,27 @@ +#ifndef MAX_H +#define MAX_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct MaxDescriptor { + Device device; +} MaxDescriptor; + +typedef MaxDescriptor *infiniopMaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopMax(infiniopMaxDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc); + +#endif diff --git a/include/ops/min/min.h b/include/ops/min/min.h new file mode 100644 index 00000000..28767f93 --- /dev/null +++ b/include/ops/min/min.h @@ -0,0 +1,27 @@ +#ifndef MIN_H +#define MIN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct MinDescriptor { + Device device; +} MinDescriptor; + +typedef MinDescriptor *infiniopMinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopMin(infiniopMinDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc); + +#endif diff --git a/include/ops/mod/mod.h b/include/ops/mod/mod.h new file mode 100644 index 00000000..0065c885 --- /dev/null +++ b/include/ops/mod/mod.h @@ -0,0 +1,27 @@ +#ifndef MOD_H +#define MOD_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ModDescriptor { + Device device; +} ModDescriptor; + +typedef ModDescriptor *infiniopModDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopMod(infiniopModDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc); + +#endif diff --git a/include/ops/mul/mul.h b/include/ops/mul/mul.h new file mode 100644 index 00000000..4a30afe4 --- /dev/null +++ b/include/ops/mul/mul.h @@ -0,0 +1,27 @@ +#ifndef MUL_H +#define MUL_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct MulDescriptor { + Device device; +} MulDescriptor; + +typedef MulDescriptor *infiniopMulDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle, + infiniopMulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + 
infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopMul(infiniopMulDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc); + +#endif diff --git a/include/ops/pow/pow.h b/include/ops/pow/pow.h new file mode 100644 index 00000000..3ba6eb7c --- /dev/null +++ b/include/ops/pow/pow.h @@ -0,0 +1,27 @@ +#ifndef POW_H +#define POW_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct PowDescriptor { + Device device; +} PowDescriptor; + +typedef PowDescriptor *infiniopPowDescriptor_t; + +__C __export infiniopStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopPow(infiniopPowDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc); + +#endif diff --git a/include/ops/sub/sub.h b/include/ops/sub/sub.h new file mode 100644 index 00000000..f5577720 --- /dev/null +++ b/include/ops/sub/sub.h @@ -0,0 +1,27 @@ +#ifndef SUB_H +#define SUB_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct SubDescriptor { + Device device; +} SubDescriptor; + +typedef SubDescriptor *infiniopSubDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateSubDescriptor(infiniopHandle_t handle, + infiniopSubDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopSub(infiniopSubDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc); + +#endif diff --git a/operatorspy/tests/add.py b/operatorspy/tests/add.py index d766208c..b4d2203e 100644 --- a/operatorspy/tests/add.py +++ b/operatorspy/tests/add.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -19,11 +20,18 @@ from enum import Enum, auto import torch +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class Inplace(Enum): OUT_OF_PLACE = auto() INPLACE_A = auto() INPLACE_B = auto() + INPLACE_AB = auto() class AddDescriptor(Structure): @@ -34,6 +42,10 @@ class AddDescriptor(Structure): def add(x, y): + if PROFILE: + ans = torch.add(x, y) + torch.cuda.synchronize() + return ans return torch.add(x, y) @@ -55,13 +67,20 @@ def test( return a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) - b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) - ans = add(a, b) + for i in range(NUM_PRERUN if PROFILE else 1): + ans = add(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = add(a, b) + elapsed = (time.time() - start_time) 
/ NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) descriptor = infiniopAddDescriptor_t() @@ -74,9 +93,20 @@ def test( b_tensor.descriptor, ) ) - lib.infiniopAdd( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None - ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopAdd( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopAdd( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + assert torch.allclose(c, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyAddDescriptor(descriptor)) @@ -113,20 +143,17 @@ def test_bang(lib, test_cases): if __name__ == "__main__": test_cases = [ # c_shape, a_shape, b_shape, inplace - # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE), # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), ((), (), (), Inplace.OUT_OF_PLACE), - ((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE), - ((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE), - ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A), - ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B), - ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), - ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), ] args = get_args() lib = open_lib() diff --git a/operatorspy/tests/div.py b/operatorspy/tests/div.py new file mode 100644 index 00000000..c303c635 --- /dev/null +++ b/operatorspy/tests/div.py @@ -0,0 +1,208 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), 
+# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class DivDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopDivDescriptor_t = POINTER(DivDescriptor) + +def find_and_print_differing_indices( + a, b, c, ans, atol=0, rtol=1e-2 +): + if c.shape != ans.shape: + raise ValueError("Tensors must have the same shape to compare.") + + # Calculate the difference mask based on atol and rtol + diff_mask = torch.abs(c - ans) > (atol + rtol * torch.abs(ans)) + diff_indices = torch.nonzero(diff_mask, as_tuple=False) + + # Print the indices and the differing elements + for idx in diff_indices: + index_tuple = tuple(idx.tolist()) + print( + f"Index: {index_tuple}, a: {a[index_tuple]}, b: {b[index_tuple]}, y element: {c[index_tuple]}, ans element: {ans[index_tuple]}" + ) + + return diff_indices + + +def div(x, y): + if PROFILE: + ans = torch.div(x, y) + torch.cuda.synchronize() + return ans + return torch.div(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Div on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) * 2 if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = div(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = div(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopDivDescriptor_t() + + check_error( + lib.infiniopCreateDivDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopDiv( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopDiv( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=1e-3, equal_nan=True) + check_error(lib.infiniopDestroyDivDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = 
create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateDivDescriptor.restype = c_int32 + lib.infiniopCreateDivDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopDivDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopDivDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopDivDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/max.py b/operatorspy/tests/max.py new file mode 100644 index 00000000..39eaf2b3 --- /dev/null +++ b/operatorspy/tests/max.py @@ -0,0 +1,190 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() 
for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class MaxDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMaxDescriptor_t = POINTER(MaxDescriptor) + + +def max(x, y): + if PROFILE: + ans = torch.max(x, y) + torch.cuda.synchronize() + return ans + return torch.max(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Max on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = max(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = max(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopMaxDescriptor_t() + + check_error( + lib.infiniopCreateMaxDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopMax( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopMax( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyMaxDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, 
inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((33, 333, 333), (33, 333, 333), (33, 333, 333), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateMaxDescriptor.restype = c_int32 + lib.infiniopCreateMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMaxDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopMax.restype = c_int32 + lib.infiniopMax.argtypes = [ + infiniopMaxDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyMaxDescriptor.restype = c_int32 + lib.infiniopDestroyMaxDescriptor.argtypes = [ + infiniopMaxDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/min.py b/operatorspy/tests/min.py new file mode 100644 index 00000000..050ea64a --- /dev/null +++ b/operatorspy/tests/min.py @@ -0,0 +1,189 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class MinDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMinDescriptor_t = POINTER(MinDescriptor) + + +def min(x, y): + if PROFILE: + ans = torch.min(x, y) + torch.cuda.synchronize() + return ans + return torch.min(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Min on 
{torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = min(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = min(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopMinDescriptor_t() + + check_error( + lib.infiniopCreateMinDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopMin( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopMin( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyMinDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), 
Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateMinDescriptor.restype = c_int32 + lib.infiniopCreateMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMinDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopMin.restype = c_int32 + lib.infiniopMin.argtypes = [ + infiniopMinDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyMinDescriptor.restype = c_int32 + lib.infiniopDestroyMinDescriptor.argtypes = [ + infiniopMinDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/mod.py b/operatorspy/tests/mod.py new file mode 100644 index 00000000..342e3662 --- /dev/null +++ b/operatorspy/tests/mod.py @@ -0,0 +1,208 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class ModDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopModDescriptor_t = POINTER(ModDescriptor) + +def find_and_print_differing_indices( + a, b, c, ans, atol=0, rtol=1e-2 +): + if c.shape != ans.shape: + raise ValueError("Tensors must have the same shape to compare.") + + # Calculate the difference mask based on atol and rtol + diff_mask = torch.abs(c - ans) > (atol + rtol * torch.abs(ans)) + diff_indices = torch.nonzero(diff_mask, as_tuple=False) + + # Print the indices and the differing elements + for idx in diff_indices: + index_tuple = tuple(idx.tolist()) + print( + f"Index: {index_tuple}, a: {a[index_tuple]}, b: {b[index_tuple]}, y element: {c[index_tuple]}, ans element: {ans[index_tuple]}" + ) + + return diff_indices + + +def mod(x, y): + if PROFILE: + ans = torch.fmod(x, y) + torch.cuda.synchronize() + return ans + return torch.fmod(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Mod on {torch_device} with c_shape:{c_shape} 
a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = mod(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = mod(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopModDescriptor_t() + + check_error( + lib.infiniopCreateModDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopMod( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopMod( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=0, equal_nan=True) + check_error(lib.infiniopDestroyModDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 
100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateModDescriptor.restype = c_int32 + lib.infiniopCreateModDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopModDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopMod.restype = c_int32 + lib.infiniopMod.argtypes = [ + infiniopModDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyModDescriptor.restype = c_int32 + lib.infiniopDestroyModDescriptor.argtypes = [ + infiniopModDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/mul.py b/operatorspy/tests/mul.py new file mode 100644 index 00000000..57085b42 --- /dev/null +++ b/operatorspy/tests/mul.py @@ -0,0 +1,189 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class MulDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMulDescriptor_t = POINTER(MulDescriptor) + + +def mul(x, y): + if PROFILE: + ans = torch.mul(x, y) + torch.cuda.synchronize() + return ans + return torch.mul(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Mul on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = mul(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = 
mul(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopMulDescriptor_t() + + check_error( + lib.infiniopCreateMulDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopMul( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopMul( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=1e-6, rtol=1e-3) + check_error(lib.infiniopDestroyMulDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateMulDescriptor.restype = c_int32 + 
lib.infiniopCreateMulDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMulDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopMul.restype = c_int32 + lib.infiniopMul.argtypes = [ + infiniopMulDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyMulDescriptor.restype = c_int32 + lib.infiniopDestroyMulDescriptor.argtypes = [ + infiniopMulDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/pow.py b/operatorspy/tests/pow.py new file mode 100644 index 00000000..e5106b36 --- /dev/null +++ b/operatorspy/tests/pow.py @@ -0,0 +1,187 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class PowDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopPowDescriptor_t = POINTER(PowDescriptor) + + +def pow(x, y): + if PROFILE: + ans = torch.pow(x, y) + torch.cuda.synchronize() + return ans + return torch.pow(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Pow on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 5 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) * 5 if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = pow(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = pow(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopPowDescriptor_t() + + check_error( + lib.infiniopCreatePowDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopPow( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopPow( + descriptor, 
c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=1e-7, rtol=1e-3, equal_nan=True) + check_error(lib.infiniopDestroyPowDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((33, 333, 333), (33, 333, 333), (33, 333, 333), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreatePowDescriptor.restype = c_int32 + lib.infiniopCreatePowDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopPowDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopPow.restype = c_int32 + lib.infiniopPow.argtypes = [ + infiniopPowDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyPowDescriptor.restype = c_int32 + lib.infiniopDestroyPowDescriptor.argtypes = [ + infiniopPowDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git 
a/operatorspy/tests/sub.py b/operatorspy/tests/sub.py new file mode 100644 index 00000000..cc8eb406 --- /dev/null +++ b/operatorspy/tests/sub.py @@ -0,0 +1,189 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class SubDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopSubDescriptor_t = POINTER(SubDescriptor) + + +def subtract(x, y): + if PROFILE: + ans = torch.subtract(x, y) + torch.cuda.synchronize() + return ans + return torch.subtract(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Sub on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = subtract(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = subtract(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopSubDescriptor_t() + + check_error( + lib.infiniopCreateSubDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopSub( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopSub( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroySubDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, 
tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateSubDescriptor.restype = c_int32 + lib.infiniopCreateSubDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopSubDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopSub.restype = c_int32 + lib.infiniopSub.argtypes = [ + infiniopSubDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySubDescriptor.restype = c_int32 + lib.infiniopDestroySubDescriptor.argtypes = [ + infiniopSubDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/src/devices/cpu/common_cpu.cc b/src/devices/cpu/common_cpu.cc index b5b5f0fd..3e097446 100644 --- a/src/devices/cpu/common_cpu.cc +++ b/src/devices/cpu/common_cpu.cc @@ -44,22 +44,19 @@ uint16_t f32_to_f16(float val) { int32_t exponent = ((f32 >> 23) & 0xFF) - 127;// Extract and de-bias the exponent uint32_t mantissa = f32 & 0x7FFFFF; // Extract the mantissa (fraction part) - if (exponent == 128) {// Special case for Inf and NaN - if (mantissa != 0) { - // NaN - return sign | 0x7C00 | (mantissa >> 13);// Convert the NaN payload - } else { - // Infinity - return sign | 0x7C00; + if (exponent >= 31) {// Special cases for Inf and NaN + // NaN + if (exponent == 
128 && mantissa != 0) { + return sign | 0x7E00; } - } else if (exponent > 15) { // Overflow: Larger than float16 max - return sign | 0x7C00; // Return infinity - } else if (exponent >= -14) {// Normalized float16 + // Infinity + return sign | 0x7C00; + } else if (exponent >= -14) {// Normalized case return sign | ((exponent + 15) << 10) | (mantissa >> 13); - } else if (exponent >= -24) { // Subnormal float16 (leading denormals) - mantissa |= 0x800000; // Add implicit leading 1 - int32_t shift = -exponent - 1;// Calculate shift for subnormal numbers - return sign | (mantissa >> (13 + shift)); + } else if (exponent >= -24) { + mantissa |= 0x800000;// Add implicit leading 1 + mantissa >>= (-14 - exponent); + return sign | (mantissa >> 13); } else { // Too small for subnormal: return signed zero return sign; diff --git a/src/ops/add/cpu/add_cpu.cc b/src/ops/add/cpu/add_cpu.cc deleted file mode 100644 index 649fa052..00000000 --- a/src/ops/add/cpu/add_cpu.cc +++ /dev/null @@ -1,101 +0,0 @@ -#include "add_cpu.h" -#include "../../../devices/cpu/common_cpu.h" -#include "../../utils.h" - -inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) { - for (int64_t i = ndim - 1; i >= 0; --i) { - if (++indices[i] != shape[i]) { - return; - } - indices[i] = 0; - } -} - -inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) { - return std::inner_product(indices, indices + ndim, strides, uint64_t(0)); -} - -infiniopStatus_t cpuCreateAddDescriptor(infiniopHandle_t, - AddCpuDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b) { - uint64_t ndim = c->ndim; - if (!isValidBroadcastShape(a, b, c)) { - return STATUS_BAD_TENSOR_SHAPE; - } - if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { - return STATUS_BAD_TENSOR_STRIDES; - } - if (c->dt != F16 && c->dt != F32) { - return STATUS_BAD_TENSOR_DTYPE; - } - if (c->dt != a->dt || c->dt != b->dt) { - return STATUS_BAD_TENSOR_DTYPE; - } - - uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); - - // get the adjusted strides for a and b - uint64_t *a_strides = new uint64_t[ndim]; - uint64_t *b_strides = new uint64_t[ndim]; - for (size_t i = 0; i < ndim; ++i) { - a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; - b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 
0 : b->strides[i + b->ndim - ndim]; - } - - uint64_t *c_indices = new uint64_t[ndim]; - std::fill(c_indices, c_indices + ndim, 0); - - *desc_ptr = new AddCpuDescriptor{ - DevCpu, - c->dt, - ndim, - c_data_size, - c->shape, - a_strides, - b_strides, - c_indices, - }; - - return STATUS_SUCCESS; -} - -infiniopStatus_t cpuDestroyAddDescriptor(AddCpuDescriptor_t desc) { - delete[] desc->a_strides; - delete[] desc->b_strides; - delete[] desc->c_indices; - delete desc; - return STATUS_SUCCESS; -} - -template -infiniopStatus_t add_cpu(AddCpuDescriptor_t desc, void *c, void const *a, void const *b) { - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); - const auto &indices = desc->c_indices; - - for (uint64_t i = 0; i < desc->c_data_size; ++i, incrementOne(indices, desc->c_shape, desc->ndim)) { - auto a_index = compactToFlat(indices, desc->a_strides, desc->ndim); - auto b_index = compactToFlat(indices, desc->b_strides, desc->ndim); - if constexpr (std::is_same::value) { - c_[i] = f32_to_f16(f16_to_f32(a_[a_index]) + f16_to_f32(b_[b_index])); - } else { - c_[i] = a_[a_index] + b_[b_index]; - } - } - return STATUS_SUCCESS; -} - -infiniopStatus_t cpuAdd(AddCpuDescriptor_t desc, - void *c, void const *a, void const *b, - void *stream) { - if (desc->dtype == F16) { - return add_cpu(desc, c, a, b); - } - if (desc->dtype == F32) { - return add_cpu(desc, c, a, b); - } - return STATUS_BAD_TENSOR_DTYPE; -} diff --git a/src/ops/add/cpu/add_cpu.h b/src/ops/add/cpu/add_cpu.h deleted file mode 100644 index 42e62435..00000000 --- a/src/ops/add/cpu/add_cpu.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __CPU_ADD_H__ -#define __CPU_ADD_H__ - -#include "operators.h" -#include -#include - -struct AddCpuDescriptor { - Device device; - DT dtype; - uint64_t ndim; - uint64_t c_data_size; - uint64_t const *c_shape; - uint64_t const *a_strides; - uint64_t const *b_strides; - uint64_t *c_indices; -}; - -typedef struct AddCpuDescriptor *AddCpuDescriptor_t; - -infiniopStatus_t cpuCreateAddDescriptor(infiniopHandle_t, - AddCpuDescriptor_t *, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -infiniopStatus_t cpuAdd(AddCpuDescriptor_t desc, - void *c, void const *a, void const *b, - void *stream); - -infiniopStatus_t cpuDestroyAddDescriptor(AddCpuDescriptor_t desc); - -#endif diff --git a/src/ops/add/cuda/add.cc b/src/ops/add/cuda/add.cc deleted file mode 100644 index b010894f..00000000 --- a/src/ops/add/cuda/add.cc +++ /dev/null @@ -1,81 +0,0 @@ -#include "add.cuh" -#include "../../../devices/cuda/common_cuda.h" -#include "../../utils.h" - -infiniopStatus_t cudaCreateAddDescriptor(CudaHandle_t handle, - AddCudaDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b) { - uint64_t ndim = c->ndim; - if (!isValidBroadcastShape(a, b, c)) { - return STATUS_BAD_TENSOR_SHAPE; - } - if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { - return STATUS_BAD_TENSOR_STRIDES; - } - if (c->dt != F16 && c->dt != F32) { - return STATUS_BAD_TENSOR_DTYPE; - } - if (c->dt != a->dt || c->dt != b->dt) { - return STATUS_BAD_TENSOR_DTYPE; - } - bool broadcasted = false; - if (ndim != a->ndim || ndim != b->ndim) { - broadcasted = true; - } else { - for (uint64_t i = 0; i < ndim; ++i) { - if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) { - broadcasted = true; - break; - } - } - } - - uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, 
std::multiplies()); - - // get the adjusted strides for a and b - int64_t *a_strides = new int64_t[ndim]; - int64_t *b_strides = new int64_t[ndim]; - for (size_t i = 0; i < ndim; ++i) { - a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; - b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim]; - } - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, handle->device_id); - - int64_t *a_strides_d, *b_strides_d, *c_strides_d; - checkCudaErrorWithCode(cudaMalloc(&a_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); - checkCudaErrorWithCode(cudaMalloc(&b_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); - checkCudaErrorWithCode(cudaMalloc(&c_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); - checkCudaErrorWithCode(cudaMemcpy(a_strides_d, a_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); - checkCudaErrorWithCode(cudaMemcpy(b_strides_d, b_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); - checkCudaErrorWithCode(cudaMemcpy(c_strides_d, c->strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); - - *desc_ptr = new AddCudaDescriptor{ - DevNvGpu, - c->dt, - handle->device_id, - ndim, - c_data_size, - static_cast(prop.maxGridSize[0]), - a_strides_d, - b_strides_d, - c_strides_d, - broadcasted, - }; - - delete[] a_strides; - delete[] b_strides; - - return STATUS_SUCCESS; -} - -infiniopStatus_t cudaDestroyAddDescriptor(AddCudaDescriptor_t desc) { - checkCudaErrorWithCode(cudaFree((void *) desc->a_strides), STATUS_EXECUTION_FAILED); - checkCudaErrorWithCode(cudaFree((void *) desc->b_strides), STATUS_EXECUTION_FAILED); - checkCudaErrorWithCode(cudaFree((void *) desc->c_strides), STATUS_EXECUTION_FAILED); - delete desc; - return STATUS_SUCCESS; -} diff --git a/src/ops/add/cuda/add.cu b/src/ops/add/cuda/add.cu deleted file mode 100644 index 9d9aefcb..00000000 --- a/src/ops/add/cuda/add.cu +++ /dev/null @@ -1,116 +0,0 @@ -#include "../../../devices/cuda/common_cuda.h" -#include "../../utils.h" -#include "add.cuh" - -/** - * @brief A templated vector struct that supports element-wise addition on arrays. - * - * @tparam T - The access data type for elements in the vector. - * @tparam TComp - The computation data type used for arithmetic operations. - * @tparam N - The number of elements of type T in the vector for a single access. 
- */ -template -struct vecN { - T data[N]; - - __device__ __forceinline__ vecN operator+(const vecN &other) const { - vecN result; - - for (int i = 0; i < N; ++i) { - if constexpr (std::is_same::value) { - result.data[i] = data[i] + other.data[i]; - } else { - constexpr static size_t pack_size = sizeof(T) / sizeof(TComp); - auto data_ = reinterpret_cast *>(result.data); - data_[i] = std::move(reinterpret_cast const *>(data)[i] + - reinterpret_cast const *>(other.data)[i]); - } - } - - return result; - } - - __device__ __forceinline__ const T &operator[](size_t i) const { - return data[i]; - } -}; - -template -__global__ void add( - Tdata *c, - const Tdata *a, - const Tdata *b, - const int64_t *a_strides, - const int64_t *b_strides, - const int64_t *c_strides, - uint64_t data_size, - uint64_t ndim, - uint64_t offset, - bool broadcasted, - unsigned pack_size) { - uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; - - if (idx < data_size) { - if (broadcasted) { - idx *= pack_size; - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); -#pragma unroll - for (size_t i = 0; i < pack_size; ++i) { - auto a_idx = getDstOffset(idx + i, ndim, c_strides, a_strides); - auto b_idx = getDstOffset(idx + i, ndim, c_strides, b_strides); - c_[idx + i] = a_[a_idx] + b_[b_idx]; - } - return; - } - c[idx] = a[idx] + b[idx]; - } -} - -template -void _add_nv_gpu(AddCudaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const *b, uint64_t data_size, uint64_t pack_size, uint64_t offset, void *stream) { - if (data_size == 0) { - return; - } - dim3 blockDims = dim3(std::min(static_cast(256), data_size)); - dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size)); - uint64_t step = gridDims.x * blockDims.x; - - cudaStream_t cuda_stream = reinterpret_cast(stream); - -#pragma unroll - for (uint64_t i = 0; i < data_size; i += step) { - add<<>>( - c, a, b, desc->a_strides, desc->b_strides, desc->c_strides, offset + data_size, desc->ndim, offset + i, desc->broadcasted, pack_size); - } -} - -template -infiniopStatus_t add_nv_gpu(AddCudaDescriptor_t desc, void *c, void const *a, void const *b, void *stream, uint64_t pack_size) { - const auto data_size = desc->c_data_size / pack_size; - const auto a_vec = reinterpret_cast(a); - const auto b_vec = reinterpret_cast(b); - const auto c_vec = reinterpret_cast(c); - _add_nv_gpu(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream); - - const auto remainder = desc->c_data_size % pack_size; - const auto a_ = reinterpret_cast(a); - const auto b_ = reinterpret_cast(b); - const auto c_ = reinterpret_cast(c); - _add_nv_gpu(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream); - return STATUS_SUCCESS; -} - -infiniopStatus_t cudaAdd(AddCudaDescriptor_t desc, - void *c, void const *a, void const *b, - void *stream) { - checkCudaError(cudaSetDevice(desc->device_id)); - if (desc->dtype == F16) { - return add_nv_gpu, half>(desc, c, a, b, stream, 8); - } - if (desc->dtype == F32) { - return add_nv_gpu, float>(desc, c, a, b, stream, 4); - } - return STATUS_BAD_TENSOR_DTYPE; -} diff --git a/src/ops/add/cuda/add.cuh b/src/ops/add/cuda/add.cuh deleted file mode 100644 index 03a181eb..00000000 --- a/src/ops/add/cuda/add.cuh +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef __CUDA_ADD_H__ -#define __CUDA_ADD_H__ - -#include "../../../devices/cuda/common_cuda.h" -#include "../../../devices/cuda/cuda_handle.h" -#include "operators.h" -#include -#include - -struct AddCudaDescriptor { - Device device; - DT 
dtype; - int device_id; - uint64_t ndim; - uint64_t c_data_size; - uint64_t max_grid_size; - int64_t const *a_strides; - int64_t const *b_strides; - int64_t const *c_strides; - bool broadcasted; -}; - -typedef struct AddCudaDescriptor *AddCudaDescriptor_t; - -infiniopStatus_t cudaCreateAddDescriptor(CudaHandle_t, - AddCudaDescriptor_t *, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -infiniopStatus_t cudaAdd(AddCudaDescriptor_t desc, - void *c, void const *a, void const *b, - void *stream); - -infiniopStatus_t cudaDestroyAddDescriptor(AddCudaDescriptor_t desc); - -#endif diff --git a/src/ops/add/operator.cc b/src/ops/add/operator.cc index c2a30ea8..b2c51f70 100644 --- a/src/ops/add/operator.cc +++ b/src/ops/add/operator.cc @@ -1,73 +1,44 @@ +#include "../binary/binary.h" #include "../utils.h" -#include "operators.h" #include "ops/add/add.h" +#include "tensor/tensor_descriptor.h" -#ifdef ENABLE_CPU -#include "cpu/add_cpu.h" -#endif -#ifdef ENABLE_NV_GPU -#include "../../devices/cuda/cuda_handle.h" -#include "cuda/add.cuh" -#endif +struct _AddDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; -__C infiniopStatus_t infiniopCreateAddDescriptor( - infiniopHandle_t handle, - infiniopAddDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b) { - switch (handle->device) { -#ifdef ENABLE_CPU - case DevCpu: - return cpuCreateAddDescriptor(handle, (AddCpuDescriptor_t *) desc_ptr, c, a, b); -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: { - return cudaCreateAddDescriptor((CudaHandle_t) handle, (AddCudaDescriptor_t *) desc_ptr, c, a, b); - } +typedef struct _AddDescriptor *_AddDescriptor_t; -#endif -#ifdef ENABLE_CAMBRICON_MLU - // TODO -#endif - } - return STATUS_BAD_DEVICE; -} +__C __export infiniopStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle, + infiniopAddDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Add), STATUS_SUCCESS); -__C infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { - switch (desc->device) { -#ifdef ENABLE_CPU - case DevCpu: - return cpuAdd((AddCpuDescriptor_t) desc, c, a, b, stream); -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: { - return cudaAdd((AddCudaDescriptor_t) desc, c, a, b, stream); - } + *(_AddDescriptor_t *) desc_ptr = new _AddDescriptor{ + handle->device, + binary_desc, + }; -#endif -#ifdef ENABLE_CAMBRICON_MLU - // TODO -#endif - } - return STATUS_BAD_DEVICE; + return STATUS_SUCCESS; } -__C infiniopStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { - switch (desc->device) { -#ifdef ENABLE_CPU - case DevCpu: - return cpuDestroyAddDescriptor((AddCpuDescriptor_t) desc); -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: { - return cudaDestroyAddDescriptor((AddCudaDescriptor_t) desc); - } +__C __export infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_AddDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} -#endif -#ifdef ENABLE_CAMBRICON_MLU - // TODO -#endif - } - return STATUS_BAD_DEVICE; +__C __export infiniopStatus_t 
infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_AddDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; } diff --git a/src/ops/binary/binary.h b/src/ops/binary/binary.h new file mode 100644 index 00000000..1e953b51 --- /dev/null +++ b/src/ops/binary/binary.h @@ -0,0 +1,72 @@ +#ifndef BINARY_H +#define BINARY_H + +#include "export.h" +#include "operators.h" +#include + +typedef struct BinaryDescriptor { + Device device; +} BinaryDescriptor; + +typedef BinaryDescriptor *infiniopBinaryDescriptor_t; + +/** + * @brief Represents all the currently defined binary operations. + */ +struct BinaryMode { + /** + * Note: new operation type should be added **BEFORE** the "Count" type + * "Count" should remain as the last element. New Arithmetic operations should be + * added in the 'Arithmetic operations' section, i.e., before the 'Logical operations:' section. + */ + enum Mode { + // Arithmetic operations: + Add, + Subtract, + Multiply, + Divide, + Pow, + Mod, + Max, + Min, + BitwiseAnd, + BitwiseOr, + BitwiseXor, + + // Logical operations: + // **TODO Not currently supported** + // Requires Boolean data type + And, + Or, + Xor, + Less, + LessOrEqual, + Equal, + Greater, + GreaterOrEqual, + + Count,///< Number of binary operation types (marker for counting purposes). + }; + + // This static constant holds the total number of defined binary operations. + static const size_t numBinaryMode = Count; +}; + +__C infiniopStatus_t infiniopCreateBinaryDescriptor(infiniopHandle_t handle, + infiniopBinaryDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode); + +__C infiniopStatus_t infiniopBinary(infiniopBinaryDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C infiniopStatus_t infiniopDestroyBinaryDescriptor(infiniopBinaryDescriptor_t desc); + + +#endif diff --git a/src/ops/binary/cpu/binary_cpu.cc b/src/ops/binary/cpu/binary_cpu.cc new file mode 100644 index 00000000..6ee50655 --- /dev/null +++ b/src/ops/binary/cpu/binary_cpu.cc @@ -0,0 +1,176 @@ +#include "binary_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateBinaryDescriptor(infiniopHandle_t, + BinaryCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode) { + uint64_t ndim = c->ndim; + if (!isValidBroadcastShape(a, b, c)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (c->dt != F16 && c->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (c->dt != a->dt || c->dt != b->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + bool broadcasted = false; + if (ndim != a->ndim || ndim != b->ndim) { + broadcasted = true; + } else { + for (uint64_t i = 0; i < ndim; ++i) { + if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) { + broadcasted = true; + break; + } + } + } + if (mode < 0 || mode >= BinaryMode::numBinaryMode) { + return STATUS_BAD_PARAM; + } + // bitwise operations are only valid for integral types + if (c->dt.exponent != 0 && (mode == BinaryMode::BitwiseAnd || mode == BinaryMode::BitwiseOr || mode == BinaryMode::BitwiseXor)) { + return STATUS_BAD_PARAM; + } + + uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); + + int64_t *a_strides = nullptr; + 
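/* The stride arrays stay null in the non-broadcast case so the compute loop
   can use a single flat index for a, b, and c; cpuDestroyBinaryDescriptor can
   still call delete[] on them unconditionally, since delete[] of a null
   pointer is a no-op. */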
int64_t *b_strides = nullptr; + int64_t *c_strides = nullptr; + uint64_t *c_shape = new uint64_t[ndim]; + std::copy(c->shape, c->shape + ndim, c_shape); + + if (broadcasted) { + // get the adjusted strides for a and b + a_strides = new int64_t[ndim]; + b_strides = new int64_t[ndim]; + c_strides = new int64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; + b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim]; + c_strides[i] = c->strides[i]; + } + } + + *desc_ptr = new BinaryCpuDescriptor{ + DevCpu, + c->dt, + ndim, + c_data_size, + c_shape, + a_strides, + b_strides, + c_strides, + broadcasted, + mode, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyBinaryDescriptor(BinaryCpuDescriptor_t desc) { + delete[] desc->a_strides; + delete[] desc->b_strides; + delete[] desc->c_strides; + delete[] desc->c_shape; + delete desc; + return STATUS_SUCCESS; +} + +// binary elementwise kernel that performs the actual binary computation +template +Tdata binary_kernel(const Tdata &a, const Tdata &b, int mode) { + switch (mode) { + // Arithmetic operations: + case BinaryMode::Add: + return a + b; + case BinaryMode::Subtract: + return a - b; + case BinaryMode::Multiply: + return a * b; + case BinaryMode::Divide: + return a / b; + case BinaryMode::Pow: + return std::pow(a, b); + case BinaryMode::Mod: + if constexpr (std::is_floating_point_v) { + return std::fmod(a, b); + } else { + return a % b; + } + case BinaryMode::Max: + if constexpr (std::is_floating_point_v) { + return std::fmax(a, b); + } else { + return std::max(a, b); + } + case BinaryMode::Min: + if constexpr (std::is_floating_point_v) { + return std::fmin(a, b); + } else { + return std::min(a, b); + } + case BinaryMode::BitwiseAnd: + if constexpr (std::is_integral_v) { + return a & b; + } else { + ASSERT(false); + } + case BinaryMode::BitwiseOr: + if constexpr (std::is_integral_v) { + return a | b; + } else { + ASSERT(false); + } + case BinaryMode::BitwiseXor: + if constexpr (std::is_integral_v) { + return a ^ b; + } else { + ASSERT(false); + } + // Logical operations: + default: + fprintf(stderr, "Unrecognized binary operation: how did you get here??\n"); + ASSERT(false); + return a; + } +} + +template +infiniopStatus_t binary_cpu(BinaryCpuDescriptor_t desc, void *c, void const *a, void const *b) { + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); + +#pragma omp parallel for + for (uint64_t i = 0; i < desc->c_data_size; ++i) { + auto a_index = desc->broadcasted ? getDstOffset(i, desc->ndim, desc->c_strides, desc->a_strides) : i; + auto b_index = desc->broadcasted ? 
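/* getDstOffset converts the flat output index i into an input offset by
   peeling place values with c's strides and re-accumulating them with the
   zero-padded input strides, so broadcast dimensions contribute nothing.
   A minimal sketch of that mapping, assuming the shared helper follows the
   usual div/mod formulation (parameter names here are illustrative):

       uint64_t getDstOffset(uint64_t flat, uint64_t ndim,
                             const int64_t *out_strides, const int64_t *in_strides) {
           uint64_t res = 0;
           for (uint64_t d = 0; d < ndim; ++d) {
               res += (flat / out_strides[d]) * in_strides[d]; // coordinate d, re-based
               flat %= out_strides[d];                         // remainder for inner dims
           }
           return res;
       }

   Example: c of shape (2, 3) has strides (3, 1); a b broadcast along dim 0
   has strides (0, 1), so flat index 4 maps to coords (1, 1) and b offset 1. */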
getDstOffset(i, desc->ndim, desc->c_strides, desc->b_strides) : i; + if constexpr (std::is_same::value) { + c_[i] = f32_to_f16(binary_kernel(f16_to_f32(a_[a_index]), f16_to_f32(b_[b_index]), desc->mode)); + } else { + c_[i] = binary_kernel(a_[a_index], b_[b_index], desc->mode); + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuBinary(BinaryCpuDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream) { + if (desc->dtype == F16) { + return binary_cpu(desc, c, a, b); + } + if (desc->dtype == F32) { + return binary_cpu(desc, c, a, b); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/binary/cpu/binary_cpu.h b/src/ops/binary/cpu/binary_cpu.h new file mode 100644 index 00000000..b9502bd1 --- /dev/null +++ b/src/ops/binary/cpu/binary_cpu.h @@ -0,0 +1,37 @@ +#ifndef __CPU_BINARY_H__ +#define __CPU_BINARY_H__ + +#include "../binary.h" +#include "operators.h" +#include +#include + +struct BinaryCpuDescriptor { + Device device; + DT dtype; + uint64_t ndim; + uint64_t c_data_size; + uint64_t const *c_shape; + int64_t const *a_strides; + int64_t const *b_strides; + int64_t const *c_strides; + bool broadcasted; + int mode; +}; + +typedef struct BinaryCpuDescriptor *BinaryCpuDescriptor_t; + +infiniopStatus_t cpuCreateBinaryDescriptor(infiniopHandle_t, + BinaryCpuDescriptor_t *, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode); + +infiniopStatus_t cpuBinary(BinaryCpuDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream); + +infiniopStatus_t cpuDestroyBinaryDescriptor(BinaryCpuDescriptor_t desc); + +#endif diff --git a/src/ops/binary/cuda/binary.cc b/src/ops/binary/cuda/binary.cc new file mode 100644 index 00000000..9b91ea1f --- /dev/null +++ b/src/ops/binary/cuda/binary.cc @@ -0,0 +1,82 @@ +#include "binary.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateBinaryDescriptor(CudaHandle_t handle, + BinaryCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode) { + uint64_t ndim = c->ndim; + if (!isValidBroadcastShape(a, b, c)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (c->dt != F16 && c->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (c->dt != a->dt || c->dt != b->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + bool broadcasted = false; + if (ndim != a->ndim || ndim != b->ndim) { + broadcasted = true; + } else { + for (uint64_t i = 0; i < ndim; ++i) { + if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) { + broadcasted = true; + break; + } + } + } + if (mode < 0 || mode >= BinaryMode::numBinaryMode) { + return STATUS_BAD_PARAM; + } + // bitwise operations are only valid for integral types + if (c->dt.exponent != 0 && (mode == BinaryMode::BitwiseAnd || mode == BinaryMode::BitwiseOr || mode == BinaryMode::BitwiseXor)) { + return STATUS_BAD_PARAM; + } + + uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); + + char *strides_d = nullptr; + size_t stride_arr_size = ndim * sizeof(int64_t); + if (broadcasted) { + // get the adjusted strides for a and b + int64_t a_strides[ndim]; + int64_t b_strides[ndim]; + for (size_t i = 0; i < ndim; ++i) { + a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 
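/* Shapes are matched right-aligned, NumPy style: output dim i corresponds to
   input dim i + a->ndim - ndim. Dimensions the input lacks, or holds with
   size 1 while c is larger, get stride 0 so every output index along them
   reads the same input element. E.g., for c of shape (3, 2, 4, 5) and a of
   shape (4, 5), a keeps its two real strides and gets zeros for the two
   leading dims, which matches the broadcast cases in the Python tests. */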
0 : a->strides[i + a->ndim - ndim]; + b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim]; + } + + // malloc and copy the strides to the device + checkCudaErrorWithCode(cudaMalloc(&strides_d, 3 * stride_arr_size), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(strides_d, a_strides, stride_arr_size, cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(strides_d + stride_arr_size, b_strides, stride_arr_size, cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(strides_d + 2 * stride_arr_size, c->strides, stride_arr_size, cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + } + + *desc_ptr = new BinaryCudaDescriptor{ + DevNvGpu, + c->dt, + handle->device_id, + ndim, + c_data_size, + static_cast(handle->prop.maxGridSize[0]), + reinterpret_cast(strides_d), + broadcasted, + mode, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyBinaryDescriptor(BinaryCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree((void *) desc->strides_d), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/binary/cuda/binary.cu b/src/ops/binary/cuda/binary.cu new file mode 100644 index 00000000..a690ad14 --- /dev/null +++ b/src/ops/binary/cuda/binary.cu @@ -0,0 +1,285 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "binary.cuh" + +/** + * @brief A templated vector type that supports binary elementwise operation. + * + * @tparam T - The access data type for elements in the vector. + * @tparam TComp - The computation data type used for arithmetic operations. + * @tparam N - The number of elements of type T in the vector for a single access. + */ +template +struct vecN { + T data[N]; + constexpr static size_t pack_size = sizeof(T) / sizeof(TComp); + + __device__ __forceinline__ vecN compute(const vecN &other, BinaryMode::Mode mode) const { + vecN result; + for (int i = 0; i < N; ++i) { + if constexpr (std::is_same::value) { + result[i] = binary_op(data[i], other[i], mode); + } else { + auto data_ = reinterpret_cast *>(result.data); + auto this_data = reinterpret_cast *>(data); + auto other_data = reinterpret_cast *>(other.data); + data_[i] = binary_op(this_data[i], other_data[i], mode); + } + } + return result; + } + + // Overloaded operators that call the compute method with the appropriate mode. 
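// Each overload is a thin forward to compute() with the matching mode, so
// binary_op can treat a vecN exactly like a scalar: "a + b" on packed data
// recurses into per-lane binary_op calls, while modes without a C++
// operator (pow, mod, max, min) are exposed as named members that the
// scalar binary_op dispatches to.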
+ __device__ __forceinline__ vecN operator+(const vecN &other) const { + return compute(other, BinaryMode::Add); + } + + __device__ __forceinline__ vecN operator-(const vecN &other) const { + return compute(other, BinaryMode::Subtract); + } + + __device__ __forceinline__ vecN operator*(const vecN &other) const { + return compute(other, BinaryMode::Multiply); + } + + __device__ __forceinline__ vecN operator/(const vecN &other) const { + return compute(other, BinaryMode::Divide); + } + + __device__ __forceinline__ vecN operator&(const vecN &other) const { + return compute(other, BinaryMode::BitwiseAnd); + } + + __device__ __forceinline__ vecN operator|(const vecN &other) const { + return compute(other, BinaryMode::BitwiseOr); + } + + __device__ __forceinline__ vecN operator^(const vecN &other) const { + return compute(other, BinaryMode::BitwiseXor); + } + + __device__ __forceinline__ vecN pow(const vecN &other) const { + return compute(other, BinaryMode::Pow); + } + + __device__ __forceinline__ vecN mod(const vecN &other) const { + return compute(other, BinaryMode::Mod); + } + + __device__ __forceinline__ vecN max(const vecN &other) const { + return compute(other, BinaryMode::Max); + } + + __device__ __forceinline__ vecN min(const vecN &other) const { + return compute(other, BinaryMode::Min); + } + + __device__ __forceinline__ const T &operator[](size_t i) const { + return data[i]; + } + + __device__ __forceinline__ T &operator[](size_t i) { + return data[i]; + } +}; + + +template +__device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int mode) { + switch (mode) { + // Arithmetic operations: + case BinaryMode::Add: + return a + b; + case BinaryMode::Subtract: + return a - b; + case BinaryMode::Multiply: + return a * b; + case BinaryMode::Divide: + return a / b; + case BinaryMode::Pow: + if constexpr (std::is_arithmetic_v) { + return std::pow(a, b); + } else if constexpr (std::is_same_v) { + return __float2half(std::pow(__half2float(a), __half2float(b))); + } else if constexpr (!std::is_same_v) { + return a.pow(b); + } + return a; + case BinaryMode::Mod: + if constexpr (std::is_floating_point_v) { + return std::fmod(a, b); + } else if constexpr (std::is_arithmetic_v) { + return a % b; + } else if constexpr (std::is_same_v) { + return __float2half(std::fmod(__half2float(a), __half2float(b))); + } else if constexpr (!std::is_same_v) { + return a.mod(b); + } + return a; + case BinaryMode::Max: + if constexpr (std::is_fundamental_v) { + if constexpr (std::is_floating_point_v) { + return std::fmax(a, b); + } else { + return std::max(a, b); + } + } else if constexpr (std::is_same_v) { + return __half2float(a) > __half2float(b) ? a : b; + } else if constexpr (!std::is_same_v) { + return a.max(b); + } + case BinaryMode::Min: + if constexpr (std::is_fundamental_v) { + if constexpr (std::is_floating_point_v) { + return std::fmin(a, b); + } else { + return std::min(a, b); + } + } else if constexpr (std::is_same_v) { + return __half2float(a) < __half2float(b) ? 
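/* Promoting to float for the comparison sidesteps native half comparisons,
   which only exist on newer GPU architectures, and returning the original
   operand avoids any rounding from a float round trip. */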
a : b; + } else if constexpr (!std::is_same_v) { + return a.min(b); + } + case BinaryMode::BitwiseAnd: + if constexpr (std::is_integral_v) { + return a & b; + } else { + printf("Error: Non-integral input to bitwise operatior\n"); + return a; + } + case BinaryMode::BitwiseOr: + if constexpr (std::is_integral_v) { + return a | b; + } else { + printf("Error: Non-integral input to bitwise operatior\n"); + return a; + } + case BinaryMode::BitwiseXor: + if constexpr (std::is_integral_v) { + return a ^ b; + } else { + printf("Error: Non-integral input to bitwise operatior\n"); + return a; + } + // Logical operations: + // TODO. Currently Not supported yet + default: + printf("Unrecognized binary operation: how did you get here??\n"); + return a; + } +} + +template +__global__ void binary( + Tdata *c, + const Tdata *a, + const Tdata *b, + const int64_t *__restrict__ a_strides, + const int64_t *__restrict__ b_strides, + const int64_t *__restrict__ c_strides, + uint64_t data_size, + uint64_t ndim, + uint64_t offset, + bool broadcasted, + unsigned pack_size, + const int mode) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + if (broadcasted) { + idx *= pack_size; + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); +#pragma unroll + for (size_t i = 0; i < pack_size; ++i) { + auto a_idx = getDstOffset(idx + i, ndim, c_strides, a_strides); + auto b_idx = getDstOffset(idx + i, ndim, c_strides, b_strides); + c_[idx + i] = binary_op(a_[a_idx], b_[b_idx], mode); + } + return; + } + c[idx] = binary_op(a[idx], b[idx], mode); + } +} + +template +void _binary_nv_gpu(BinaryCudaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const *b, uint64_t data_size, uint64_t pack_size, uint64_t offset, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(BLOCK_SIZE), data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + const int64_t *a_strides = nullptr; + const int64_t *b_strides = nullptr; + const int64_t *c_strides = nullptr; + if (desc->broadcasted) { + a_strides = desc->strides_d; + b_strides = desc->strides_d + desc->ndim; + c_strides = desc->strides_d + 2 * desc->ndim; + } + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + binary<<>>( + c, a, b, a_strides, b_strides, c_strides, offset + data_size, desc->ndim, offset + i, desc->broadcasted, pack_size, desc->mode); + } +} + +template +infiniopStatus_t binary_nv_gpu(BinaryCudaDescriptor_t desc, void *c, void const *a, void const *b, void *stream, uint64_t pack_size) { + const auto data_size = desc->c_data_size / pack_size; + const auto a_vec = reinterpret_cast(a); + const auto b_vec = reinterpret_cast(b); + const auto c_vec = reinterpret_cast(c); + _binary_nv_gpu(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream); + + const auto remainder = desc->c_data_size % pack_size; + const auto a_ = reinterpret_cast(a); + const auto b_ = reinterpret_cast(b); + const auto c_ = reinterpret_cast(c); + _binary_nv_gpu(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream); + cudaDeviceSynchronize(); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaBinary(BinaryCudaDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + switch (desc->mode) { + case 
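/* The template arguments below pick a block size and vector width per mode:
   cheap arithmetic runs with 256 threads per block, heavier modes (pow, mod,
   max/min) with smaller blocks, presumably to limit register pressure from
   the transcendental and branchy paths; the trailing argument is how many
   scalars one vectorized access covers. */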
BinaryMode::Add: + case BinaryMode::Subtract: + case BinaryMode::Multiply: + case BinaryMode::Divide: + return binary_nv_gpu, half, 256>(desc, c, a, b, stream, 8); + case BinaryMode::Pow: + return binary_nv_gpu, half, 128>(desc, c, a, b, stream, 8); + default: + return binary_nv_gpu, half, 64>(desc, c, a, b, stream, 8); + } + } + if (desc->dtype == F32) { + switch (desc->mode) { + case BinaryMode::Add: + case BinaryMode::Subtract: + case BinaryMode::Multiply: + case BinaryMode::Divide: + return binary_nv_gpu, float, 256>(desc, c, a, b, stream, 4); + case BinaryMode::Pow: + return binary_nv_gpu, float, 256>(desc, c, a, b, stream, 8); + case BinaryMode::Mod: + return binary_nv_gpu, float, 128>(desc, c, a, b, stream, 4); + case BinaryMode::Max: + case BinaryMode::Min: + return binary_nv_gpu, float, 64>(desc, c, a, b, stream, 4); + default: + return binary_nv_gpu, float, 256>(desc, c, a, b, stream, 4); + } + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/binary/cuda/binary.cuh b/src/ops/binary/cuda/binary.cuh new file mode 100644 index 00000000..efcc8adc --- /dev/null +++ b/src/ops/binary/cuda/binary.cuh @@ -0,0 +1,38 @@ +#ifndef __CUDA_BINARY_H__ +#define __CUDA_BINARY_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "../binary.h" +#include "operators.h" +#include +#include + +struct BinaryCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t c_data_size; + uint64_t max_grid_size; + int64_t const *strides_d; + bool broadcasted; + int mode; +}; + +typedef struct BinaryCudaDescriptor *BinaryCudaDescriptor_t; + +infiniopStatus_t cudaCreateBinaryDescriptor(CudaHandle_t, + BinaryCudaDescriptor_t *, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode); + +infiniopStatus_t cudaBinary(BinaryCudaDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream); + +infiniopStatus_t cudaDestroyBinaryDescriptor(BinaryCudaDescriptor_t desc); + +#endif diff --git a/src/ops/binary/operator.cc b/src/ops/binary/operator.cc new file mode 100644 index 00000000..5ad7b94f --- /dev/null +++ b/src/ops/binary/operator.cc @@ -0,0 +1,74 @@ +#include "../utils.h" +#include "binary.h" +#include "operators.h" + +#ifdef ENABLE_CPU +#include "cpu/binary_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/binary.cuh" +#endif + +__C infiniopStatus_t infiniopCreateBinaryDescriptor( + infiniopHandle_t handle, + infiniopBinaryDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateBinaryDescriptor(handle, (BinaryCpuDescriptor_t *) desc_ptr, c, a, b, mode); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateBinaryDescriptor((CudaHandle_t) handle, (BinaryCudaDescriptor_t *) desc_ptr, c, a, b, mode); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopBinary(infiniopBinaryDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuBinary((BinaryCpuDescriptor_t) desc, c, a, b, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaBinary((BinaryCudaDescriptor_t) desc, c, a, b, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return 
STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyBinaryDescriptor(infiniopBinaryDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyBinaryDescriptor((BinaryCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyBinaryDescriptor((BinaryCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/div/operator.cc b/src/ops/div/operator.cc new file mode 100644 index 00000000..794ffa05 --- /dev/null +++ b/src/ops/div/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/div/div.h" +#include "tensor/tensor_descriptor.h" + +struct _DivDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _DivDescriptor *_DivDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Divide), STATUS_SUCCESS); + + *(_DivDescriptor_t *) desc_ptr = new _DivDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_DivDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_DivDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/max/operator.cc b/src/ops/max/operator.cc new file mode 100644 index 00000000..2e79eaa8 --- /dev/null +++ b/src/ops/max/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/max/max.h" +#include "tensor/tensor_descriptor.h" + +struct _MaxDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _MaxDescriptor *_MaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Max), STATUS_SUCCESS); + + *(_MaxDescriptor_t *) desc_ptr = new _MaxDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMax(infiniopMaxDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_MaxDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_MaxDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/min/operator.cc b/src/ops/min/operator.cc new file 
mode 100644 index 00000000..f6fd0924 --- /dev/null +++ b/src/ops/min/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/min/min.h" +#include "tensor/tensor_descriptor.h" + +struct _MinDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _MinDescriptor *_MinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Min), STATUS_SUCCESS); + + *(_MinDescriptor_t *) desc_ptr = new _MinDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMin(infiniopMinDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_MinDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_MinDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/mod/operator.cc b/src/ops/mod/operator.cc new file mode 100644 index 00000000..3b92d729 --- /dev/null +++ b/src/ops/mod/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/mod/mod.h" +#include "tensor/tensor_descriptor.h" + +struct _ModDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _ModDescriptor *_ModDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Mod), STATUS_SUCCESS); + + *(_ModDescriptor_t *) desc_ptr = new _ModDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMod(infiniopModDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_ModDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_ModDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/mul/operator.cc b/src/ops/mul/operator.cc new file mode 100644 index 00000000..6c115c37 --- /dev/null +++ b/src/ops/mul/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/mul/mul.h" +#include "tensor/tensor_descriptor.h" + +struct _MulDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _MulDescriptor *_MulDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle, + infiniopMulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + 
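/* Mod maps to BinaryMode::Mod, which both backends implement with fmod
   semantics for floating types: the result takes the sign of the dividend,
   matching torch.fmod (the reference used by the Python test) rather than
   Python's % operator. */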
infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Multiply), STATUS_SUCCESS); + + *(_MulDescriptor_t *) desc_ptr = new _MulDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMul(infiniopMulDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_MulDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_MulDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/pow/operator.cc b/src/ops/pow/operator.cc new file mode 100644 index 00000000..4a47f3e3 --- /dev/null +++ b/src/ops/pow/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/pow/pow.h" +#include "tensor/tensor_descriptor.h" + +struct _PowDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _PowDescriptor *_PowDescriptor_t; + +__C __export infiniopStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Pow), STATUS_SUCCESS); + + *(_PowDescriptor_t *) desc_ptr = new _PowDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopPow(infiniopPowDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_PowDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_PowDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/sub/operator.cc b/src/ops/sub/operator.cc new file mode 100644 index 00000000..0943548d --- /dev/null +++ b/src/ops/sub/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/sub/sub.h" +#include "tensor/tensor_descriptor.h" + +struct _SubDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _SubDescriptor *_SubDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateSubDescriptor(infiniopHandle_t handle, + infiniopSubDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Subtract), STATUS_SUCCESS); + + *(_SubDescriptor_t *) desc_ptr = new _SubDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopSub(infiniopSubDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { 
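// Every concrete elementwise operator is now this same thin wrapper: the
// descriptor stores only {device, binary_desc}, creation forwards the tensor
// descriptors plus a BinaryMode, and compute/destroy delegate to the shared
// binary implementation. A minimal caller sketch using the signatures
// declared in this patch (error handling elided):
//
//     infiniopSubDescriptor_t desc;
//     infiniopCreateSubDescriptor(handle, &desc, c_desc, a_desc, b_desc);
//     infiniopSub(desc, c_data, a_data, b_data, stream);
//     infiniopDestroySubDescriptor(desc);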
+ auto _desc = (_SubDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_SubDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} From 79715a1f0b175f67c98d489ac22e63fc00070c05 Mon Sep 17 00:00:00 2001 From: Zimin Li Date: Mon, 25 Nov 2024 15:31:43 +0800 Subject: [PATCH 2/5] Add corresponding include statements to infini_operators.h and optimized vecN and binary kernel --- include/infini_operators.h | 12 ++++++++++-- src/ops/binary/cuda/binary.cu | 34 +++++++++++++++++++++++----------- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/include/infini_operators.h b/include/infini_operators.h index 906a3771..8e8fdf4d 100644 --- a/include/infini_operators.h +++ b/include/infini_operators.h @@ -2,16 +2,24 @@ #include "ops/add/add.h" #include "ops/attention/attention.h" #include "ops/causal_softmax/causal_softmax.h" -#include "ops/global_avg_pool/global_avg_pool.h" +#include "ops/conv/conv.h" +#include "ops/div/div.h" #include "ops/expand/expand.h" #include "ops/gemm/gemm.h" -#include "ops/conv/conv.h" +#include "ops/global_avg_pool/global_avg_pool.h" #include "ops/matmul/matmul.h" +#include "ops/max/max.h" +#include "ops/min/min.h" #include "ops/mlp/mlp.h" +#include "ops/mod/mod.h" +#include "ops/mul/mul.h" +#include "ops/pow/pow.h" #include "ops/random_sample/random_sample.h" #include "ops/rearrange/rearrange.h" #include "ops/relu/relu.h" #include "ops/rms_norm/rms_norm.h" #include "ops/rotary_embedding/rotary_embedding.h" +#include "ops/sub/sub.h" #include "ops/swiglu/swiglu.h" +#include "ops/transpose/transpose.h" #include "tensor/tensor_descriptor.h" diff --git a/src/ops/binary/cuda/binary.cu b/src/ops/binary/cuda/binary.cu index a690ad14..55ddadee 100644 --- a/src/ops/binary/cuda/binary.cu +++ b/src/ops/binary/cuda/binary.cu @@ -1,6 +1,7 @@ #include "../../../devices/cuda/common_cuda.h" #include "../../utils.h" #include "binary.cuh" +#include /** * @brief A templated vector type that supports binary elementwise operation. 
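 *
 * The parameters relate as follows: one load moves N values of type T, and
 * arithmetic runs on TComp; when T is wider than TComp (e.g. half2 holding
 * two halves), compute() reinterprets each element as a nested vecN of
 * TComp and dispatches binary_op on it. As a purely hypothetical
 * instantiation for the F16 path, vecN<half2, half, 4> would cover 8 halves
 * per access, matching the pack size of 8 passed for F16.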
@@ -16,14 +17,15 @@ struct vecN { __device__ __forceinline__ vecN compute(const vecN &other, BinaryMode::Mode mode) const { vecN result; +#pragma unroll for (int i = 0; i < N; ++i) { - if constexpr (std::is_same::value) { + if constexpr (std::is_same_v) { result[i] = binary_op(data[i], other[i], mode); } else { - auto data_ = reinterpret_cast *>(result.data); - auto this_data = reinterpret_cast *>(data); - auto other_data = reinterpret_cast *>(other.data); - data_[i] = binary_op(this_data[i], other_data[i], mode); + auto &data_ = reinterpret_cast *>(result.data)[i]; + const auto this_data = reinterpret_cast *>(data)[i]; + const auto other_data = reinterpret_cast *>(other.data)[i]; + data_ = binary_op(this_data, other_data, mode); } } return result; @@ -89,7 +91,13 @@ __device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int m switch (mode) { // Arithmetic operations: case BinaryMode::Add: - return a + b; + if constexpr (std::is_same_v) { + return __hadd2_rn(a, b); + } else if constexpr (std::is_same_v) { + return __fadd_rn(a, b); + } else { + return a + b; + } case BinaryMode::Subtract: return a - b; case BinaryMode::Multiply: @@ -199,7 +207,9 @@ __global__ void binary( } return; } - c[idx] = binary_op(a[idx], b[idx], mode); + Tdata a_data = a[idx]; + Tdata b_data = b[idx]; + c[idx] = binary_op(a_data, b_data, mode); } } @@ -238,10 +248,12 @@ infiniopStatus_t binary_nv_gpu(BinaryCudaDescriptor_t desc, void *c, void const _binary_nv_gpu(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream); const auto remainder = desc->c_data_size % pack_size; - const auto a_ = reinterpret_cast(a); - const auto b_ = reinterpret_cast(b); - const auto c_ = reinterpret_cast(c); - _binary_nv_gpu(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream); + if (remainder > 0) { + const auto a_ = reinterpret_cast(a); + const auto b_ = reinterpret_cast(b); + const auto c_ = reinterpret_cast(c); + _binary_nv_gpu(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream); + } cudaDeviceSynchronize(); return STATUS_SUCCESS; } From 1dc21fc40f065f7b5bc558480b8838560d660b5f Mon Sep 17 00:00:00 2001 From: Zimin Li Date: Tue, 26 Nov 2024 10:28:52 +0800 Subject: [PATCH 3/5] Add specific functions for add, sub, and mul for half2 and float --- src/ops/binary/cuda/binary.cu | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/ops/binary/cuda/binary.cu b/src/ops/binary/cuda/binary.cu index 55ddadee..5be4b714 100644 --- a/src/ops/binary/cuda/binary.cu +++ b/src/ops/binary/cuda/binary.cu @@ -99,9 +99,21 @@ __device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int m return a + b; } case BinaryMode::Subtract: - return a - b; + if constexpr (std::is_same_v) { + return __hsub2(a, b); + } else if constexpr (std::is_same_v) { + return __fsub_rn(a, b); + } else { + return a - b; + } case BinaryMode::Multiply: - return a * b; + if constexpr (std::is_same_v) { + return __hmul2(a, b); + } else if constexpr (std::is_same_v) { + return __fmul_rd(a, b); + } else { + return a * b; + } case BinaryMode::Divide: return a / b; case BinaryMode::Pow: From b1d4ac3409ad74a46ac678ec2cdd8d0455e74993 Mon Sep 17 00:00:00 2001 From: Zimin Li Date: Thu, 28 Nov 2024 14:52:04 +0800 Subject: [PATCH 4/5] Add check_error to the tests, optimize various operators --- operatorspy/tests/add.py | 12 ++- operatorspy/tests/div.py | 12 ++- operatorspy/tests/max.py | 12 ++- operatorspy/tests/min.py | 76 ++------------- operatorspy/tests/mod.py | 171 
+++++++++++++++++++++++++--------- operatorspy/tests/mul.py | 12 ++- operatorspy/tests/pow.py | 12 ++- operatorspy/tests/sub.py | 12 ++- src/ops/binary/cuda/binary.cu | 50 +++++++--- 9 files changed, 219 insertions(+), 150 deletions(-) diff --git a/operatorspy/tests/add.py b/operatorspy/tests/add.py index b4d2203e..0217d7d2 100644 --- a/operatorspy/tests/add.py +++ b/operatorspy/tests/add.py @@ -95,14 +95,18 @@ def test( ) for i in range(NUM_PRERUN if PROFILE else 1): - lib.infiniopAdd( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopAdd( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) if PROFILE: start_time = time.time() for i in range(NUM_ITERATIONS): - lib.infiniopAdd( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopAdd( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: {elapsed :6f}") diff --git a/operatorspy/tests/div.py b/operatorspy/tests/div.py index c303c635..37ebbc84 100644 --- a/operatorspy/tests/div.py +++ b/operatorspy/tests/div.py @@ -114,14 +114,18 @@ def test( ) for i in range(NUM_PRERUN if PROFILE else 1): - lib.infiniopDiv( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopDiv( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) if PROFILE: start_time = time.time() for i in range(NUM_ITERATIONS): - lib.infiniopDiv( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopDiv( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: {elapsed :6f}") diff --git a/operatorspy/tests/max.py b/operatorspy/tests/max.py index 39eaf2b3..2efeb651 100644 --- a/operatorspy/tests/max.py +++ b/operatorspy/tests/max.py @@ -95,14 +95,18 @@ def test( ) for i in range(NUM_PRERUN if PROFILE else 1): - lib.infiniopMax( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopMax( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) if PROFILE: start_time = time.time() for i in range(NUM_ITERATIONS): - lib.infiniopMax( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopMax( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: {elapsed :6f}") diff --git a/operatorspy/tests/min.py b/operatorspy/tests/min.py index 050ea64a..68923344 100644 --- a/operatorspy/tests/min.py +++ b/operatorspy/tests/min.py @@ -95,14 +95,18 @@ def test( ) for i in range(NUM_PRERUN if PROFILE else 1): - lib.infiniopMin( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopMin( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) if PROFILE: start_time = time.time() for i in range(NUM_ITERATIONS): - lib.infiniopMin( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopMin( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: {elapsed :6f}") @@ -124,66 +128,4 @@ def test_cuda(lib, test_cases): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for c_shape, a_shape, b_shape, inplace in test_cases: - test(lib, handle, "cuda", c_shape, a_shape, b_shape, 
-        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
-    destroy_handle(lib, handle)
-
-
-def test_bang(lib, test_cases):
-    import torch_mlu
-
-    device = DeviceEnum.DEVICE_BANG
-    handle = create_handle(lib, device)
-    for c_shape, a_shape, b_shape, inplace in test_cases:
-        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
-        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
-    destroy_handle(lib, handle)
-
-
-if __name__ == "__main__":
-    test_cases = [
-        # c_shape, a_shape, b_shape, inplace
-        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
-        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
-        ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
-        ((), (), (), Inplace.OUT_OF_PLACE),
-        ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
-        ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
-        ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
-        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE),
-        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE),
-        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
-        ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
-    ]
-    args = get_args()
-    lib = open_lib()
-    lib.infiniopCreateMinDescriptor.restype = c_int32
-    lib.infiniopCreateMinDescriptor.argtypes = [
-        infiniopHandle_t,
-        POINTER(infiniopMinDescriptor_t),
-        infiniopTensorDescriptor_t,
-        infiniopTensorDescriptor_t,
-        infiniopTensorDescriptor_t,
-    ]
-    lib.infiniopMin.restype = c_int32
-    lib.infiniopMin.argtypes = [
-        infiniopMinDescriptor_t,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-    ]
-    lib.infiniopDestroyMinDescriptor.restype = c_int32
-    lib.infiniopDestroyMinDescriptor.argtypes = [
-        infiniopMinDescriptor_t,
-    ]
-
-    if args.cpu:
-        test_cpu(lib, test_cases)
-    if args.cuda:
-        test_cuda(lib, test_cases)
-    if args.bang:
-        test_bang(lib, test_cases)
-    if not (args.cpu or args.cuda or args.bang):
-        test_cpu(lib, test_cases)
-    print("\033[92mTest passed!\033[0m")
+    test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=
\ No newline at end of file
diff --git a/operatorspy/tests/mod.py b/operatorspy/tests/mod.py
index 342e3662..9092f90b 100644
--- a/operatorspy/tests/mod.py
+++ b/operatorspy/tests/mod.py
@@ -27,6 +27,7 @@
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
+
 class Inplace(Enum):
     OUT_OF_PLACE = auto()
     INPLACE_A = auto()
@@ -40,9 +41,8 @@ class ModDescriptor(Structure):
 
 infiniopModDescriptor_t = POINTER(ModDescriptor)
 
-def find_and_print_differing_indices(
-    a, b, c, ans, atol=0, rtol=1e-2
-):
+
+def find_and_print_differing_indices(a, b, c, ans, atol=0, rtol=1e-2):
     if c.shape != ans.shape:
         raise ValueError("Tensors must have the same shape to compare.")
 
@@ -63,7 +63,7 @@ def find_and_print_differing_indices(
 def mod(x, y):
     if PROFILE:
         ans = torch.fmod(x, y)
-        torch.cuda.synchronize()
+        torch.cuda.synchronize()
         return ans
     return torch.fmod(x, y)
@@ -72,8 +72,8 @@ def test(
     lib,
     handle,
     torch_device,
-    c_shape, 
-    a_shape, 
+    c_shape,
+    a_shape,
     b_shape,
     tensor_dtype=torch.float16,
     inplace=Inplace.OUT_OF_PLACE,
 ):
@@ -86,9 +86,17 @@ def test(
         return
 
     a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10
-    b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a
-    c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b)
-    
+    b = (
+        torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
+        if inplace != Inplace.INPLACE_AB
+        else a
+    )
+    c = (
+        torch.rand(c_shape, dtype=tensor_dtype).to(torch_device)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a if inplace == Inplace.INPLACE_A else b)
+    )
+
     for i in range(NUM_PRERUN if PROFILE else 1):
         ans = mod(a, b)
     if PROFILE:
@@ -100,7 +108,11 @@ def test(
 
     a_tensor = to_tensor(a, lib)
     b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor
-    c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
+    c_tensor = (
+        to_tensor(c, lib)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
+    )
 
     descriptor = infiniopModDescriptor_t()
     check_error(
@@ -112,16 +124,20 @@ def test(
             b_tensor.descriptor,
         )
     )
-    
+
     for i in range(NUM_PRERUN if PROFILE else 1):
-        lib.infiniopMod(
-            descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+        check_error(
+            lib.infiniopMod(
+                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            )
         )
     if PROFILE:
         start_time = time.time()
         for i in range(NUM_ITERATIONS):
-            lib.infiniopMod(
-                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            check_error(
+                lib.infiniopMod(
+                    descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+                )
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
@@ -134,8 +150,26 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
-        test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
-        test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+        test(
+            lib,
+            handle,
+            "cpu",
+            c_shape,
+            a_shape,
+            b_shape,
+            tensor_dtype=torch.float16,
+            inplace=inplace,
+        )
+        test(
+            lib,
+            handle,
+            "cpu",
+            c_shape,
+            a_shape,
+            b_shape,
+            tensor_dtype=torch.float32,
+            inplace=inplace,
+        )
     destroy_handle(lib, handle)
 
 
@@ -143,8 +177,26 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
-        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
-        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+        test(
+            lib,
+            handle,
+            "cuda",
+            c_shape,
+            a_shape,
+            b_shape,
+            tensor_dtype=torch.float16,
+            inplace=inplace,
+        )
+        test(
+            lib,
+            handle,
+            "cuda",
+            c_shape,
+            a_shape,
+            b_shape,
+            tensor_dtype=torch.float32,
+            inplace=inplace,
+        )
     destroy_handle(lib, handle)
@@ -154,8 +206,26 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
-        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
-        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+        test(
+            lib,
+            handle,
+            "mlu",
+            c_shape,
+            a_shape,
+            b_shape,
+            tensor_dtype=torch.float16,
+            inplace=inplace,
+        )
+        test(
+            lib,
+            handle,
+            "mlu",
+            c_shape,
+            a_shape,
+            b_shape,
+            tensor_dtype=torch.float32,
+            inplace=inplace,
+        )
     destroy_handle(lib, handle)
 
 
@@ -169,10 +239,41 @@ def test_bang(lib, test_cases):
         ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
         ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
         ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.OUT_OF_PLACE),
-        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE),
-        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE),
-        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
-        ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
+        (
+            ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A)
+            if not PROFILE
+            else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE)
+        ),
+        (
+            ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B)
+            if not PROFILE
+            else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE)
+        ),
+        (
+            ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB)
+            if not PROFILE
+            else (
+                (32, 256, 112, 112),
+                (32, 256, 112, 1),
+                (32, 256, 112, 112),
+                Inplace.OUT_OF_PLACE,
+            )
+        ),
+        (
+            (
+                (32, 3, 112, 112),
+                (32, 3, 112, 112),
+                (32, 3, 112, 112),
+                Inplace.OUT_OF_PLACE,
+            )
+            if not PROFILE
+            else (
+                (32, 256, 112, 112),
+                (32, 256, 112, 112),
+                (32, 256, 112, 112),
+                Inplace.OUT_OF_PLACE,
+            )
+        ),
     ]
     args = get_args()
     lib = open_lib()
@@ -187,22 +288,4 @@ def test_bang(lib, test_cases):
     lib.infiniopMod.restype = c_int32
     lib.infiniopMod.argtypes = [
         infiniopModDescriptor_t,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-    ]
-    lib.infiniopDestroyModDescriptor.restype = c_int32
-    lib.infiniopDestroyModDescriptor.argtypes = [
-        infiniopModDescriptor_t,
-    ]
-
-    if args.cpu:
-        test_cpu(lib, test_cases)
-    if args.cuda:
-        test_cuda(lib, test_cases)
-    if args.bang:
-        test_bang(lib, test_cases)
-    if not (args.cpu or args.cuda or args.bang):
-        test_cpu(lib, test_cases)
-    print("\033[92mTest passed!\033[0m")
+    
\ No newline at end of file
diff --git a/operatorspy/tests/mul.py b/operatorspy/tests/mul.py
index 57085b42..11baabb9 100644
--- a/operatorspy/tests/mul.py
+++ b/operatorspy/tests/mul.py
@@ -95,14 +95,18 @@ def test(
     )
 
     for i in range(NUM_PRERUN if PROFILE else 1):
-        lib.infiniopMul(
-            descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+        check_error(
+            lib.infiniopMul(
+                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            )
         )
     if PROFILE:
         start_time = time.time()
         for i in range(NUM_ITERATIONS):
-            lib.infiniopMul(
-                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            check_error(
+                lib.infiniopMul(
+                    descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+                )
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
diff --git a/operatorspy/tests/pow.py b/operatorspy/tests/pow.py
index e5106b36..7e1a72ee 100644
--- a/operatorspy/tests/pow.py
+++ b/operatorspy/tests/pow.py
@@ -92,14 +92,18 @@ def test(
     )
 
     for i in range(NUM_PRERUN if PROFILE else 1):
-        lib.infiniopPow(
-            descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+        check_error(
+            lib.infiniopPow(
+                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            )
         )
     if PROFILE:
         start_time = time.time()
         for i in range(NUM_ITERATIONS):
-            lib.infiniopPow(
-                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            check_error(
+                lib.infiniopPow(
+                    descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+                )
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
diff --git a/operatorspy/tests/sub.py b/operatorspy/tests/sub.py
index cc8eb406..ef68b1a4 100644
--- a/operatorspy/tests/sub.py
+++ b/operatorspy/tests/sub.py
@@ -95,14 +95,18 @@ def test(
     )
 
     for i in range(NUM_PRERUN if PROFILE else 1):
-        lib.infiniopSub(
-            descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+        check_error(
+            lib.infiniopSub(
+                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            )
         )
     if PROFILE:
         start_time = time.time()
         for i in range(NUM_ITERATIONS):
-            lib.infiniopSub(
-                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            check_error(
+                lib.infiniopSub(
+                    descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+                )
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
diff --git a/src/ops/binary/cuda/binary.cu b/src/ops/binary/cuda/binary.cu
index 5be4b714..68ea712a 100644
--- a/src/ops/binary/cuda/binary.cu
+++ b/src/ops/binary/cuda/binary.cu
@@ -115,12 +115,26 @@ __device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int m
             return a * b;
         }
         case BinaryMode::Divide:
-            return a / b;
+            if constexpr (std::is_same_v<Tdata, half2>) {
+                return __h2div(a, b);
+            } else if constexpr (std::is_same_v<Tdata, float>) {
+                return __fdividef(a, b);
+            } else {
+                return a / b;
+            }
+            // return a / b;
         case BinaryMode::Pow:
             if constexpr (std::is_arithmetic_v<Tdata>) {
-                return std::pow(a, b);
+                if constexpr (std::is_same_v<Tdata, float>) {
+                    return __powf(a, b);
+                } else {
+                    return std::pow(a, b);
+                }
             } else if constexpr (std::is_same_v<Tdata, half>) {
-                return __float2half(std::pow(__half2float(a), __half2float(b)));
+                float a_ = __half2float(a);
+                float b_ = __half2float(b);
+                float ans_f = __powf(a_, b_);
+                return __float2half(isnan(ans_f) ? std::pow(a_, b_) : ans_f);
             } else if constexpr (!std::is_same_v<Tdata, half2>) {
                 return a.pow(b);
             }
@@ -128,10 +142,10 @@ __device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int m
         case BinaryMode::Mod:
             if constexpr (std::is_floating_point_v<Tdata>) {
                 return std::fmod(a, b);
-            } else if constexpr (std::is_arithmetic_v<Tdata>) {
-                return a % b;
             } else if constexpr (std::is_same_v<Tdata, half>) {
                 return __float2half(std::fmod(__half2float(a), __half2float(b)));
+            } else if constexpr (std::is_arithmetic_v<Tdata>) {
+                return a % b;
             } else if constexpr (!std::is_same_v<Tdata, half2>) {
                 return a.mod(b);
             }
@@ -144,8 +158,10 @@ __device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int m
                     return std::max(a, b);
                 }
             } else if constexpr (std::is_same_v<Tdata, half>) {
-                return __half2float(a) > __half2float(b) ? a : b;
-            } else if constexpr (!std::is_same_v<Tdata, half2>) {
+                return __hmax(a, b);
+            } else if constexpr (std::is_same_v<Tdata, half2>) {
+                return __hmax2(a, b);
+            } else {
                 return a.max(b);
             }
         case BinaryMode::Min:
            if constexpr (std::is_arithmetic_v<Tdata>) {
@@ -156,35 +172,37 @@ __device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int m
                     return std::min(a, b);
                 }
             } else if constexpr (std::is_same_v<Tdata, half>) {
-                return __half2float(a) < __half2float(b) ? a : b;
-            } else if constexpr (!std::is_same_v<Tdata, half2>) {
+                return __hmin(a, b);
+            } else if constexpr (std::is_same_v<Tdata, half2>) {
+                return __hmin2(a, b);
+            } else {
                 return a.min(b);
             }
         case BinaryMode::BitwiseAnd:
             if constexpr (std::is_integral_v<Tdata>) {
                 return a & b;
             } else {
-                printf("Error: Non-integral input to bitwise operatior\n");
+                printf("Error: Non-integral input to bitwise operator\n");
                 return a;
             }
         case BinaryMode::BitwiseOr:
             if constexpr (std::is_integral_v<Tdata>) {
                 return a | b;
             } else {
-                printf("Error: Non-integral input to bitwise operatior\n");
+                printf("Error: Non-integral input to bitwise operator\n");
                 return a;
             }
         case BinaryMode::BitwiseXor:
             if constexpr (std::is_integral_v<Tdata>) {
                 return a ^ b;
             } else {
-                printf("Error: Non-integral input to bitwise operatior\n");
+                printf("Error: Non-integral input to bitwise operator\n");
                 return a;
             }
         // Logical operations:
         // TODO. Currently Not supported yet
         default:
-            printf("Unrecognized binary operation: how did you get here??\n");
+            printf("Unrecognized binary operation\n");
             return a;
     }
 }
@@ -282,7 +300,10 @@ infiniopStatus_t cudaBinary(BinaryCudaDescriptor_t desc,
             case BinaryMode::Divide:
                 return binary_nv_gpu, half, 256>(desc, c, a, b, stream, 8);
             case BinaryMode::Pow:
-                return binary_nv_gpu, half, 128>(desc, c, a, b, stream, 8);
+                return binary_nv_gpu, half, 128>(desc, c, a, b, stream, 16);
+            case BinaryMode::Max:
+            case BinaryMode::Min:
+                return binary_nv_gpu, half, 256>(desc, c, a, b, stream, 8);
             default:
                 return binary_nv_gpu, half, 64>(desc, c, a, b, stream, 8);
         }
@@ -295,7 +316,6 @@ infiniopStatus_t cudaBinary(BinaryCudaDescriptor_t desc,
             case BinaryMode::Divide:
                 return binary_nv_gpu, float, 256>(desc, c, a, b, stream, 4);
             case BinaryMode::Pow:
-                return binary_nv_gpu, float, 256>(desc, c, a, b, stream, 8);
             case BinaryMode::Mod:
                 return binary_nv_gpu, float, 128>(desc, c, a, b, stream, 4);
             case BinaryMode::Max:
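The Pow change in this patch trades accuracy for speed and then guards the fast path: `__powf` can return NaN for inputs the approximate intrinsic does not support (for example a negative base), so the result is checked and recomputed with the accurate `powf` only when needed. A self-contained sketch of that guard; `half_pow` is an illustrative name, not the library's API.

#include <cuda_fp16.h>
#include <cmath>

// Fast fp16 pow with an accurate fallback: try the approximate __powf
// intrinsic first, and only fall back to powf when the fast result is
// NaN (e.g. negative base), mirroring the half path in the patch above.
__device__ __forceinline__ __half half_pow(const __half a, const __half b) {
    float a_ = __half2float(a);
    float b_ = __half2float(b);
    float fast = __powf(a_, b_);                       // approximate path
    return __float2half(isnan(fast) ? powf(a_, b_) : fast);  // exact fallback
}

The common case pays only for the fast intrinsic; the extra `powf` call is taken on the rare inputs where the approximation is undefined.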
From 8d7be0ca166413991ca2a7581583f0ec8f19d290 Mon Sep 17 00:00:00 2001
From: Zimin Li
Date: Thu, 28 Nov 2024 15:05:13 +0800
Subject: [PATCH 5/5] Fixed incomplete test files

---
 operatorspy/tests/min.py |  64 +++++++++++++++++++-
 operatorspy/tests/mod.py | 125 +++++++++------------------
 2 files changed, 92 insertions(+), 97 deletions(-)

diff --git a/operatorspy/tests/min.py b/operatorspy/tests/min.py
index 68923344..548f4819 100644
--- a/operatorspy/tests/min.py
+++ b/operatorspy/tests/min.py
@@ -128,4 +128,66 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
-    test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=
\ No newline at end of file
+        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
+        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+    destroy_handle(lib, handle)
+
+
+def test_bang(lib, test_cases):
+    import torch_mlu
+
+    device = DeviceEnum.DEVICE_BANG
+    handle = create_handle(lib, device)
+    for c_shape, a_shape, b_shape, inplace in test_cases:
+        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
+        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+    destroy_handle(lib, handle)
+
+
+if __name__ == "__main__":
+    test_cases = [
+        # c_shape, a_shape, b_shape, inplace
+        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
+        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
+        ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
+        ((), (), (), Inplace.OUT_OF_PLACE),
+        ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
+        ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
+        ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
+        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE),
+        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE),
+        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
+        ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
+    ]
+    args = get_args()
+    lib = open_lib()
+    lib.infiniopCreateMinDescriptor.restype = c_int32
+    lib.infiniopCreateMinDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopMinDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopMin.restype = c_int32
+    lib.infiniopMin.argtypes = [
+        infiniopMinDescriptor_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyMinDescriptor.restype = c_int32
+    lib.infiniopDestroyMinDescriptor.argtypes = [
+        infiniopMinDescriptor_t,
+    ]
+
+    if args.cpu:
+        test_cpu(lib, test_cases)
+    if args.cuda:
+        test_cuda(lib, test_cases)
+    if args.bang:
+        test_bang(lib, test_cases)
+    if not (args.cpu or args.cuda or args.bang):
+        test_cpu(lib, test_cases)
+    print("\033[92mTest passed!\033[0m")
\ No newline at end of file
diff --git a/operatorspy/tests/mod.py b/operatorspy/tests/mod.py
index 9092f90b..7bf5beaf 100644
--- a/operatorspy/tests/mod.py
+++ b/operatorspy/tests/mod.py
@@ -150,26 +150,8 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
-        test(
-            lib,
-            handle,
-            "cpu",
-            c_shape,
-            a_shape,
-            b_shape,
-            tensor_dtype=torch.float16,
-            inplace=inplace,
-        )
-        test(
-            lib,
-            handle,
-            "cpu",
-            c_shape,
-            a_shape,
-            b_shape,
-            tensor_dtype=torch.float32,
-            inplace=inplace,
-        )
+        test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
+        test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
     destroy_handle(lib, handle)
 
 
@@ -177,26 +159,8 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
-        test(
-            lib,
-            handle,
-            "cuda",
-            c_shape,
-            a_shape,
-            b_shape,
-            tensor_dtype=torch.float16,
-            inplace=inplace,
-        )
-        test(
-            lib,
-            handle,
-            "cuda",
-            c_shape,
-            a_shape,
-            b_shape,
-            tensor_dtype=torch.float32,
-            inplace=inplace,
-        )
+        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
+        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
     destroy_handle(lib, handle)
- "mlu", - c_shape, - a_shape, - b_shape, - tensor_dtype=torch.float32, - inplace=inplace, - ) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) destroy_handle(lib, handle) @@ -239,41 +185,10 @@ def test_bang(lib, test_cases): ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.OUT_OF_PLACE), - ( - ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) - if not PROFILE - else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE) - ), - ( - ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) - if not PROFILE - else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE) - ), - ( - ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) - if not PROFILE - else ( - (32, 256, 112, 112), - (32, 256, 112, 1), - (32, 256, 112, 112), - Inplace.OUT_OF_PLACE, - ) - ), - ( - ( - (32, 3, 112, 112), - (32, 3, 112, 112), - (32, 3, 112, 112), - Inplace.OUT_OF_PLACE, - ) - if not PROFILE - else ( - (32, 256, 112, 112), - (32, 256, 112, 112), - (32, 256, 112, 112), - Inplace.OUT_OF_PLACE, - ) - ), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), ] args = get_args() lib = open_lib() @@ -288,4 +203,22 @@ def test_bang(lib, test_cases): lib.infiniopMod.restype = c_int32 lib.infiniopMod.argtypes = [ infiniopModDescriptor_t, - \ No newline at end of file + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyModDescriptor.restype = c_int32 + lib.infiniopDestroyModDescriptor.argtypes = [ + infiniopModDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") \ No newline at end of file